import requests
from bs4 import BeautifulSoup
from transformers import pipeline
def fetch_webpage_content(url):
    """Fetch the raw HTML of a webpage, or return None on failure."""
    try:
        response = requests.get(url, timeout=10)  # Give up after 10 seconds
        response.raise_for_status()  # Raise an error for 4xx/5xx responses
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
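# Some sites reject requests that lack a browser-like User-Agent header and
# return 403, which surfaces above as a RequestException. A minimal workaround
# sketch (the header value is illustrative, not part of the original code):
#
#   response = requests.get(
#       url,
#       timeout=10,
#       headers={"User-Agent": "Mozilla/5.0 (compatible; summarizer-bot)"},
#   )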
def parse_and_segment_content(html_content):
    """Parse HTML and segment the visible text into chunks of at most ~500 words."""
    if not html_content:
        return []
    soup = BeautifulSoup(html_content, 'html.parser')
    # Collect the text of headings and paragraphs only
    results = soup.find_all(['h1', 'p'])
    text = ' '.join(result.text for result in results)
    # Mark sentence boundaries with a sentinel, then split on it
    text = text.replace('.', '.<eos>').replace('!', '!<eos>').replace('?', '?<eos>')
    sentences = text.split('<eos>')
    print("Segmenting content")
    max_chunk = 500  # Maximum words per chunk
    chunks = []
    current_chunk = -1
    for sentence in sentences:
        if len(sentence.strip()) == 0:
            continue
        # Start a new chunk if none exists yet or the current one would overflow
        if current_chunk == -1 or len(chunks[current_chunk]) + len(sentence.split()) > max_chunk:
            chunks.append([])
            current_chunk += 1
        chunks[current_chunk].extend(sentence.split())
    return [' '.join(chunk).strip() for chunk in chunks]
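# A quick sanity check of the chunking logic on illustrative input (not from
# the original app): all three sentences land in a single chunk because their
# combined word count stays well under max_chunk (500 words).
#
#   parse_and_segment_content("<h1>Hi there.</h1><p>Short one! Done?</p>")
#   # -> ['Hi there. Short one! Done?'] (plus a progress message on stdout)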
def summarize_text(chunks):
    """Summarize each text chunk and join the results."""
    if not chunks:
        return "No content to summarize."
    # Load a distilled BART model fine-tuned for summarization
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    summaries = []
    print("Summarizing content")
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=50, min_length=30, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            print(f"Error in summarization: {e}")
            summaries.append("Error summarizing text.")  # Keep going even if one chunk fails
    return ' '.join(summaries)
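# Note: max_chunk (500 words) in parse_and_segment_content appears chosen to
# keep each chunk within the model's 1024-token input limit; depending on the
# transformers version, longer inputs can be truncated or raise an error,
# which is one reason each summarizer call above is wrapped in try/except.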
if __name__ == "__main__":
    # Example usage
    url = "https://example.com"
    html_content = fetch_webpage_content(url)
    if html_content:
        chunks = parse_and_segment_content(html_content)
        summary = summarize_text(chunks)
        print(summary)
    else:
        print("Failed to fetch webpage content.")