# Blog-Summarizer / main.py
# Author: enstazao
# Commit ae85af3: added lines for debug
from transformers import pipeline
from bs4 import BeautifulSoup
import requests
def fetch_webpage_content(url):
    """Fetch the raw HTML of a webpage.

    Args:
        url: The URL to download.

    Returns:
        The response body as text, or None if the request failed
        (network error, timeout, or non-2xx status).
    """
    try:
        # 10-second timeout so a stalled server cannot hang the program.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raises for 4xx/5xx status codes.
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
def parse_and_segment_content(html_content, max_chunk=500):
    """Parse HTML and split its visible text into word-limited chunks.

    Extracts the text of all <h1> and <p> tags, splits it into sentences
    at '.', '!' and '?', then greedily packs consecutive sentences into
    chunks of at most ``max_chunk`` words each. A single sentence longer
    than the limit still becomes its own (oversized) chunk.

    Args:
        html_content: Raw HTML string; falsy values yield an empty list.
        max_chunk: Maximum number of words per chunk (default 500).

    Returns:
        A list of chunk strings, possibly empty.
    """
    if not html_content:
        return []
    soup = BeautifulSoup(html_content, 'html.parser')
    results = soup.find_all(['h1', 'p'])
    text = ' '.join(result.text for result in results)
    # Mark sentence boundaries so we can split without losing punctuation.
    text = text.replace('.', '.<eos>').replace('!', '!<eos>').replace('?', '?<eos>')
    sentences = text.split('<eos>')
    chunks = []  # Each chunk is built as a list of words, joined at the end.
    current_chunk = -1
    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue
        # Start a new chunk when adding this sentence would exceed the limit.
        if current_chunk == -1 or len(chunks[current_chunk]) + len(words) > max_chunk:
            chunks.append([])
            current_chunk += 1
        chunks[current_chunk].extend(words)
    return [' '.join(chunk).strip() for chunk in chunks]
def _get_summarizer():
    """Lazily create and cache the summarization pipeline.

    Loading the model weights is expensive, so the pipeline is built once
    on first use and reused across calls instead of being rebuilt on every
    summarize_text() invocation.
    """
    if not hasattr(_get_summarizer, "_pipeline"):
        _get_summarizer._pipeline = pipeline(
            "summarization", model="sshleifer/distilbart-cnn-12-6"
        )
    return _get_summarizer._pipeline


def summarize_text(chunks):
    """Summarize the given text chunks.

    Args:
        chunks: List of text strings to summarize.

    Returns:
        A single string joining the summary of each chunk, or
        "No content to summarize." when ``chunks`` is empty/falsy.
    """
    if not chunks:
        return "No content to summarize."
    summarizer = _get_summarizer()
    summaries = []
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=50, min_length=30, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            # Best-effort: record the failure and keep going so one bad
            # chunk does not abort the whole summary.
            print(f"Error in summarization: {e}")
            summaries.append("Error summarizing text.")
    return ' '.join(summaries)
# Example usage
# url = "https://example.com"
# html_content = fetch_webpage_content(url)
# if html_content:
# chunks = parse_and_segment_content(html_content)
# summary = summarize_text(chunks)
# print(summary)
# else:
# print("Failed to fetch or parse webpage content.")