Update main.py
Browse files
main.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# main.py (Revised:
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
import logging
|
|
@@ -28,7 +28,7 @@ from telegram.error import NetworkError, RetryAfter, TimedOut # Import TimedOut
|
|
| 28 |
from telegram.request import HTTPXRequest # Import the request class
|
| 29 |
|
| 30 |
# --- Other Libraries ---
|
| 31 |
-
import httpx #
|
| 32 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 33 |
import requests
|
| 34 |
from bs4 import BeautifulSoup
|
|
@@ -62,7 +62,6 @@ ptb_app: Application | None = None
|
|
| 62 |
# --- Environment Variable Loading ---
|
| 63 |
logger.info("Attempting to load secrets...")
|
| 64 |
def get_secret(secret_name):
|
| 65 |
-
# logger.debug(f"Attempting to read secret: {secret_name}") # Optional: Less verbose startup
|
| 66 |
value = os.environ.get(secret_name)
|
| 67 |
if value: logger.info(f"Secret '{secret_name}': Found (Value length: {len(value)})")
|
| 68 |
else: logger.warning(f"Secret '{secret_name}': Not Found")
|
|
@@ -80,7 +79,7 @@ logger.info("Secret loading attempt finished.")
|
|
| 80 |
# (Keep ALL your functions: is_youtube_url, extract_youtube_id,
|
| 81 |
# get_transcript_via_supadata, get_transcript_via_apify,
|
| 82 |
# get_youtube_transcript, get_website_content_via_requests,
|
| 83 |
-
# get_website_content_via_urltotext_api, generate_summary)
|
| 84 |
|
| 85 |
# Helper Functions
|
| 86 |
def is_youtube_url(url):
|
|
@@ -112,7 +111,6 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
|
|
| 112 |
params = {"videoId": video_id, "format": "text"}
|
| 113 |
headers = {"X-API-Key": api_key}
|
| 114 |
try:
|
| 115 |
-
# Consider removing verify=False if possible, or manage certificates properly
|
| 116 |
logger.warning("[Supadata] Making request with verify=False (Attempting to bypass SSL verification - Potential Security Risk)")
|
| 117 |
response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30, verify=False)
|
| 118 |
logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
|
|
@@ -284,23 +282,21 @@ async def get_website_content_via_requests(url):
|
|
| 284 |
if not url: logger.error("[Web Scraper - Requests/BS4] get_website_content_via_requests called with no URL"); return None
|
| 285 |
logger.info(f"[Web Scraper - Requests/BS4] Attempting fetch: {url}")
|
| 286 |
try:
|
| 287 |
-
# Standard headers, avoid overly aggressive scraping patterns
|
| 288 |
headers = {
|
| 289 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
|
| 290 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
|
| 291 |
'Accept-Language': 'en-US,en;q=0.9',
|
| 292 |
'Connection': 'keep-alive',
|
| 293 |
-
'DNT': '1',
|
| 294 |
'Upgrade-Insecure-Requests': '1'
|
| 295 |
}
|
| 296 |
response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
|
| 297 |
-
response.raise_for_status()
|
| 298 |
logger.debug(f"[Web Scraper - Requests/BS4] Status {response.status_code} for {url}")
|
| 299 |
|
| 300 |
content_type = response.headers.get('content-type', '').lower()
|
| 301 |
if 'html' not in content_type:
|
| 302 |
logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received: {content_type}. Attempting plain text extraction.")
|
| 303 |
-
# Allow plain text only if explicitly text/plain
|
| 304 |
if 'text/plain' in content_type and response.text:
|
| 305 |
logger.info(f"[Web Scraper - Requests/BS4] Extracted plain text content. Length: {len(response.text.strip())}")
|
| 306 |
return response.text.strip()
|
|
@@ -308,39 +304,25 @@ async def get_website_content_via_requests(url):
|
|
| 308 |
return None
|
| 309 |
|
| 310 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 311 |
-
|
| 312 |
-
# Remove common non-content tags more aggressively
|
| 313 |
tags_to_remove = ["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio", "picture", "source"]
|
| 314 |
-
# Also remove elements often used for ads or menus by class/id
|
| 315 |
selectors_to_remove = ['.ad', '#ad', '.ads', '#ads', '.advertisement', '#advertisement', '.banner', '#banner', '.menu', '#menu', '.navigation', '#navigation', '.sidebar', '#sidebar', '.social', '#social', '.share', '#share', '.related', '#related', '.comments', '#comments', '.cookie-consent', '#cookie-consent']
|
| 316 |
-
|
| 317 |
for tag in soup(tags_to_remove): tag.decompose()
|
| 318 |
for selector in selectors_to_remove:
|
| 319 |
for element in soup.select(selector): element.decompose()
|
| 320 |
|
| 321 |
-
|
| 322 |
-
main_content = soup.find('main') or \
|
| 323 |
-
soup.find('article') or \
|
| 324 |
-
soup.find(id='content') or \
|
| 325 |
-
soup.find(class_='content') or \
|
| 326 |
-
soup.find(id='main-content') or \
|
| 327 |
-
soup.find(class_='main-content') or \
|
| 328 |
-
soup.find(role='main')
|
| 329 |
-
|
| 330 |
target_element = main_content if main_content else soup.body
|
| 331 |
if not target_element:
|
| 332 |
logger.warning(f"[Web Scraper - Requests/BS4] Could not find a suitable target element (main, article, body) for {url}");
|
| 333 |
return None
|
| 334 |
|
| 335 |
-
# Extract text, attempting to preserve paragraphs better
|
| 336 |
lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
|
| 337 |
-
text = "\n\n".join(lines)
|
| 338 |
|
| 339 |
-
MIN_TEXT_LENGTH = 100
|
| 340 |
if not text or len(text) < MIN_TEXT_LENGTH:
|
| 341 |
logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is too short (<{MIN_TEXT_LENGTH} chars) after cleaning for {url}. Length: {len(text)}. Content might be JS-rendered or blocked.")
|
| 342 |
-
|
| 343 |
-
return None # Treat very short text as failure
|
| 344 |
|
| 345 |
logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped and cleaned content from {url}. Final Length: {len(text)}")
|
| 346 |
return text
|
|
@@ -358,18 +340,17 @@ async def get_website_content_via_urltotext_api(url: str, api_key: str):
|
|
| 358 |
if not api_key: logger.error("[Web Scraper - URLToText API] API key is missing."); return None
|
| 359 |
logger.info(f"[Web Scraper - URLToText API] Attempting fetch via API: {url}")
|
| 360 |
api_endpoint = "https://urltotext.com/api/v1/urltotext/"
|
| 361 |
-
# Ensure payload includes options beneficial for scraping modern sites
|
| 362 |
payload = json.dumps({
|
| 363 |
"url": url,
|
| 364 |
"output_format": "text",
|
| 365 |
-
"extract_main_content": True,
|
| 366 |
-
"render_javascript": True,
|
| 367 |
-
"residential_proxy": False,
|
| 368 |
-
"timeout_render": 20000,
|
| 369 |
})
|
| 370 |
headers = {"Authorization": f"Token {api_key}", "Content-Type": "application/json"}
|
| 371 |
try:
|
| 372 |
-
response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=60)
|
| 373 |
logger.debug(f"[Web Scraper - URLToText API] Received status code {response.status_code} for {url}")
|
| 374 |
if response.status_code == 200:
|
| 375 |
try:
|
|
@@ -378,10 +359,10 @@ async def get_website_content_via_urltotext_api(url: str, api_key: str):
|
|
| 378 |
content = content_data.get("content")
|
| 379 |
credits = data.get("credits_used", "N/A")
|
| 380 |
warning = content_data.get("warning")
|
| 381 |
-
error_msg = content_data.get("error")
|
| 382 |
|
| 383 |
if warning: logger.warning(f"[Web Scraper - URLToText API] API Warning for {url}: {warning}")
|
| 384 |
-
if error_msg: logger.error(f"[Web Scraper - URLToText API] API Error reported for {url}: {error_msg}"); return None
|
| 385 |
|
| 386 |
if content and isinstance(content, str):
|
| 387 |
logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API. Length: {len(content.strip())}. Credits Used: {credits}");
|
|
@@ -394,11 +375,11 @@ async def get_website_content_via_urltotext_api(url: str, api_key: str):
|
|
| 394 |
elif response.status_code == 400: logger.error(f"[Web Scraper - URLToText API] Bad Request (400) to API. Check payload/URL. Response: {response.text[:200]}...")
|
| 395 |
elif response.status_code == 401: logger.error(f"[Web Scraper - URLToText API] Unauthorized (401). Check API Key. Response: {response.text[:200]}...")
|
| 396 |
elif response.status_code == 402: logger.error(f"[Web Scraper - URLToText API] Payment Required (402). Check API credits/plan. Response: {response.text[:200]}...")
|
| 397 |
-
elif response.status_code == 422: logger.warning(f"[Web Scraper - URLToText API] Unprocessable URL / Fetch Error (422) reported by API for {url}. Response: {response.text[:200]}...")
|
| 398 |
elif response.status_code == 429: logger.warning(f"[Web Scraper - URLToText API] Rate Limit Hit (429). Response: {response.text[:200]}...")
|
| 399 |
elif response.status_code >= 500: logger.error(f"[Web Scraper - URLToText API] API Server Error ({response.status_code}). Response: {response.text[:200]}...")
|
| 400 |
else: logger.error(f"[Web Scraper - URLToText API] Unexpected status code {response.status_code} from API. Response: {response.text[:200]}...")
|
| 401 |
-
return None
|
| 402 |
except requests.exceptions.Timeout: logger.error(f"[Web Scraper - URLToText API] Timeout connecting to API for {url}"); return None
|
| 403 |
except requests.exceptions.RequestException as e: logger.error(f"[Web Scraper - URLToText API] Request error connecting to API: {e}"); return None
|
| 404 |
except Exception as e: logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call: {e}", exc_info=True); return None
|
|
@@ -411,11 +392,8 @@ async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
|
|
| 411 |
if not text or not text.strip(): logger.warning("generate_summary called with empty or whitespace-only text."); return "Error: No content was provided to summarize."
|
| 412 |
|
| 413 |
openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
|
| 414 |
-
# Consider using a non-free model if rate limits are hit or quality needed
|
| 415 |
model_name = "deepseek/deepseek-chat:free"
|
| 416 |
-
# model_name = "openai/gpt-3.5-turbo" # Example alternative
|
| 417 |
|
| 418 |
-
# --- UPDATED PROMPTS ---
|
| 419 |
if summary_type == "paragraph":
|
| 420 |
system_message = (
|
| 421 |
"You are an expert summarization AI. Your goal is to provide a concise, easy-to-understand summary of the provided text. "
|
|
@@ -449,12 +427,8 @@ async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
|
|
| 449 |
else:
|
| 450 |
logger.error(f"Invalid summary_type '{summary_type}' requested.")
|
| 451 |
return f"Error: Invalid summary type ('{summary_type}') requested. Please choose 'paragraph' or 'points'."
|
| 452 |
-
# --- END UPDATED PROMPTS ---
|
| 453 |
|
| 454 |
-
|
| 455 |
-
# Deepseek context might be larger, but set a reasonable app limit
|
| 456 |
-
MAX_INPUT_TOKENS_ESTIMATE = 28000 # Rough estimate for deepseek-chat's context limit (aim lower than max)
|
| 457 |
-
# Simple character length heuristic (adjust based on typical content)
|
| 458 |
AVG_CHARS_PER_TOKEN = 4
|
| 459 |
MAX_INPUT_LENGTH = MAX_INPUT_TOKENS_ESTIMATE * AVG_CHARS_PER_TOKEN
|
| 460 |
|
|
@@ -463,33 +437,29 @@ async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
|
|
| 463 |
truncation_marker = "\n\n[... Text truncated due to length ...]"
|
| 464 |
text = text[:MAX_INPUT_LENGTH - len(truncation_marker)] + truncation_marker
|
| 465 |
|
| 466 |
-
# Construct the messages payload for the API
|
| 467 |
messages = [
|
| 468 |
{"role": "system", "content": system_message},
|
| 469 |
{"role": "user", "content": f"{user_prompt_instruction}\n\n--- TEXT TO SUMMARIZE ---\n\n{text}\n\n--- END OF TEXT ---"}
|
| 470 |
]
|
| 471 |
|
| 472 |
-
|
| 473 |
-
space_host = os.environ.get("SPACE_HOST", "huggingface.co/spaces/YOUR_SPACE_NAME") # Replace default if needed
|
| 474 |
referer_url = f"https://{space_host}" if space_host and not space_host.startswith("http") else space_host or "https://huggingface.co"
|
| 475 |
headers = {
|
| 476 |
"Authorization": f"Bearer {api_key}",
|
| 477 |
"Content-Type": "application/json",
|
| 478 |
"HTTP-Referer": referer_url,
|
| 479 |
-
"X-Title": "Telegram URL Summarizer Bot"
|
| 480 |
}
|
| 481 |
payload = json.dumps({"model": model_name, "messages": messages})
|
| 482 |
|
| 483 |
try:
|
| 484 |
logger.debug(f"Sending request to OpenRouter (Model: {model_name}). Prompt length approx: {len(text)} chars.")
|
| 485 |
-
# Increased timeout for potentially long AI generation
|
| 486 |
response = await asyncio.to_thread(requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=120)
|
| 487 |
logger.debug(f"Received status {response.status_code} from OpenRouter.")
|
| 488 |
|
| 489 |
if response.status_code == 200:
|
| 490 |
try:
|
| 491 |
data = response.json()
|
| 492 |
-
# Check for response structure variations
|
| 493 |
choice = data.get("choices", [{}])[0]
|
| 494 |
message = choice.get("message", {})
|
| 495 |
summary = message.get("content")
|
|
@@ -498,8 +468,7 @@ async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
|
|
| 498 |
if summary and isinstance(summary, str) and summary.strip():
|
| 499 |
summary = summary.strip()
|
| 500 |
logger.info(f"Successfully generated summary. Finish Reason: {finish_reason}. Length: {len(summary)}")
|
| 501 |
-
|
| 502 |
-
if summary_type == "paragraph" and len(summary.split()) > 95: # Allow slight overrun from 85 words
|
| 503 |
logger.warning(f"Generated paragraph summary slightly longer than target word count ({len(summary.split())} words).")
|
| 504 |
return summary
|
| 505 |
else:
|
|
@@ -513,16 +482,14 @@ async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
|
|
| 513 |
logger.error(f"Unexpected error processing OpenRouter success response: {e}", exc_info=True)
|
| 514 |
return "Sorry, an unexpected error occurred while processing the AI response."
|
| 515 |
|
| 516 |
-
# Handle specific HTTP error codes from OpenRouter
|
| 517 |
elif response.status_code == 401: logger.error("OpenRouter API key is invalid (Unauthorized - 401)."); return "Error: AI service authentication failed. Please check the configuration."
|
| 518 |
elif response.status_code == 402: logger.error("OpenRouter Payment Required (402). Check credits/limits."); return "Sorry, there's an issue with the AI service account limits or payment."
|
| 519 |
elif response.status_code == 429: logger.warning("OpenRouter Rate Limit Hit (429)."); return "Sorry, the AI model is currently busy due to high demand. Please try again in a moment."
|
| 520 |
elif response.status_code == 400: logger.error(f"OpenRouter Bad Request (400). Likely prompt issue. Response: {response.text[:500]}..."); return "Sorry, the request to the AI service was invalid (possibly due to the content or prompt)."
|
| 521 |
elif response.status_code >= 500: logger.error(f"OpenRouter Server Error ({response.status_code}). Response: {response.text[:500]}..."); return "Sorry, the AI service is experiencing internal issues. Please try again later."
|
| 522 |
else:
|
| 523 |
-
# Handle other unexpected errors
|
| 524 |
logger.error(f"Unexpected HTTP status {response.status_code} from OpenRouter. Response: {response.text[:500]}...")
|
| 525 |
-
try:
|
| 526 |
error_data = response.json()
|
| 527 |
error_msg = error_data.get("error", {}).get("message", response.text[:100])
|
| 528 |
return f"Sorry, the AI service returned an error ({response.status_code}): {error_msg}"
|
|
@@ -539,9 +506,8 @@ async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
|
|
| 539 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
| 540 |
"""Handles the /start command."""
|
| 541 |
user = update.effective_user
|
| 542 |
-
if not user: return
|
| 543 |
logger.info(f"User {user.id} ({user.username or 'NoUsername'}) initiated /start.")
|
| 544 |
-
# Use mention_html for linking username if available, otherwise just first name
|
| 545 |
mention = user.mention_html() if user.username else user.first_name
|
| 546 |
start_message = (
|
| 547 |
f"👋 Hello {mention}!\n\n"
|
|
@@ -568,7 +534,6 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
|
|
| 568 |
"- **Length:** Very long articles or videos might be truncated before summarization to fit within processing limits.\n\n"
|
| 569 |
"Just send a link to get started!"
|
| 570 |
)
|
| 571 |
-
# Use MarkdownV2 for better formatting control if needed, but MARKDOWN is simpler
|
| 572 |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
|
| 573 |
|
| 574 |
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
|
@@ -576,18 +541,14 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
|
|
| 576 |
if not update.message or not update.message.text: return
|
| 577 |
message_text = update.message.text.strip()
|
| 578 |
user = update.effective_user
|
| 579 |
-
if not user: return
|
| 580 |
|
| 581 |
-
# More robust URL regex (handles various protocols, domains, paths, queries)
|
| 582 |
-
# Still simple, not aiming for perfect RFC 3986 validation
|
| 583 |
url_pattern = r'https?://(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}(?:/[^\s]*)?'
|
| 584 |
match = re.search(url_pattern, message_text)
|
| 585 |
|
| 586 |
if match:
|
| 587 |
url = match.group(0)
|
| 588 |
logger.info(f"User {user.id} sent potential URL: {url}")
|
| 589 |
-
|
| 590 |
-
# Store URL in user_data, associated with the user ID
|
| 591 |
context.user_data['url_to_summarize'] = url
|
| 592 |
logger.debug(f"Stored URL '{url}' in user_data for user {user.id}")
|
| 593 |
|
|
@@ -598,18 +559,13 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
|
|
| 598 |
]
|
| 599 |
]
|
| 600 |
reply_markup = InlineKeyboardMarkup(keyboard)
|
| 601 |
-
|
| 602 |
-
# Send message asking for summary type
|
| 603 |
await update.message.reply_text(
|
| 604 |
f"✅ Link received:\n`{url}`\n\nChoose your desired summary format:",
|
| 605 |
reply_markup=reply_markup,
|
| 606 |
parse_mode=ParseMode.MARKDOWN,
|
| 607 |
-
link_preview_options={'is_disabled': True}
|
| 608 |
)
|
| 609 |
else:
|
| 610 |
-
# If it doesn't look like a URL, maybe provide guidance?
|
| 611 |
-
# logger.debug(f"Ignoring non-URL message from {user.id}: {message_text[:100]}")
|
| 612 |
-
# Optional: Reply if it's not a command and not a URL
|
| 613 |
if not message_text.startswith('/'):
|
| 614 |
await update.message.reply_text("Please send me a valid URL (starting with http:// or https://) to summarize.")
|
| 615 |
|
|
@@ -619,53 +575,42 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
|
|
| 619 |
query = update.callback_query
|
| 620 |
if not query or not query.from_user:
|
| 621 |
logger.warning("Callback query or user missing in update.")
|
| 622 |
-
return
|
| 623 |
user = query.from_user
|
| 624 |
|
| 625 |
-
# --- Answer Callback Query Immediately ---
|
| 626 |
try:
|
| 627 |
-
await query.answer()
|
| 628 |
logger.debug(f"Answered callback query {query.id} for user {user.id}")
|
| 629 |
except TimedOut:
|
| 630 |
-
# Log timeout but proceed; the button loading indicator might just hang for the user
|
| 631 |
logger.warning(f"Timeout answering callback query {query.id} for user {user.id}. Processing continues.")
|
| 632 |
except Exception as e:
|
| 633 |
-
# Log other errors but proceed cautiously. The button might remain "loading".
|
| 634 |
logger.error(f"Error answering callback query {query.id} for user {user.id}: {e!r}", exc_info=True)
|
| 635 |
|
| 636 |
-
summary_type = query.data
|
| 637 |
-
# Retrieve URL stored earlier for this user
|
| 638 |
url = context.user_data.get('url_to_summarize')
|
| 639 |
logger.info(f"User {user.id} chose summary type '{summary_type}'. Checking for stored URL.")
|
| 640 |
|
| 641 |
if not url:
|
| 642 |
logger.warning(f"User {user.id} pressed button '{summary_type}', but NO URL found in user_data context.")
|
| 643 |
try:
|
| 644 |
-
# Inform user context was lost (e.g., bot restarted, long delay)
|
| 645 |
await query.edit_message_text(text="⚠️ Oops! I lost the context for that link. Please send the link again.")
|
| 646 |
except TimedOut:
|
| 647 |
logger.error(f"Timeout trying to edit message to inform user {user.id} about lost context.")
|
| 648 |
except Exception as edit_err:
|
| 649 |
-
# Log error if editing fails (message might already be gone, or other Telegram issue)
|
| 650 |
logger.error(f"Failed to edit message for lost context for user {user.id}: {edit_err}")
|
| 651 |
-
return
|
| 652 |
|
| 653 |
-
# --- URL Found - Proceed with Processing ---
|
| 654 |
logger.info(f"Processing URL '{url}' for user {user.id} with type '{summary_type}'.")
|
| 655 |
-
# Clear the URL from context now that we're processing it
|
| 656 |
context.user_data.pop('url_to_summarize', None)
|
| 657 |
logger.debug(f"Cleared URL from user_data for user {user.id}")
|
| 658 |
|
| 659 |
-
# Fetch current API keys (allows for potential runtime changes, though unlikely here)
|
| 660 |
current_openrouter_key = os.environ.get('OPENROUTER_API_KEY')
|
| 661 |
current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
|
| 662 |
current_supadata_key = os.environ.get('SUPADATA_API_KEY')
|
| 663 |
current_apify_token = os.environ.get('APIFY_API_TOKEN')
|
| 664 |
-
# Simple check log
|
| 665 |
keys_present = f"OR={'Y' if current_openrouter_key else 'N'}, UTT={'Y' if current_urltotext_key else 'N'}, SD={'Y' if current_supadata_key else 'N'}, AP={'Y' if current_apify_token else 'N'}"
|
| 666 |
logger.debug(f"API Key check for user {user.id} request: {keys_present}")
|
| 667 |
|
| 668 |
-
# Critical dependency check: AI key
|
| 669 |
if not current_openrouter_key:
|
| 670 |
logger.error(f"CRITICAL: OpenRouter API key is missing. Cannot generate summary for user {user.id}.")
|
| 671 |
try:
|
|
@@ -676,47 +621,39 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
|
|
| 676 |
logger.error(f"Failed to edit message for missing AI key for user {user.id}: {edit_err}")
|
| 677 |
return
|
| 678 |
|
| 679 |
-
# --- Inform User Processing Has Started ---
|
| 680 |
processing_message_text = f"⏳ Working on your '{summary_type}' summary for the link...\n_(This might take up to a minute depending on the content)_"
|
| 681 |
-
message_to_edit = query.message
|
| 682 |
-
status_message_sent = None
|
| 683 |
|
| 684 |
try:
|
| 685 |
if message_to_edit:
|
| 686 |
await query.edit_message_text(text=processing_message_text)
|
| 687 |
logger.debug(f"Edited original message {message_to_edit.message_id} to show 'Working...' status for query {query.id}")
|
| 688 |
else:
|
| 689 |
-
# This case should be rare if query.message exists, but handle defensively
|
| 690 |
logger.warning(f"Original message (query.message) not found for query {query.id}. Cannot edit, will send new status message.")
|
| 691 |
-
raise ValueError("Original message object missing")
|
| 692 |
except (TimedOut, Exception) as e:
|
| 693 |
-
# If editing fails (e.g., message too old, deleted, rate limit), try sending a new message
|
| 694 |
logger.warning(f"Could not edit original message {message_to_edit.message_id if message_to_edit else 'N/A'} for query {query.id}: {e!r}. Attempting to send a new status message.")
|
| 695 |
-
message_to_edit = None
|
| 696 |
try:
|
| 697 |
status_message_sent = await context.bot.send_message(chat_id=user.id, text=processing_message_text)
|
| 698 |
logger.debug(f"Sent new status message {status_message_sent.message_id} to user {user.id}.")
|
| 699 |
except TimedOut:
|
| 700 |
logger.error(f"Timeout sending NEW 'Working...' status message to user {user.id}. Processing continues without feedback.")
|
| 701 |
-
# User won't know bot is working - proceed anyway, hope for the best.
|
| 702 |
except Exception as send_err:
|
| 703 |
logger.error(f"Failed sending NEW 'Working...' status message to user {user.id}: {send_err}. Processing continues without feedback.")
|
| 704 |
-
# As above.
|
| 705 |
|
| 706 |
-
# --- Main Content Fetching and Summarization ---
|
| 707 |
content = None
|
| 708 |
-
user_feedback_message = None
|
| 709 |
-
success = False
|
| 710 |
|
| 711 |
try:
|
| 712 |
-
# Send 'typing' action to indicate activity
|
| 713 |
try:
|
| 714 |
logger.debug(f"Sending 'typing' chat action to chat {user.id}")
|
| 715 |
await context.bot.send_chat_action(chat_id=user.id, action='typing')
|
| 716 |
except TimedOut: logger.warning(f"Timeout sending 'typing' action for user {user.id}.")
|
| 717 |
except Exception as ca_err: logger.warning(f"Failed sending 'typing' action for user {user.id}: {ca_err}")
|
| 718 |
|
| 719 |
-
# --- Determine Content Type and Fetch ---
|
| 720 |
is_yt = is_youtube_url(url)
|
| 721 |
logger.debug(f"URL ({url}) is YouTube: {is_yt} (User: {user.id})")
|
| 722 |
|
|
@@ -734,18 +671,15 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
|
|
| 734 |
logger.warning(f"Failed to extract YouTube video ID from URL: {url} (User: {user.id})")
|
| 735 |
user_feedback_message = "⚠️ Sorry, I couldn't identify a valid YouTube video ID in the link you provided."
|
| 736 |
else:
|
| 737 |
-
# --- Website Scraping ---
|
| 738 |
logger.info(f"Attempting website scrape (Requests/BS4) for URL: {url} (User: {user.id})")
|
| 739 |
content = await get_website_content_via_requests(url)
|
| 740 |
if content:
|
| 741 |
logger.info(f"Website scrape successful (Requests/BS4). Length: {len(content)} (User: {user.id})")
|
| 742 |
-
# Content found, no need for feedback message yet
|
| 743 |
else:
|
| 744 |
logger.warning(f"Primary website scrape failed for {url} (User: {user.id}). Trying fallback API.")
|
| 745 |
if current_urltotext_key:
|
| 746 |
-
# Send typing again if first scrape failed and we try another method
|
| 747 |
try: await context.bot.send_chat_action(chat_id=user.id, action='typing'); logger.debug("Sent typing before fallback scrape.")
|
| 748 |
-
except: pass
|
| 749 |
|
| 750 |
logger.info(f"Attempting website scrape via URLToText API for: {url} (User: {user.id})")
|
| 751 |
content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
|
|
@@ -755,51 +689,44 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
|
|
| 755 |
logger.warning(f"Fallback website scrape (URLToText API) also failed for {url} (User: {user.id}).")
|
| 756 |
user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website using available methods. It might be protected or structured in a way I can't parse."
|
| 757 |
else:
|
| 758 |
-
# Fallback key missing
|
| 759 |
logger.warning(f"Primary scrape failed and URLToText API key not configured. Cannot fallback for {url} (User: {user.id}).")
|
| 760 |
user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website, and the fallback service isn't configured."
|
| 761 |
|
| 762 |
-
# --- Generate Summary if Content Was Fetched ---
|
| 763 |
if content:
|
| 764 |
logger.info(f"Content fetched (Length: {len(content)}). Generating '{summary_type}' summary for user {user.id}.")
|
| 765 |
-
# Send typing before potentially long AI call
|
| 766 |
try: await context.bot.send_chat_action(chat_id=user.id, action='typing'); logger.debug("Sent typing before AI summary generation.")
|
| 767 |
except: pass
|
| 768 |
|
| 769 |
summary = await generate_summary(content, summary_type, current_openrouter_key)
|
| 770 |
|
| 771 |
-
# Check if summary generation returned an error message
|
| 772 |
if summary.startswith("Error:") or summary.startswith("Sorry,"):
|
| 773 |
logger.warning(f"AI summary generation failed for user {user.id}. Reason: {summary}")
|
| 774 |
-
user_feedback_message = f"⚠️ {summary}"
|
| 775 |
else:
|
| 776 |
-
# --- Summary Success - Send to User ---
|
| 777 |
logger.info(f"Summary generated successfully for user {user.id}. Length: {len(summary)}. Sending result.")
|
| 778 |
try:
|
| 779 |
await context.bot.send_message(
|
| 780 |
chat_id=user.id,
|
| 781 |
text=summary,
|
| 782 |
-
parse_mode=ParseMode.MARKDOWN,
|
| 783 |
link_preview_options={'is_disabled': True}
|
| 784 |
)
|
| 785 |
success = True
|
| 786 |
-
user_feedback_message = None
|
| 787 |
logger.info(f"Successfully sent summary to user {user.id}.")
|
| 788 |
except TimedOut:
|
| 789 |
logger.error(f"Timeout sending final summary message to user {user.id}.")
|
| 790 |
user_feedback_message = "⚠️ Sorry, there was a timeout while trying to send you the final summary."
|
| 791 |
-
success = False
|
| 792 |
except Exception as send_final_err:
|
| 793 |
logger.error(f"Failed sending final summary to user {user.id}: {send_final_err}", exc_info=True)
|
| 794 |
user_feedback_message = "⚠️ Sorry, an unexpected error occurred while sending the final summary."
|
| 795 |
-
success = False
|
| 796 |
|
| 797 |
elif not user_feedback_message:
|
| 798 |
-
# If content is None, but no specific error message was set above, set a generic one.
|
| 799 |
logger.warning(f"Content retrieval resulted in None, but no specific user feedback message was set. URL: {url} (User: {user.id})")
|
| 800 |
user_feedback_message = "⚠️ Sorry, I couldn't retrieve any usable content from the link provided."
|
| 801 |
|
| 802 |
-
# --- Send Final Feedback Message if Processing Failed ---
|
| 803 |
if user_feedback_message and not success:
|
| 804 |
logger.warning(f"Processing failed or summary sending failed for user {user.id}. Sending feedback: {user_feedback_message}")
|
| 805 |
try:
|
|
@@ -810,114 +737,79 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
|
|
| 810 |
logger.error(f"Failed sending final FAILURE feedback message to user {user.id}: {send_feedback_err}")
|
| 811 |
|
| 812 |
except Exception as e:
|
| 813 |
-
# Catch-all for unexpected errors during the main processing block
|
| 814 |
logger.error(f"Unexpected critical error during callback processing for user {user.id}, URL {url}: {e}", exc_info=True)
|
| 815 |
try:
|
| 816 |
-
# Send a generic error message to the user
|
| 817 |
await context.bot.send_message(chat_id=user.id, text="❌ Oops! An unexpected internal error occurred while processing your request. The issue has been logged.")
|
| 818 |
except TimedOut:
|
| 819 |
logger.error(f"Timeout sending CRITICAL internal error feedback message to user {user.id}.")
|
| 820 |
except Exception as final_err:
|
| 821 |
-
# If even sending the error message fails, log it.
|
| 822 |
logger.error(f"Failed sending CRITICAL internal error feedback message to user {user.id}: {final_err}")
|
| 823 |
-
# Ensure success is False if we hit this block
|
| 824 |
success = False
|
| 825 |
|
| 826 |
finally:
|
| 827 |
-
# --- Clean up Status Message(s) ---
|
| 828 |
logger.debug(f"Cleaning up status message(s) for user {user.id}, query {query.id}. Success={success}")
|
| 829 |
try:
|
| 830 |
if status_message_sent:
|
| 831 |
-
# If we sent a separate "Working..." message, delete it regardless of success/failure
|
| 832 |
-
# as the final result or error message has been (or attempted to be) sent.
|
| 833 |
await context.bot.delete_message(chat_id=user.id, message_id=status_message_sent.message_id)
|
| 834 |
logger.debug(f"Deleted separate status message {status_message_sent.message_id} for user {user.id}.")
|
| 835 |
elif message_to_edit:
|
| 836 |
-
# If we edited the original message with the buttons...
|
| 837 |
if success:
|
| 838 |
-
# If processing succeeded, delete the "Working..." message.
|
| 839 |
await query.delete_message()
|
| 840 |
logger.debug(f"Processing succeeded. Deleted original (edited) message {message_to_edit.message_id} for query {query.id}.")
|
| 841 |
else:
|
| 842 |
-
# If processing failed, *don't* delete the message.
|
| 843 |
-
# It either still shows "Working..." (if sending final error failed)
|
| 844 |
-
# or it might show an error message if edit_message_text was used for that.
|
| 845 |
-
# Let's try to edit it one last time to show a generic failure if no specific feedback was sent.
|
| 846 |
-
# This is complex, maybe just leave it as is for simplicity.
|
| 847 |
logger.debug(f"Processing failed. Leaving edited message {message_to_edit.message_id} in place for query {query.id}.")
|
| 848 |
-
# Optional: Try one last edit to show failure if needed, but might be overkill
|
| 849 |
-
# if not user_feedback_message: # Only if no other error was sent
|
| 850 |
-
# try: await query.edit_message_text("❌ Processing failed.")
|
| 851 |
-
# except: pass # Ignore errors here
|
| 852 |
-
|
| 853 |
-
# If message_to_edit was None (original edit failed) and status_message_sent was None (sending new status failed), there's nothing to delete here.
|
| 854 |
|
| 855 |
except TimedOut:
|
| 856 |
logger.warning(f"Timeout attempting to delete status/button message for user {user.id}, query {query.id}.")
|
| 857 |
except Exception as del_e:
|
| 858 |
-
# Log deletion errors as warnings, not critical if cleanup fails.
|
| 859 |
-
# Common error: message already deleted or trying to delete too late.
|
| 860 |
logger.warning(f"Could not delete status/button message for user {user.id}, query {query.id}: {del_e!r}")
|
| 861 |
|
| 862 |
-
# Log the completion of the callback handling
|
| 863 |
logger.info(f"Finished handling callback query {query.id} for user {user.id}. Overall Success: {success}")
|
| 864 |
|
| 865 |
|
| 866 |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Global PTB error callback: record any exception raised while handling an update.

    Args:
        update: The object that was being processed when the error occurred
            (not inspected here).
        context: Callback context; ``context.error`` holds the exception.

    The exception is always logged at ERROR level with its traceback. Timeouts
    and other network errors additionally get a short WARNING line so they are
    easy to spot in the logs.
    """
    err = context.error
    logger.error(f"Exception while handling an update: {err}", exc_info=err)

    # Keep the TimedOut check ahead of the NetworkError check, as in the
    # original ordering (presumably TimedOut derives from NetworkError in
    # python-telegram-bot -- confirm against the installed version).
    if isinstance(err, TimedOut):
        logger.warning("A timeout error occurred in PTB communication.")
        return
    if isinstance(err, NetworkError):
        logger.warning(f"A network error occurred: {err}")
    # Extension point: notify an admin or the user for specific critical errors if appropriate.
|
| 875 |
|
| 876 |
-
# --- Bot Setup Function ---
|
| 877 |
async def setup_bot_config() -> Application:
|
| 878 |
-
"""Configures the PTB Application with custom HTTPX settings."""
|
| 879 |
logger.info("Configuring Telegram Application...")
|
| 880 |
if not TELEGRAM_TOKEN:
|
| 881 |
logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable not found.")
|
| 882 |
raise ValueError("TELEGRAM_TOKEN environment variable not set.")
|
| 883 |
|
| 884 |
-
# --- Configure HTTPX client settings ---
|
| 885 |
connect_timeout = 10.0 # Slightly higher connect timeout
|
| 886 |
-
# --- INCREASED TIMEOUTS AND POOL SIZE ---
|
| 887 |
read_timeout = 30.0 # Increased timeout for reading response
|
| 888 |
write_timeout = 30.0 # Increased timeout for sending request
|
| 889 |
pool_timeout = 30.0 # Increased timeout for getting connection from pool
|
| 890 |
-
|
|
|
|
| 891 |
|
| 892 |
-
logger.info(f"Creating PTB HTTPXRequest with settings: "
|
| 893 |
f"connect_timeout={connect_timeout}, read_timeout={read_timeout}, "
|
| 894 |
-
f"write_timeout={write_timeout}, pool_timeout={pool_timeout}
|
| 895 |
-
f"
|
| 896 |
-
|
| 897 |
-
# Create httpx.Limits object
|
| 898 |
-
custom_limits = httpx.Limits(
|
| 899 |
-
max_connections=connection_pool_size,
|
| 900 |
-
max_keepalive_connections=connection_pool_size # Keepalive same as max
|
| 901 |
-
# keepalive_expiry=60.0 # Optional: Keep idle connections open longer (seconds)
|
| 902 |
-
)
|
| 903 |
|
| 904 |
-
# Create a custom request object with
|
| 905 |
custom_request = HTTPXRequest(
|
| 906 |
connect_timeout=connect_timeout,
|
| 907 |
read_timeout=read_timeout,
|
| 908 |
write_timeout=write_timeout,
|
| 909 |
pool_timeout=pool_timeout,
|
| 910 |
-
limits=custom_limits,
|
| 911 |
-
http_version="1.1"
|
| 912 |
)
|
| 913 |
|
| 914 |
# Use Application.builder() and pass the custom request object
|
| 915 |
application_builder = Application.builder().token(TELEGRAM_TOKEN)
|
| 916 |
application_builder.request(custom_request)
|
| 917 |
-
#
|
| 918 |
-
# application_builder.get_updates_request(custom_request)
|
| 919 |
-
# Apply connection pool settings globally if needed (less common now with direct request object)
|
| 920 |
-
# application_builder.pool_timeout(pool_timeout) # This might be redundant if set in HTTPXRequest
|
| 921 |
|
| 922 |
# Build the application instance
|
| 923 |
application = application_builder.build()
|
|
@@ -925,11 +817,8 @@ async def setup_bot_config() -> Application:
|
|
| 925 |
# --- Register Handlers ---
|
| 926 |
application.add_handler(CommandHandler("start", start))
|
| 927 |
application.add_handler(CommandHandler("help", help_command))
|
| 928 |
-
# Handles non-command text messages that might contain a URL
|
| 929 |
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
|
| 930 |
-
# Handles the button clicks ('paragraph' or 'points')
|
| 931 |
application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
|
| 932 |
-
# Global error handler
|
| 933 |
application.add_error_handler(error_handler)
|
| 934 |
|
| 935 |
logger.info("Telegram application handlers configured.")
|
|
@@ -941,51 +830,38 @@ async def lifespan(app: Starlette):
|
|
| 941 |
"""Handles PTB startup and shutdown during ASGI lifespan."""
|
| 942 |
global ptb_app
|
| 943 |
logger.info("ASGI Lifespan: Startup sequence initiated...")
|
| 944 |
-
# loop = asyncio.get_running_loop() # Not usually needed directly
|
| 945 |
|
| 946 |
try:
|
| 947 |
-
# --- Setup and Initialize PTB Application ---
|
| 948 |
ptb_app = await setup_bot_config()
|
| 949 |
logger.info("PTB Application object configured. Initializing...")
|
| 950 |
-
await ptb_app.initialize()
|
| 951 |
logger.info("PTB Application initialized. Starting background tasks (e.g., job queue)...")
|
| 952 |
-
# Start PTB's internal tasks but not polling (we use webhook)
|
| 953 |
await ptb_app.start()
|
| 954 |
-
if ptb_app.updater: ptb_app.updater.stop() # Ensure polling is stopped
|
| 955 |
bot_instance = ptb_app.bot
|
| 956 |
bot_info = await bot_instance.get_me()
|
| 957 |
logger.info(f"PTB Application started successfully. Bot ID: {bot_info.id}, Username: @{bot_info.username}")
|
| 958 |
|
| 959 |
-
# --- Set Webhook ---
|
| 960 |
-
# Ensure SPACE_HOST is correctly set in Hugging Face Space secrets
|
| 961 |
WEBHOOK_URL_BASE = os.environ.get("SPACE_HOST")
|
| 962 |
if WEBHOOK_URL_BASE:
|
| 963 |
-
# Ensure it's a proper HTTPS URL
|
| 964 |
if not WEBHOOK_URL_BASE.startswith("https://"): WEBHOOK_URL_BASE = f"https://{WEBHOOK_URL_BASE}"
|
| 965 |
-
webhook_path = "/webhook"
|
| 966 |
full_webhook_url = f"{WEBHOOK_URL_BASE.rstrip('/')}{webhook_path}"
|
| 967 |
|
| 968 |
logger.info(f"Attempting to set Telegram webhook to: {full_webhook_url}")
|
| 969 |
-
# Short delay can sometimes help prevent race conditions on startup
|
| 970 |
await asyncio.sleep(2.0)
|
| 971 |
try:
|
| 972 |
-
# Set the webhook, specifying allowed updates can reduce load
|
| 973 |
await bot_instance.set_webhook(
|
| 974 |
url=full_webhook_url,
|
| 975 |
-
allowed_updates=Update.ALL_TYPES,
|
| 976 |
-
# secret_token="YOUR_SECRET_TOKEN" # Recommended for security if possible
|
| 977 |
-
# drop_pending_updates=True # Optional: Ignore updates sent while bot was down
|
| 978 |
)
|
| 979 |
-
# Verify webhook setup
|
| 980 |
webhook_info = await bot_instance.get_webhook_info()
|
| 981 |
if webhook_info.url == full_webhook_url:
|
| 982 |
logger.info(f"Telegram webhook set successfully! Current info: {webhook_info}")
|
| 983 |
else:
|
| 984 |
logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got: {webhook_info.url}. Info: {webhook_info}")
|
| 985 |
except RetryAfter as e:
|
| 986 |
-
# This can happen if multiple workers try to set the webhook simultaneously
|
| 987 |
logger.warning(f"Webhook setting throttled by Telegram (RetryAfter: {e.retry_after}s). Another instance likely succeeded or try again later.")
|
| 988 |
-
# Optionally check info again after delay
|
| 989 |
await asyncio.sleep(e.retry_after or 2)
|
| 990 |
webhook_info = await bot_instance.get_webhook_info()
|
| 991 |
logger.info(f"Webhook info after RetryAfter delay: {webhook_info}")
|
|
@@ -999,29 +875,24 @@ async def lifespan(app: Starlette):
|
|
| 999 |
|
| 1000 |
except Exception as startup_err:
|
| 1001 |
logger.critical(f"CRITICAL ERROR during ASGI application startup: {startup_err}", exc_info=True)
|
| 1002 |
-
# Re-raise the exception to potentially stop the ASGI server from starting improperly
|
| 1003 |
raise
|
| 1004 |
finally:
|
| 1005 |
-
# --- Shutdown Sequence ---
|
| 1006 |
logger.info("ASGI Lifespan: Shutdown sequence initiated...")
|
| 1007 |
if ptb_app:
|
| 1008 |
bot_username = ptb_app.bot.username if ptb_app.bot else "N/A"
|
| 1009 |
logger.info(f"PTB App instance found for @{bot_username}. Checking if running...")
|
| 1010 |
-
# Check internal state if available (e.g., ptb_app.running might exist in future versions)
|
| 1011 |
-
# Using _running is internal, but often the only way
|
| 1012 |
is_running = getattr(ptb_app, '_running', False) or getattr(ptb_app, 'running', False)
|
| 1013 |
if is_running:
|
| 1014 |
try:
|
| 1015 |
logger.info("Stopping PTB Application's background tasks...")
|
| 1016 |
-
await ptb_app.stop()
|
| 1017 |
logger.info("Shutting down PTB Application connections and resources...")
|
| 1018 |
-
await ptb_app.shutdown()
|
| 1019 |
logger.info("PTB Application shut down gracefully.")
|
| 1020 |
except Exception as shutdown_err:
|
| 1021 |
logger.error(f"Error during PTB Application shutdown: {shutdown_err}", exc_info=True)
|
| 1022 |
else:
|
| 1023 |
logger.warning("PTB Application instance exists but was not marked as running at shutdown.")
|
| 1024 |
-
# Attempt shutdown anyway just in case resources need cleaning
|
| 1025 |
try: await ptb_app.shutdown()
|
| 1026 |
except Exception: logger.error("Error during shutdown of non-running PTB app.", exc_info=True)
|
| 1027 |
else:
|
|
@@ -1030,8 +901,6 @@ async def lifespan(app: Starlette):
|
|
| 1030 |
|
| 1031 |
|
| 1032 |
# --- Flask App Setup (for Webhook Route) ---
|
| 1033 |
-
# We use Flask just for its familiarity in defining the route,
|
| 1034 |
-
# but it runs within Starlette's ASGI context via WSGIMiddleware.
|
| 1035 |
flask_core_app = Flask(__name__)
|
| 1036 |
logger.info("Core Flask app instance created (used by Starlette for routing).")
|
| 1037 |
|
|
@@ -1042,7 +911,6 @@ def index():
|
|
| 1042 |
logger.debug("Health check endpoint '/' accessed.")
|
| 1043 |
bot_status = "Unknown / Not Initialized"
|
| 1044 |
if ptb_app and ptb_app.bot:
|
| 1045 |
-
# Check internal state again (might have changed)
|
| 1046 |
is_running = getattr(ptb_app, '_running', False) or getattr(ptb_app, 'running', False)
|
| 1047 |
bot_status = f"Running (@{ptb_app.bot.username})" if is_running else f"Initialized/Stopped (@{ptb_app.bot.username})"
|
| 1048 |
return f"Telegram Bot Summarizer - Status: {bot_status} - Listening via Starlette/Uvicorn."
|
|
@@ -1050,60 +918,43 @@ def index():
|
|
| 1050 |
@flask_core_app.route('/webhook', methods=['POST'])
async def webhook() -> Response:
    """Webhook endpoint called by Telegram.

    Parses the POSTed JSON body into a Telegram ``Update`` and passes it to the
    PTB application, which dispatches it to the registered handlers.

    Returns:
        Response: ``200 'ok'`` once the update has been processed;
        ``400`` when the body is empty or is not valid JSON;
        ``503`` when the PTB application is missing or not running;
        ``500`` when deserializing or processing the update fails
        (Telegram will likely retry the delivery later).
    """
    # ptb_app is only read here, so no `global` declaration is required.
    if not ptb_app:
        logger.error("Webhook triggered, but PTB Application instance (ptb_app) is None. Lifespan likely failed.")
        return Response('Bot service is not configured or failed during startup.', status=503)

    # An existing ptb_app does not imply it is running. `_running` is a private
    # PTB attribute; the public `running` name is probed as well.
    is_running = getattr(ptb_app, '_running', False) or getattr(ptb_app, 'running', False)
    if not is_running:
        logger.error("Webhook triggered, but PTB Application is not currently running.")
        return Response('Bot service is initialized but not actively running.', status=503)

    logger.debug("Webhook endpoint received POST request from Telegram.")
    try:
        # Flask's request.get_json() is a plain synchronous call that returns the
        # parsed object; awaiting its result raises TypeError, which previously
        # turned every webhook delivery into a 500. silent=True makes malformed
        # or non-JSON bodies come back as None so they are answered with 400
        # below instead of escaping as an exception.
        update_data = request.get_json(silent=True)
        if not update_data:
            logger.warning("Received empty or non-JSON data on webhook.")
            return Response('Bad Request: Expected JSON payload.', status=400)

        # Deserialize the JSON payload into a Telegram Update bound to our bot.
        update = Update.de_json(update_data, ptb_app.bot)
        logger.debug(f"Processing update_id: {update.update_id} via webhook route.")

        # NOTE(review): this coroutine runs inside a Flask async view behind
        # WSGIMiddleware, which may not be the event loop ptb_app was started
        # on -- confirm this is safe for the deployed setup.
        await ptb_app.process_update(update)

        logger.debug(f"Finished processing update_id: {update.update_id}")
        # Acknowledge receipt so Telegram does not redeliver the update.
        return Response('ok', status=200)

    except Exception as e:
        # Boundary handler: covers failures in Update.de_json and process_update.
        logger.error(f"Error processing update in webhook handler: {e}", exc_info=True)
        return Response('Internal Server Error processing update.', status=500)
|
| 1097 |
|
| 1098 |
|
| 1099 |
# --- Create Starlette ASGI Application ---
|
| 1100 |
-
# This is the main application object that Uvicorn/Gunicorn will run.
|
| 1101 |
# Main ASGI application object served by Uvicorn/Gunicorn.
# The Flask app is wrapped in WSGIMiddleware and mounted at the root path, so
# Starlette forwards requests such as '/' and '/webhook' to it, while the
# lifespan hook drives PTB startup and shutdown.
app = Starlette(
    routes=[Mount("/", app=WSGIMiddleware(flask_core_app))],
    lifespan=lifespan,
    debug=False,
)
|
|
@@ -1111,9 +962,6 @@ logger.info("Starlette ASGI application created, configured with lifespan and Fl
|
|
| 1111 |
|
| 1112 |
|
| 1113 |
# --- Development Server Execution Block ---
|
| 1114 |
-
# This block is ONLY for running the Flask app directly for basic testing
|
| 1115 |
-
# WITHOUT the proper ASGI lifespan management (PTB won't start correctly here).
|
| 1116 |
-
# DO NOT use this for deployment. Use `gunicorn main:app` or `uvicorn main:app`.
|
| 1117 |
if __name__ == '__main__':
|
| 1118 |
logger.warning("=" * 50)
|
| 1119 |
logger.warning(" RUNNING SCRIPT DIRECTLY (using __main__) ".center(50, "="))
|
|
@@ -1129,9 +977,6 @@ if __name__ == '__main__':
|
|
| 1129 |
if not TELEGRAM_TOKEN:
|
| 1130 |
logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable missing. Aborting direct Flask start.")
|
| 1131 |
else:
|
| 1132 |
-
# Get port from environment or default to 8080 for local dev
|
| 1133 |
local_port = int(os.environ.get('PORT', 8080))
|
| 1134 |
logger.info(f"Starting Flask development server on http://0.0.0.0:{local_port}")
|
| 1135 |
-
# Run the Flask app directly (no Starlette, no lifespan, no PTB)
|
| 1136 |
-
# use_reloader=False is important if debugging PTB setup elsewhere
|
| 1137 |
flask_core_app.run(host='0.0.0.0', port=local_port, debug=True, use_reloader=False)
|
|
|
|
| 1 |
+
# main.py (Revised: Corrected HTTPXRequest init for PTB v20 + Robust Callback Handling)
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
import logging
|
|
|
|
| 28 |
from telegram.request import HTTPXRequest # Import the request class
|
| 29 |
|
| 30 |
# --- Other Libraries ---
|
| 31 |
+
import httpx # Keep import, might be useful elsewhere or if upgrading PTB later
|
| 32 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 33 |
import requests
|
| 34 |
from bs4 import BeautifulSoup
|
|
|
|
| 62 |
# --- Environment Variable Loading ---
|
| 63 |
logger.info("Attempting to load secrets...")
|
| 64 |
def get_secret(secret_name):
|
|
|
|
| 65 |
value = os.environ.get(secret_name)
|
| 66 |
if value: logger.info(f"Secret '{secret_name}': Found (Value length: {len(value)})")
|
| 67 |
else: logger.warning(f"Secret '{secret_name}': Not Found")
|
|
|
|
| 79 |
# (Keep ALL your functions: is_youtube_url, extract_youtube_id,
|
| 80 |
# get_transcript_via_supadata, get_transcript_via_apify,
|
| 81 |
# get_youtube_transcript, get_website_content_via_requests,
|
| 82 |
+
# get_website_content_via_urltotext_api, generate_summary - unchanged from previous version)
|
| 83 |
|
| 84 |
# Helper Functions
|
| 85 |
def is_youtube_url(url):
|
|
|
|
| 111 |
params = {"videoId": video_id, "format": "text"}
|
| 112 |
headers = {"X-API-Key": api_key}
|
| 113 |
try:
|
|
|
|
| 114 |
logger.warning("[Supadata] Making request with verify=False (Attempting to bypass SSL verification - Potential Security Risk)")
|
| 115 |
response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30, verify=False)
|
| 116 |
logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
|
|
|
|
| 282 |
if not url: logger.error("[Web Scraper - Requests/BS4] get_website_content_via_requests called with no URL"); return None
|
| 283 |
logger.info(f"[Web Scraper - Requests/BS4] Attempting fetch: {url}")
|
| 284 |
try:
|
|
|
|
| 285 |
headers = {
|
| 286 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
|
| 287 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
|
| 288 |
'Accept-Language': 'en-US,en;q=0.9',
|
| 289 |
'Connection': 'keep-alive',
|
| 290 |
+
'DNT': '1',
|
| 291 |
'Upgrade-Insecure-Requests': '1'
|
| 292 |
}
|
| 293 |
response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
|
| 294 |
+
response.raise_for_status()
|
| 295 |
logger.debug(f"[Web Scraper - Requests/BS4] Status {response.status_code} for {url}")
|
| 296 |
|
| 297 |
content_type = response.headers.get('content-type', '').lower()
|
| 298 |
if 'html' not in content_type:
|
| 299 |
logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received: {content_type}. Attempting plain text extraction.")
|
|
|
|
| 300 |
if 'text/plain' in content_type and response.text:
|
| 301 |
logger.info(f"[Web Scraper - Requests/BS4] Extracted plain text content. Length: {len(response.text.strip())}")
|
| 302 |
return response.text.strip()
|
|
|
|
| 304 |
return None
|
| 305 |
|
| 306 |
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
|
|
|
| 307 |
tags_to_remove = ["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio", "picture", "source"]
|
|
|
|
| 308 |
selectors_to_remove = ['.ad', '#ad', '.ads', '#ads', '.advertisement', '#advertisement', '.banner', '#banner', '.menu', '#menu', '.navigation', '#navigation', '.sidebar', '#sidebar', '.social', '#social', '.share', '#share', '.related', '#related', '.comments', '#comments', '.cookie-consent', '#cookie-consent']
|
|
|
|
| 309 |
for tag in soup(tags_to_remove): tag.decompose()
|
| 310 |
for selector in selectors_to_remove:
|
| 311 |
for element in soup.select(selector): element.decompose()
|
| 312 |
|
| 313 |
+
main_content = soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(class_='content') or soup.find(id='main-content') or soup.find(class_='main-content') or soup.find(role='main')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
target_element = main_content if main_content else soup.body
|
| 315 |
if not target_element:
|
| 316 |
logger.warning(f"[Web Scraper - Requests/BS4] Could not find a suitable target element (main, article, body) for {url}");
|
| 317 |
return None
|
| 318 |
|
|
|
|
| 319 |
lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
|
| 320 |
+
text = "\n\n".join(lines)
|
| 321 |
|
| 322 |
+
MIN_TEXT_LENGTH = 100
|
| 323 |
if not text or len(text) < MIN_TEXT_LENGTH:
|
| 324 |
logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is too short (<{MIN_TEXT_LENGTH} chars) after cleaning for {url}. Length: {len(text)}. Content might be JS-rendered or blocked.")
|
| 325 |
+
return None
|
|
|
|
| 326 |
|
| 327 |
logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped and cleaned content from {url}. Final Length: {len(text)}")
|
| 328 |
return text
|
|
|
|
| 340 |
if not api_key: logger.error("[Web Scraper - URLToText API] API key is missing."); return None
|
| 341 |
logger.info(f"[Web Scraper - URLToText API] Attempting fetch via API: {url}")
|
| 342 |
api_endpoint = "https://urltotext.com/api/v1/urltotext/"
|
|
|
|
| 343 |
payload = json.dumps({
|
| 344 |
"url": url,
|
| 345 |
"output_format": "text",
|
| 346 |
+
"extract_main_content": True,
|
| 347 |
+
"render_javascript": True,
|
| 348 |
+
"residential_proxy": False,
|
| 349 |
+
"timeout_render": 20000,
|
| 350 |
})
|
| 351 |
headers = {"Authorization": f"Token {api_key}", "Content-Type": "application/json"}
|
| 352 |
try:
|
| 353 |
+
response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=60)
|
| 354 |
logger.debug(f"[Web Scraper - URLToText API] Received status code {response.status_code} for {url}")
|
| 355 |
if response.status_code == 200:
|
| 356 |
try:
|
|
|
|
| 359 |
content = content_data.get("content")
|
| 360 |
credits = data.get("credits_used", "N/A")
|
| 361 |
warning = content_data.get("warning")
|
| 362 |
+
error_msg = content_data.get("error")
|
| 363 |
|
| 364 |
if warning: logger.warning(f"[Web Scraper - URLToText API] API Warning for {url}: {warning}")
|
| 365 |
+
if error_msg: logger.error(f"[Web Scraper - URLToText API] API Error reported for {url}: {error_msg}"); return None
|
| 366 |
|
| 367 |
if content and isinstance(content, str):
|
| 368 |
logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API. Length: {len(content.strip())}. Credits Used: {credits}");
|
|
|
|
| 375 |
elif response.status_code == 400: logger.error(f"[Web Scraper - URLToText API] Bad Request (400) to API. Check payload/URL. Response: {response.text[:200]}...")
|
| 376 |
elif response.status_code == 401: logger.error(f"[Web Scraper - URLToText API] Unauthorized (401). Check API Key. Response: {response.text[:200]}...")
|
| 377 |
elif response.status_code == 402: logger.error(f"[Web Scraper - URLToText API] Payment Required (402). Check API credits/plan. Response: {response.text[:200]}...")
|
| 378 |
+
elif response.status_code == 422: logger.warning(f"[Web Scraper - URLToText API] Unprocessable URL / Fetch Error (422) reported by API for {url}. Response: {response.text[:200]}...")
|
| 379 |
elif response.status_code == 429: logger.warning(f"[Web Scraper - URLToText API] Rate Limit Hit (429). Response: {response.text[:200]}...")
|
| 380 |
elif response.status_code >= 500: logger.error(f"[Web Scraper - URLToText API] API Server Error ({response.status_code}). Response: {response.text[:200]}...")
|
| 381 |
else: logger.error(f"[Web Scraper - URLToText API] Unexpected status code {response.status_code} from API. Response: {response.text[:200]}...")
|
| 382 |
+
return None
|
| 383 |
except requests.exceptions.Timeout: logger.error(f"[Web Scraper - URLToText API] Timeout connecting to API for {url}"); return None
|
| 384 |
except requests.exceptions.RequestException as e: logger.error(f"[Web Scraper - URLToText API] Request error connecting to API: {e}"); return None
|
| 385 |
except Exception as e: logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call: {e}", exc_info=True); return None
|
|
|
|
| 392 |
if not text or not text.strip(): logger.warning("generate_summary called with empty or whitespace-only text."); return "Error: No content was provided to summarize."
|
| 393 |
|
| 394 |
openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
|
|
|
|
| 395 |
model_name = "deepseek/deepseek-chat:free"
|
|
|
|
| 396 |
|
|
|
|
| 397 |
if summary_type == "paragraph":
|
| 398 |
system_message = (
|
| 399 |
"You are an expert summarization AI. Your goal is to provide a concise, easy-to-understand summary of the provided text. "
|
|
|
|
| 427 |
else:
|
| 428 |
logger.error(f"Invalid summary_type '{summary_type}' requested.")
|
| 429 |
return f"Error: Invalid summary type ('{summary_type}') requested. Please choose 'paragraph' or 'points'."
|
|
|
|
| 430 |
|
| 431 |
+
MAX_INPUT_TOKENS_ESTIMATE = 28000
|
|
|
|
|
|
|
|
|
|
| 432 |
AVG_CHARS_PER_TOKEN = 4
|
| 433 |
MAX_INPUT_LENGTH = MAX_INPUT_TOKENS_ESTIMATE * AVG_CHARS_PER_TOKEN
|
| 434 |
|
|
|
|
| 437 |
truncation_marker = "\n\n[... Text truncated due to length ...]"
|
| 438 |
text = text[:MAX_INPUT_LENGTH - len(truncation_marker)] + truncation_marker
|
| 439 |
|
|
|
|
| 440 |
messages = [
|
| 441 |
{"role": "system", "content": system_message},
|
| 442 |
{"role": "user", "content": f"{user_prompt_instruction}\n\n--- TEXT TO SUMMARIZE ---\n\n{text}\n\n--- END OF TEXT ---"}
|
| 443 |
]
|
| 444 |
|
| 445 |
+
space_host = os.environ.get("SPACE_HOST", "huggingface.co/spaces/YOUR_SPACE_NAME")
|
|
|
|
| 446 |
referer_url = f"https://{space_host}" if space_host and not space_host.startswith("http") else space_host or "https://huggingface.co"
|
| 447 |
headers = {
|
| 448 |
"Authorization": f"Bearer {api_key}",
|
| 449 |
"Content-Type": "application/json",
|
| 450 |
"HTTP-Referer": referer_url,
|
| 451 |
+
"X-Title": "Telegram URL Summarizer Bot"
|
| 452 |
}
|
| 453 |
payload = json.dumps({"model": model_name, "messages": messages})
|
| 454 |
|
| 455 |
try:
|
| 456 |
logger.debug(f"Sending request to OpenRouter (Model: {model_name}). Prompt length approx: {len(text)} chars.")
|
|
|
|
| 457 |
response = await asyncio.to_thread(requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=120)
|
| 458 |
logger.debug(f"Received status {response.status_code} from OpenRouter.")
|
| 459 |
|
| 460 |
if response.status_code == 200:
|
| 461 |
try:
|
| 462 |
data = response.json()
|
|
|
|
| 463 |
choice = data.get("choices", [{}])[0]
|
| 464 |
message = choice.get("message", {})
|
| 465 |
summary = message.get("content")
|
|
|
|
| 468 |
if summary and isinstance(summary, str) and summary.strip():
|
| 469 |
summary = summary.strip()
|
| 470 |
logger.info(f"Successfully generated summary. Finish Reason: {finish_reason}. Length: {len(summary)}")
|
| 471 |
+
if summary_type == "paragraph" and len(summary.split()) > 95:
|
|
|
|
| 472 |
logger.warning(f"Generated paragraph summary slightly longer than target word count ({len(summary.split())} words).")
|
| 473 |
return summary
|
| 474 |
else:
|
|
|
|
| 482 |
logger.error(f"Unexpected error processing OpenRouter success response: {e}", exc_info=True)
|
| 483 |
return "Sorry, an unexpected error occurred while processing the AI response."
|
| 484 |
|
|
|
|
| 485 |
elif response.status_code == 401: logger.error("OpenRouter API key is invalid (Unauthorized - 401)."); return "Error: AI service authentication failed. Please check the configuration."
|
| 486 |
elif response.status_code == 402: logger.error("OpenRouter Payment Required (402). Check credits/limits."); return "Sorry, there's an issue with the AI service account limits or payment."
|
| 487 |
elif response.status_code == 429: logger.warning("OpenRouter Rate Limit Hit (429)."); return "Sorry, the AI model is currently busy due to high demand. Please try again in a moment."
|
| 488 |
elif response.status_code == 400: logger.error(f"OpenRouter Bad Request (400). Likely prompt issue. Response: {response.text[:500]}..."); return "Sorry, the request to the AI service was invalid (possibly due to the content or prompt)."
|
| 489 |
elif response.status_code >= 500: logger.error(f"OpenRouter Server Error ({response.status_code}). Response: {response.text[:500]}..."); return "Sorry, the AI service is experiencing internal issues. Please try again later."
|
| 490 |
else:
|
|
|
|
| 491 |
logger.error(f"Unexpected HTTP status {response.status_code} from OpenRouter. Response: {response.text[:500]}...")
|
| 492 |
+
try:
|
| 493 |
error_data = response.json()
|
| 494 |
error_msg = error_data.get("error", {}).get("message", response.text[:100])
|
| 495 |
return f"Sorry, the AI service returned an error ({response.status_code}): {error_msg}"
|
|
|
|
| 506 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
| 507 |
"""Handles the /start command."""
|
| 508 |
user = update.effective_user
|
| 509 |
+
if not user: return
|
| 510 |
logger.info(f"User {user.id} ({user.username or 'NoUsername'}) initiated /start.")
|
|
|
|
| 511 |
mention = user.mention_html() if user.username else user.first_name
|
| 512 |
start_message = (
|
| 513 |
f"👋 Hello {mention}!\n\n"
|
|
|
|
| 534 |
"- **Length:** Very long articles or videos might be truncated before summarization to fit within processing limits.\n\n"
|
| 535 |
"Just send a link to get started!"
|
| 536 |
)
|
|
|
|
| 537 |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
|
| 538 |
|
| 539 |
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
|
|
|
| 541 |
if not update.message or not update.message.text: return
|
| 542 |
message_text = update.message.text.strip()
|
| 543 |
user = update.effective_user
|
| 544 |
+
if not user: return
|
| 545 |
|
|
|
|
|
|
|
| 546 |
url_pattern = r'https?://(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}(?:/[^\s]*)?'
|
| 547 |
match = re.search(url_pattern, message_text)
|
| 548 |
|
| 549 |
if match:
|
| 550 |
url = match.group(0)
|
| 551 |
logger.info(f"User {user.id} sent potential URL: {url}")
|
|
|
|
|
|
|
| 552 |
context.user_data['url_to_summarize'] = url
|
| 553 |
logger.debug(f"Stored URL '{url}' in user_data for user {user.id}")
|
| 554 |
|
|
|
|
| 559 |
]
|
| 560 |
]
|
| 561 |
reply_markup = InlineKeyboardMarkup(keyboard)
|
|
|
|
|
|
|
| 562 |
await update.message.reply_text(
|
| 563 |
f"✅ Link received:\n`{url}`\n\nChoose your desired summary format:",
|
| 564 |
reply_markup=reply_markup,
|
| 565 |
parse_mode=ParseMode.MARKDOWN,
|
| 566 |
+
link_preview_options={'is_disabled': True}
|
| 567 |
)
|
| 568 |
else:
|
|
|
|
|
|
|
|
|
|
| 569 |
if not message_text.startswith('/'):
|
| 570 |
await update.message.reply_text("Please send me a valid URL (starting with http:// or https://) to summarize.")
|
| 571 |
|
|
|
|
| 575 |
query = update.callback_query
|
| 576 |
if not query or not query.from_user:
|
| 577 |
logger.warning("Callback query or user missing in update.")
|
| 578 |
+
return
|
| 579 |
user = query.from_user
|
| 580 |
|
|
|
|
| 581 |
try:
|
| 582 |
+
await query.answer()
|
| 583 |
logger.debug(f"Answered callback query {query.id} for user {user.id}")
|
| 584 |
except TimedOut:
|
|
|
|
| 585 |
logger.warning(f"Timeout answering callback query {query.id} for user {user.id}. Processing continues.")
|
| 586 |
except Exception as e:
|
|
|
|
| 587 |
logger.error(f"Error answering callback query {query.id} for user {user.id}: {e!r}", exc_info=True)
|
| 588 |
|
| 589 |
+
summary_type = query.data
|
|
|
|
| 590 |
url = context.user_data.get('url_to_summarize')
|
| 591 |
logger.info(f"User {user.id} chose summary type '{summary_type}'. Checking for stored URL.")
|
| 592 |
|
| 593 |
if not url:
|
| 594 |
logger.warning(f"User {user.id} pressed button '{summary_type}', but NO URL found in user_data context.")
|
| 595 |
try:
|
|
|
|
| 596 |
await query.edit_message_text(text="⚠️ Oops! I lost the context for that link. Please send the link again.")
|
| 597 |
except TimedOut:
|
| 598 |
logger.error(f"Timeout trying to edit message to inform user {user.id} about lost context.")
|
| 599 |
except Exception as edit_err:
|
|
|
|
| 600 |
logger.error(f"Failed to edit message for lost context for user {user.id}: {edit_err}")
|
| 601 |
+
return
|
| 602 |
|
|
|
|
| 603 |
logger.info(f"Processing URL '{url}' for user {user.id} with type '{summary_type}'.")
|
|
|
|
| 604 |
context.user_data.pop('url_to_summarize', None)
|
| 605 |
logger.debug(f"Cleared URL from user_data for user {user.id}")
|
| 606 |
|
|
|
|
| 607 |
current_openrouter_key = os.environ.get('OPENROUTER_API_KEY')
|
| 608 |
current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
|
| 609 |
current_supadata_key = os.environ.get('SUPADATA_API_KEY')
|
| 610 |
current_apify_token = os.environ.get('APIFY_API_TOKEN')
|
|
|
|
| 611 |
keys_present = f"OR={'Y' if current_openrouter_key else 'N'}, UTT={'Y' if current_urltotext_key else 'N'}, SD={'Y' if current_supadata_key else 'N'}, AP={'Y' if current_apify_token else 'N'}"
|
| 612 |
logger.debug(f"API Key check for user {user.id} request: {keys_present}")
|
| 613 |
|
|
|
|
| 614 |
if not current_openrouter_key:
|
| 615 |
logger.error(f"CRITICAL: OpenRouter API key is missing. Cannot generate summary for user {user.id}.")
|
| 616 |
try:
|
|
|
|
| 621 |
logger.error(f"Failed to edit message for missing AI key for user {user.id}: {edit_err}")
|
| 622 |
return
|
| 623 |
|
|
|
|
| 624 |
processing_message_text = f"⏳ Working on your '{summary_type}' summary for the link...\n_(This might take up to a minute depending on the content)_"
|
| 625 |
+
message_to_edit = query.message
|
| 626 |
+
status_message_sent = None
|
| 627 |
|
| 628 |
try:
|
| 629 |
if message_to_edit:
|
| 630 |
await query.edit_message_text(text=processing_message_text)
|
| 631 |
logger.debug(f"Edited original message {message_to_edit.message_id} to show 'Working...' status for query {query.id}")
|
| 632 |
else:
|
|
|
|
| 633 |
logger.warning(f"Original message (query.message) not found for query {query.id}. Cannot edit, will send new status message.")
|
| 634 |
+
raise ValueError("Original message object missing")
|
| 635 |
except (TimedOut, Exception) as e:
|
|
|
|
| 636 |
logger.warning(f"Could not edit original message {message_to_edit.message_id if message_to_edit else 'N/A'} for query {query.id}: {e!r}. Attempting to send a new status message.")
|
| 637 |
+
message_to_edit = None
|
| 638 |
try:
|
| 639 |
status_message_sent = await context.bot.send_message(chat_id=user.id, text=processing_message_text)
|
| 640 |
logger.debug(f"Sent new status message {status_message_sent.message_id} to user {user.id}.")
|
| 641 |
except TimedOut:
|
| 642 |
logger.error(f"Timeout sending NEW 'Working...' status message to user {user.id}. Processing continues without feedback.")
|
|
|
|
| 643 |
except Exception as send_err:
|
| 644 |
logger.error(f"Failed sending NEW 'Working...' status message to user {user.id}: {send_err}. Processing continues without feedback.")
|
|
|
|
| 645 |
|
|
|
|
| 646 |
content = None
|
| 647 |
+
user_feedback_message = None
|
| 648 |
+
success = False
|
| 649 |
|
| 650 |
try:
|
|
|
|
| 651 |
try:
|
| 652 |
logger.debug(f"Sending 'typing' chat action to chat {user.id}")
|
| 653 |
await context.bot.send_chat_action(chat_id=user.id, action='typing')
|
| 654 |
except TimedOut: logger.warning(f"Timeout sending 'typing' action for user {user.id}.")
|
| 655 |
except Exception as ca_err: logger.warning(f"Failed sending 'typing' action for user {user.id}: {ca_err}")
|
| 656 |
|
|
|
|
| 657 |
is_yt = is_youtube_url(url)
|
| 658 |
logger.debug(f"URL ({url}) is YouTube: {is_yt} (User: {user.id})")
|
| 659 |
|
|
|
|
| 671 |
logger.warning(f"Failed to extract YouTube video ID from URL: {url} (User: {user.id})")
|
| 672 |
user_feedback_message = "⚠️ Sorry, I couldn't identify a valid YouTube video ID in the link you provided."
|
| 673 |
else:
|
|
|
|
| 674 |
logger.info(f"Attempting website scrape (Requests/BS4) for URL: {url} (User: {user.id})")
|
| 675 |
content = await get_website_content_via_requests(url)
|
| 676 |
if content:
|
| 677 |
logger.info(f"Website scrape successful (Requests/BS4). Length: {len(content)} (User: {user.id})")
|
|
|
|
| 678 |
else:
|
| 679 |
logger.warning(f"Primary website scrape failed for {url} (User: {user.id}). Trying fallback API.")
|
| 680 |
if current_urltotext_key:
|
|
|
|
| 681 |
try: await context.bot.send_chat_action(chat_id=user.id, action='typing'); logger.debug("Sent typing before fallback scrape.")
|
| 682 |
+
except: pass
|
| 683 |
|
| 684 |
logger.info(f"Attempting website scrape via URLToText API for: {url} (User: {user.id})")
|
| 685 |
content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
|
|
|
|
| 689 |
logger.warning(f"Fallback website scrape (URLToText API) also failed for {url} (User: {user.id}).")
|
| 690 |
user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website using available methods. It might be protected or structured in a way I can't parse."
|
| 691 |
else:
|
|
|
|
| 692 |
logger.warning(f"Primary scrape failed and URLToText API key not configured. Cannot fallback for {url} (User: {user.id}).")
|
| 693 |
user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website, and the fallback service isn't configured."
|
| 694 |
|
|
|
|
| 695 |
if content:
|
| 696 |
logger.info(f"Content fetched (Length: {len(content)}). Generating '{summary_type}' summary for user {user.id}.")
|
|
|
|
| 697 |
try: await context.bot.send_chat_action(chat_id=user.id, action='typing'); logger.debug("Sent typing before AI summary generation.")
|
| 698 |
except: pass
|
| 699 |
|
| 700 |
summary = await generate_summary(content, summary_type, current_openrouter_key)
|
| 701 |
|
|
|
|
| 702 |
if summary.startswith("Error:") or summary.startswith("Sorry,"):
|
| 703 |
logger.warning(f"AI summary generation failed for user {user.id}. Reason: {summary}")
|
| 704 |
+
user_feedback_message = f"⚠️ {summary}"
|
| 705 |
else:
|
|
|
|
| 706 |
logger.info(f"Summary generated successfully for user {user.id}. Length: {len(summary)}. Sending result.")
|
| 707 |
try:
|
| 708 |
await context.bot.send_message(
|
| 709 |
chat_id=user.id,
|
| 710 |
text=summary,
|
| 711 |
+
parse_mode=ParseMode.MARKDOWN,
|
| 712 |
link_preview_options={'is_disabled': True}
|
| 713 |
)
|
| 714 |
success = True
|
| 715 |
+
user_feedback_message = None
|
| 716 |
logger.info(f"Successfully sent summary to user {user.id}.")
|
| 717 |
except TimedOut:
|
| 718 |
logger.error(f"Timeout sending final summary message to user {user.id}.")
|
| 719 |
user_feedback_message = "⚠️ Sorry, there was a timeout while trying to send you the final summary."
|
| 720 |
+
success = False
|
| 721 |
except Exception as send_final_err:
|
| 722 |
logger.error(f"Failed sending final summary to user {user.id}: {send_final_err}", exc_info=True)
|
| 723 |
user_feedback_message = "⚠️ Sorry, an unexpected error occurred while sending the final summary."
|
| 724 |
+
success = False
|
| 725 |
|
| 726 |
elif not user_feedback_message:
|
|
|
|
| 727 |
logger.warning(f"Content retrieval resulted in None, but no specific user feedback message was set. URL: {url} (User: {user.id})")
|
| 728 |
user_feedback_message = "⚠️ Sorry, I couldn't retrieve any usable content from the link provided."
|
| 729 |
|
|
|
|
| 730 |
if user_feedback_message and not success:
|
| 731 |
logger.warning(f"Processing failed or summary sending failed for user {user.id}. Sending feedback: {user_feedback_message}")
|
| 732 |
try:
|
|
|
|
| 737 |
logger.error(f"Failed sending final FAILURE feedback message to user {user.id}: {send_feedback_err}")
|
| 738 |
|
| 739 |
except Exception as e:
|
|
|
|
| 740 |
logger.error(f"Unexpected critical error during callback processing for user {user.id}, URL {url}: {e}", exc_info=True)
|
| 741 |
try:
|
|
|
|
| 742 |
await context.bot.send_message(chat_id=user.id, text="❌ Oops! An unexpected internal error occurred while processing your request. The issue has been logged.")
|
| 743 |
except TimedOut:
|
| 744 |
logger.error(f"Timeout sending CRITICAL internal error feedback message to user {user.id}.")
|
| 745 |
except Exception as final_err:
|
|
|
|
| 746 |
logger.error(f"Failed sending CRITICAL internal error feedback message to user {user.id}: {final_err}")
|
|
|
|
| 747 |
success = False
|
| 748 |
|
| 749 |
finally:
|
|
|
|
| 750 |
logger.debug(f"Cleaning up status message(s) for user {user.id}, query {query.id}. Success={success}")
|
| 751 |
try:
|
| 752 |
if status_message_sent:
|
|
|
|
|
|
|
| 753 |
await context.bot.delete_message(chat_id=user.id, message_id=status_message_sent.message_id)
|
| 754 |
logger.debug(f"Deleted separate status message {status_message_sent.message_id} for user {user.id}.")
|
| 755 |
elif message_to_edit:
|
|
|
|
| 756 |
if success:
|
|
|
|
| 757 |
await query.delete_message()
|
| 758 |
logger.debug(f"Processing succeeded. Deleted original (edited) message {message_to_edit.message_id} for query {query.id}.")
|
| 759 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 760 |
logger.debug(f"Processing failed. Leaving edited message {message_to_edit.message_id} in place for query {query.id}.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 761 |
|
| 762 |
except TimedOut:
|
| 763 |
logger.warning(f"Timeout attempting to delete status/button message for user {user.id}, query {query.id}.")
|
| 764 |
except Exception as del_e:
|
|
|
|
|
|
|
| 765 |
logger.warning(f"Could not delete status/button message for user {user.id}, query {query.id}: {del_e!r}")
|
| 766 |
|
|
|
|
| 767 |
logger.info(f"Finished handling callback query {query.id} for user {user.id}. Overall Success: {success}")
|
| 768 |
|
| 769 |
|
| 770 |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Log the exception raised while an update was being handled.

    The error is always logged at ERROR level together with its traceback.
    Timeouts and other network errors additionally get a short WARNING line.

    Args:
        update: The object that was being processed when the error occurred
            (unused here).
        context: Handler context; ``context.error`` holds the exception.
    """
    exc = context.error
    logger.error(f"Exception while handling an update: {context.error}", exc_info=exc)

    # Keep the timeout check first: in python-telegram-bot TimedOut derives
    # from NetworkError, so testing NetworkError first would shadow it.
    if isinstance(exc, TimedOut):
        logger.warning("A timeout error occurred in PTB communication.")
        return

    if isinstance(exc, NetworkError):
        logger.warning(f"A network error occurred: {context.error}")
|
|
|
|
| 777 |
|
| 778 |
+
# --- Bot Setup Function (Corrected: Removed invalid 'limits' param for PTB v20) ---
|
| 779 |
async def setup_bot_config() -> Application:
|
| 780 |
+
"""Configures the PTB Application with custom HTTPX settings for PTB v20.x."""
|
| 781 |
logger.info("Configuring Telegram Application...")
|
| 782 |
if not TELEGRAM_TOKEN:
|
| 783 |
logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable not found.")
|
| 784 |
raise ValueError("TELEGRAM_TOKEN environment variable not set.")
|
| 785 |
|
| 786 |
+
# --- Configure HTTPX client settings (Timeouts ONLY for PTB v20) ---
|
| 787 |
connect_timeout = 10.0 # Slightly higher connect timeout
|
|
|
|
| 788 |
read_timeout = 30.0 # Increased timeout for reading response
|
| 789 |
write_timeout = 30.0 # Increased timeout for sending request
|
| 790 |
pool_timeout = 30.0 # Increased timeout for getting connection from pool
|
| 791 |
+
# NOTE: PTB v20.x HTTPXRequest does not allow setting pool *size* directly here.
|
| 792 |
+
# It will use the httpx default (usually 10).
|
| 793 |
|
| 794 |
+
logger.info(f"Creating PTB HTTPXRequest (v20 compatible) with settings: "
|
| 795 |
f"connect_timeout={connect_timeout}, read_timeout={read_timeout}, "
|
| 796 |
+
f"write_timeout={write_timeout}, pool_timeout={pool_timeout}. "
|
| 797 |
+
f"(Pool size uses httpx default)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 798 |
|
| 799 |
+
# Create a custom request object with ONLY the supported timeout parameters
|
| 800 |
custom_request = HTTPXRequest(
|
| 801 |
connect_timeout=connect_timeout,
|
| 802 |
read_timeout=read_timeout,
|
| 803 |
write_timeout=write_timeout,
|
| 804 |
pool_timeout=pool_timeout,
|
| 805 |
+
# REMOVED: limits=custom_limits, <<<--- This was the error
|
| 806 |
+
http_version="1.1"
|
| 807 |
)
|
| 808 |
|
| 809 |
# Use Application.builder() and pass the custom request object
|
| 810 |
application_builder = Application.builder().token(TELEGRAM_TOKEN)
|
| 811 |
application_builder.request(custom_request)
|
| 812 |
+
# application_builder.get_updates_request(custom_request) # Apply if using polling
|
|
|
|
|
|
|
|
|
|
| 813 |
|
| 814 |
# Build the application instance
|
| 815 |
application = application_builder.build()
|
|
|
|
| 817 |
# --- Register Handlers ---
|
| 818 |
application.add_handler(CommandHandler("start", start))
|
| 819 |
application.add_handler(CommandHandler("help", help_command))
|
|
|
|
| 820 |
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
|
|
|
|
| 821 |
application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
|
|
|
|
| 822 |
application.add_error_handler(error_handler)
|
| 823 |
|
| 824 |
logger.info("Telegram application handlers configured.")
|
|
|
|
| 830 |
"""Handles PTB startup and shutdown during ASGI lifespan."""
|
| 831 |
global ptb_app
|
| 832 |
logger.info("ASGI Lifespan: Startup sequence initiated...")
|
|
|
|
| 833 |
|
| 834 |
try:
|
|
|
|
| 835 |
ptb_app = await setup_bot_config()
|
| 836 |
logger.info("PTB Application object configured. Initializing...")
|
| 837 |
+
await ptb_app.initialize()
|
| 838 |
logger.info("PTB Application initialized. Starting background tasks (e.g., job queue)...")
|
|
|
|
| 839 |
await ptb_app.start()
|
| 840 |
+
if ptb_app.updater: ptb_app.updater.stop() # Ensure polling is stopped
|
| 841 |
bot_instance = ptb_app.bot
|
| 842 |
bot_info = await bot_instance.get_me()
|
| 843 |
logger.info(f"PTB Application started successfully. Bot ID: {bot_info.id}, Username: @{bot_info.username}")
|
| 844 |
|
|
|
|
|
|
|
| 845 |
WEBHOOK_URL_BASE = os.environ.get("SPACE_HOST")
|
| 846 |
if WEBHOOK_URL_BASE:
|
|
|
|
| 847 |
if not WEBHOOK_URL_BASE.startswith("https://"): WEBHOOK_URL_BASE = f"https://{WEBHOOK_URL_BASE}"
|
| 848 |
+
webhook_path = "/webhook"
|
| 849 |
full_webhook_url = f"{WEBHOOK_URL_BASE.rstrip('/')}{webhook_path}"
|
| 850 |
|
| 851 |
logger.info(f"Attempting to set Telegram webhook to: {full_webhook_url}")
|
|
|
|
| 852 |
await asyncio.sleep(2.0)
|
| 853 |
try:
|
|
|
|
| 854 |
await bot_instance.set_webhook(
|
| 855 |
url=full_webhook_url,
|
| 856 |
+
allowed_updates=Update.ALL_TYPES,
|
|
|
|
|
|
|
| 857 |
)
|
|
|
|
| 858 |
webhook_info = await bot_instance.get_webhook_info()
|
| 859 |
if webhook_info.url == full_webhook_url:
|
| 860 |
logger.info(f"Telegram webhook set successfully! Current info: {webhook_info}")
|
| 861 |
else:
|
| 862 |
logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got: {webhook_info.url}. Info: {webhook_info}")
|
| 863 |
except RetryAfter as e:
|
|
|
|
| 864 |
logger.warning(f"Webhook setting throttled by Telegram (RetryAfter: {e.retry_after}s). Another instance likely succeeded or try again later.")
|
|
|
|
| 865 |
await asyncio.sleep(e.retry_after or 2)
|
| 866 |
webhook_info = await bot_instance.get_webhook_info()
|
| 867 |
logger.info(f"Webhook info after RetryAfter delay: {webhook_info}")
|
|
|
|
| 875 |
|
| 876 |
except Exception as startup_err:
|
| 877 |
logger.critical(f"CRITICAL ERROR during ASGI application startup: {startup_err}", exc_info=True)
|
|
|
|
| 878 |
raise
|
| 879 |
finally:
|
|
|
|
| 880 |
logger.info("ASGI Lifespan: Shutdown sequence initiated...")
|
| 881 |
if ptb_app:
|
| 882 |
bot_username = ptb_app.bot.username if ptb_app.bot else "N/A"
|
| 883 |
logger.info(f"PTB App instance found for @{bot_username}. Checking if running...")
|
|
|
|
|
|
|
| 884 |
is_running = getattr(ptb_app, '_running', False) or getattr(ptb_app, 'running', False)
|
| 885 |
if is_running:
|
| 886 |
try:
|
| 887 |
logger.info("Stopping PTB Application's background tasks...")
|
| 888 |
+
await ptb_app.stop()
|
| 889 |
logger.info("Shutting down PTB Application connections and resources...")
|
| 890 |
+
await ptb_app.shutdown()
|
| 891 |
logger.info("PTB Application shut down gracefully.")
|
| 892 |
except Exception as shutdown_err:
|
| 893 |
logger.error(f"Error during PTB Application shutdown: {shutdown_err}", exc_info=True)
|
| 894 |
else:
|
| 895 |
logger.warning("PTB Application instance exists but was not marked as running at shutdown.")
|
|
|
|
| 896 |
try: await ptb_app.shutdown()
|
| 897 |
except Exception: logger.error("Error during shutdown of non-running PTB app.", exc_info=True)
|
| 898 |
else:
|
|
|
|
| 901 |
|
| 902 |
|
| 903 |
# --- Flask App Setup (for Webhook Route) ---
|
|
|
|
|
|
|
| 904 |
flask_core_app = Flask(__name__)
|
| 905 |
logger.info("Core Flask app instance created (used by Starlette for routing).")
|
| 906 |
|
|
|
|
| 911 |
logger.debug("Health check endpoint '/' accessed.")
|
| 912 |
bot_status = "Unknown / Not Initialized"
|
| 913 |
if ptb_app and ptb_app.bot:
|
|
|
|
| 914 |
is_running = getattr(ptb_app, '_running', False) or getattr(ptb_app, 'running', False)
|
| 915 |
bot_status = f"Running (@{ptb_app.bot.username})" if is_running else f"Initialized/Stopped (@{ptb_app.bot.username})"
|
| 916 |
return f"Telegram Bot Summarizer - Status: {bot_status} - Listening via Starlette/Uvicorn."
|
|
|
|
| 918 |
@flask_core_app.route('/webhook', methods=['POST'])
async def webhook() -> Response:
    """Webhook endpoint called by Telegram.

    Parses the POSTed JSON body into a telegram ``Update`` and hands it to the
    PTB application for processing.

    Returns:
        ``200 ok`` once the update has been processed; ``400`` when the body
        is missing or is not valid JSON; ``503`` when the PTB application is
        absent or not running; ``500`` on any other failure.
    """
    if not ptb_app:
        logger.error("Webhook triggered, but PTB Application instance (ptb_app) is None. Lifespan likely failed.")
        return Response('Bot service is not configured or failed during startup.', status=503)

    # Probe both attribute spellings so the check works whichever one the
    # installed PTB version exposes.
    is_running = getattr(ptb_app, '_running', False) or getattr(ptb_app, 'running', False)
    if not is_running:
        logger.error("Webhook triggered, but PTB Application is not currently running.")
        return Response('Bot service is initialized but not actively running.', status=503)

    logger.debug("Webhook endpoint received POST request from Telegram.")
    try:
        # This route is registered on a Flask app, and Flask's
        # request.get_json() is a plain synchronous call that returns the
        # parsed object. Awaiting its result raises TypeError, which made
        # every update fail with a 500. silent=True yields None for a missing
        # or malformed body so the 400 branch below is actually reachable.
        # NOTE(review): assumes `request` is Flask's request proxy - confirm
        # against the file's imports.
        update_data = request.get_json(silent=True)
        if not update_data:
            logger.warning("Received empty or non-JSON data on webhook.")
            return Response('Bad Request: Expected JSON payload.', status=400)

        update = Update.de_json(update_data, ptb_app.bot)
        logger.debug(f"Processing update_id: {update.update_id} via webhook route.")
        await ptb_app.process_update(update)
        logger.debug(f"Finished processing update_id: {update.update_id}")
        return Response('ok', status=200)

    except json.JSONDecodeError:
        # Kept as a safety net for JSON errors raised further down the stack.
        logger.error("Failed to decode JSON from Telegram webhook request.", exc_info=True)
        return Response('Bad Request: Invalid JSON format.', status=400)
    except Exception as e:
        # Top-level boundary: log with traceback and answer 500 rather than
        # letting the exception propagate out of the route.
        logger.error(f"Error processing update in webhook handler: {e}", exc_info=True)
        return Response('Internal Server Error processing update.', status=500)
|
| 951 |
|
| 952 |
|
| 953 |
# --- Create Starlette ASGI Application ---
|
|
|
|
| 954 |
# Starlette is the ASGI entry point: it runs the `lifespan` handler and
# forwards every HTTP path to the Flask app through the WSGI adapter.
app = Starlette(
    routes=[Mount("/", app=WSGIMiddleware(flask_core_app))],
    lifespan=lifespan,
    debug=False,
)
|
|
|
|
| 962 |
|
| 963 |
|
| 964 |
# --- Development Server Execution Block ---
|
|
|
|
|
|
|
|
|
|
| 965 |
if __name__ == '__main__':
|
| 966 |
logger.warning("=" * 50)
|
| 967 |
logger.warning(" RUNNING SCRIPT DIRECTLY (using __main__) ".center(50, "="))
|
|
|
|
| 977 |
if not TELEGRAM_TOKEN:
|
| 978 |
logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable missing. Aborting direct Flask start.")
|
| 979 |
else:
|
|
|
|
| 980 |
local_port = int(os.environ.get('PORT', 8080))
|
| 981 |
logger.info(f"Starting Flask development server on http://0.0.0.0:{local_port}")
|
|
|
|
|
|
|
| 982 |
flask_core_app.run(host='0.0.0.0', port=local_port, debug=True, use_reloader=False)
|