fmab777 committed
Commit 1d82147 · verified · 1 Parent(s): 549270f

Update main.py

Files changed (1)
  1. main.py +717 -257
main.py CHANGED
@@ -1,4 +1,4 @@
1
- # main.py (Revised: Correct HTTPXRequest init + Starlette Lifespan + Updated Prompts)
2
  import os
3
  import re
4
  import logging
@@ -24,11 +24,11 @@ from telegram.ext import (
24
  CallbackQueryHandler,
25
  )
26
  from telegram.constants import ParseMode
27
- from telegram.error import NetworkError, RetryAfter # Import RetryAfter
28
  from telegram.request import HTTPXRequest # Import the request class
29
 
30
  # --- Other Libraries ---
31
- # import httpx # No longer needed for direct import
32
  from youtube_transcript_api import YouTubeTranscriptApi
33
  import requests
34
  from bs4 import BeautifulSoup
@@ -43,6 +43,7 @@ logging.basicConfig(
43
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
44
  level=logging.DEBUG
45
  )
 
46
  logging.getLogger("httpx").setLevel(logging.WARNING)
47
  if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
48
  logging.getLogger("telegram.ext").setLevel(logging.INFO)
@@ -51,6 +52,7 @@ logging.getLogger("urllib3").setLevel(logging.INFO)
51
  logging.getLogger('gunicorn.error').setLevel(logging.INFO)
52
  logging.getLogger('uvicorn').setLevel(logging.INFO)
53
  logging.getLogger('starlette').setLevel(logging.INFO)
 
54
  logger = logging.getLogger(__name__)
55
  logger.info("Logging configured.")
56
 
@@ -60,7 +62,7 @@ ptb_app: Application | None = None
60
  # --- Environment Variable Loading ---
61
  logger.info("Attempting to load secrets...")
62
  def get_secret(secret_name):
63
- logger.debug(f"Attempting to read secret: {secret_name}")
64
  value = os.environ.get(secret_name)
65
  if value: logger.info(f"Secret '{secret_name}': Found (Value length: {len(value)})")
66
  else: logger.warning(f"Secret '{secret_name}': Not Found")
@@ -79,7 +81,6 @@ logger.info("Secret loading attempt finished.")
79
  # get_transcript_via_supadata, get_transcript_via_apify,
80
  # get_youtube_transcript, get_website_content_via_requests,
81
  # get_website_content_via_urltotext_api, generate_summary)
82
- # Ensure the generate_summary has the updated prompts from previous response
83
 
84
  # Helper Functions
85
  def is_youtube_url(url):
@@ -111,7 +112,8 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
111
  params = {"videoId": video_id, "format": "text"}
112
  headers = {"X-API-Key": api_key}
113
  try:
114
- logger.warning("[Supadata] Making request with verify=False (Attempting to bypass SSL verification)")
 
115
  response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30, verify=False)
116
  logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
117
  if response.status_code == 200:
@@ -253,268 +255,625 @@ async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: st
253
  except Exception as e:
254
  logger.warning(f"[Primary YT] Error via library: {type(e).__name__} - {e}")
255
  if "YouTube is blocking requests" in str(e) or "HTTP Error 429" in str(e): logger.warning("[Primary YT] IP likely blocked.")
256
- elif "No transcript found" in str(e): logger.warning("[Primary YT] No transcript in languages.")
257
- elif "TranscriptsDisabled" in str(e) or "disabled" in str(e): logger.warning("[Primary YT] Transcripts disabled.")
258
- transcript_text = None
259
 
260
  if transcript_text is None: # Fallback 1: Supadata
261
  logger.info("[Fallback YT 1] Trying Supadata API...")
262
  if supadata_key:
263
  transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
264
  if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata. Length: {len(transcript_text)}"); return transcript_text
265
- else: logger.warning("[Fallback YT 1] Supadata failed or no content.")
266
- else: logger.warning("[Fallback YT 1] Supadata key not available.")
267
 
268
  if transcript_text is None: # Fallback 2: Apify
269
  logger.info("[Fallback YT 2] Trying Apify API...")
270
  if apify_token:
271
  transcript_text = await get_transcript_via_apify(video_url, apify_token)
272
  if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify. Length: {len(transcript_text)}"); return transcript_text
273
- else: logger.warning("[Fallback YT 2] Apify failed or no content.")
274
- else: logger.warning("[Fallback YT 2] Apify token not available.")
275
 
276
- if transcript_text is None: logger.error(f"All methods failed for video ID: {video_id}")
277
  return transcript_text
278
 
279
  # Website Content via Requests/BS4
280
  async def get_website_content_via_requests(url):
281
  """Attempts to scrape website content using requests/BeautifulSoup."""
282
- if not url: logger.error("[Web Scraper - Requests/BS4] no URL"); return None
283
- logger.info(f"[Web Scraper - Requests/BS4] Fetching: {url}")
284
  try:
285
- headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8','Accept-Language': 'en-US,en;q=0.9','Connection': 'keep-alive','DNT': '1','Upgrade-Insecure-Requests': '1'}
286
  response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
287
- response.raise_for_status()
288
  logger.debug(f"[Web Scraper - Requests/BS4] Status {response.status_code} for {url}")
 
289
  content_type = response.headers.get('content-type', '').lower()
290
  if 'html' not in content_type:
291
- logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML: {content_type}. Plain text?")
292
- if 'text/plain' in content_type and response.text: return response.text.strip()
293
  return None
 
294
  soup = BeautifulSoup(response.text, 'html.parser')
295
- for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]): element.extract()
296
- main_content = soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(class_='content') or soup.find(id='main-content') or soup.find(class_='main-content') or soup.find(role='main')
297
  target_element = main_content if main_content else soup.body
298
- if not target_element: logger.warning(f"[Web Scraper - Requests/BS4] No body/main content for {url}"); return None
299
  lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
300
- text = "\n".join(lines)
301
- MIN_TEXT_LENGTH = 50
302
- if not text or len(text) < MIN_TEXT_LENGTH: logger.warning(f"[Web Scraper - Requests/BS4] Text too short (<{MIN_TEXT_LENGTH} chars) for {url} (Len: {len(text)})")
303
- logger.info(f"[Web Scraper - Requests/BS4] Success scraping {url} (Len: {len(text)})")
304
  return text
305
- except requests.exceptions.Timeout: logger.error(f"[Web Scraper - Requests/BS4] Timeout: {url}"); return None
306
- except requests.exceptions.TooManyRedirects: logger.error(f"[Web Scraper - Requests/BS4] Redirects: {url}"); return None
307
- except requests.exceptions.RequestException as e: logger.error(f"[Web Scraper - Requests/BS4] Request error {url}: {e}"); return None
308
- except Exception as e: logger.error(f"[Web Scraper - Requests/BS4] Parsing error {url}: {e}", exc_info=True); return None
 
 
309
 
310
  # Website Content via URLToText API
311
  async def get_website_content_via_urltotext_api(url: str, api_key: str):
312
  """Fetches website content using the URLToText API."""
313
- if not url: logger.error("[Web Scraper - URLToText API] no URL"); return None
314
- if not api_key: logger.error("[Web Scraper - URLToText API] API key missing."); return None
315
- logger.info(f"[Web Scraper - URLToText API] Attempting fetch: {url}")
316
  api_endpoint = "https://urltotext.com/api/v1/urltotext/"
317
- payload = json.dumps({"url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False})
318
  headers = {"Authorization": f"Token {api_key}", "Content-Type": "application/json"}
319
  try:
320
- response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=45)
321
- logger.debug(f"[Web Scraper - URLToText API] Status {response.status_code} for {url}")
322
  if response.status_code == 200:
323
  try:
324
  data = response.json()
325
- content = data.get("data", {}).get("content"); credits = data.get("credits_used", "N/A"); warning = data.get("data", {}).get("warning")
326
- if warning: logger.warning(f"[Web Scraper - URLToText API] Warning for {url}: {warning}")
327
- if content: logger.info(f"[Web Scraper - URLToText API] Success via API. Length: {len(content)}. Credits: {credits}"); return content.strip()
328
- else: logger.warning(f"[Web Scraper - URLToText API] API success (200) but content empty. Resp: {data}"); return None
329
- except json.JSONDecodeError: logger.error(f"[Web Scraper - URLToText API] Failed JSON decode. Status: {response.status_code}. Resp: {response.text[:500]}..."); return None
330
- except Exception as e: logger.error(f"[Web Scraper - URLToText API] Error processing API response: {e}", exc_info=True); return None
331
- elif response.status_code == 400: logger.error(f"[Web Scraper - URLToText API] Bad Request (400). Resp: {response.text[:200]}...")
332
- elif response.status_code == 401: logger.error(f"[Web Scraper - URLToText API] Unauthorized (401). Check Key. Resp: {response.text[:200]}...")
333
- elif response.status_code == 402: logger.error(f"[Web Scraper - URLToText API] Payment Required (402). Check credits. Resp: {response.text[:200]}...")
334
- elif response.status_code == 422: logger.warning(f"[Web Scraper - URLToText API] Unprocessable URL (422). Resp: {response.text[:200]}...")
335
- elif response.status_code >= 500: logger.error(f"[Web Scraper - URLToText API] Server Error ({response.status_code}). Resp: {response.text[:200]}...")
336
- else: logger.error(f"[Web Scraper - URLToText API] Unexpected status {response.status_code}. Resp: {response.text[:200]}...")
337
- return None
338
- except requests.exceptions.Timeout: logger.error(f"[Web Scraper - URLToText API] Timeout: {url}"); return None
339
- except requests.exceptions.RequestException as e: logger.error(f"[Web Scraper - URLToText API] Request error: {e}"); return None
340
- except Exception as e: logger.error(f"[Web Scraper - URLToText API] Unexpected error: {e}", exc_info=True); return None
341
 
342
  # DeepSeek Summary Function (with updated prompts)
343
  async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
344
  """Generates summary using DeepSeek via OpenRouter API."""
345
  logger.info(f"Generating '{summary_type}' summary. Input length: {len(text)}")
346
- if not api_key: logger.error("OpenRouter API key missing."); return "Error: AI config key missing."
347
- if not text: logger.warning("generate_summary called with empty text."); return "Error: No content to summarize."
348
- openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"; model_name = "deepseek/deepseek-chat:free"
349
 
350
  # --- UPDATED PROMPTS ---
351
  if summary_type == "paragraph":
352
- prompt = (
353
- "You are an AI model designed to provide concise summaries using British English spellings. "
354
- "Your output should be:\n"
355
- " Clear and simple, use a level of language and vocabulary such that someone who isn’t familiar with the topic mentioned would still be able to understand, keeping British spellings throughout.\n"
356
- " Use straightforward and understandable language. Avoid overly complex or advanced vocabulary.\n"
357
- " Presented in one paragraph.\n"
358
- " A summary that is no more than 85 words; ensure it remains concise.\n"
359
- " Consider the entire text’s content, not just the beginning or a few topics: give equal attention to all parts.\n"
360
- " Do not use em dash (– or —) symbols; use semicolons instead.\n\n"
361
- "Now, please summarize the following text according to these rules:"
 
362
  )
 
 
363
  elif summary_type == "points":
364
- prompt = (
365
- "You are an AI model designed to provide concise summaries using British English spellings.\n"
366
- "Your output should be:\n"
367
- " Clear and simple, use a level of language and vocabulary such that someone who isn’t familiar with the topic mentioned would still be able to understand, keeping British spellings throughout.\n"
368
- " Use straightforward and understandable language. Avoid overly complex or advanced vocabulary.\n"
369
- " Presented as clear, distinct bullet points (using '*' or '-' at the start of each point).\n"
370
- " Each point should highlight a key piece of information, finding, or main topic discussed.\n"
371
- " Consider the entire text’s content, not just the beginning or a few topics: give equal attention to all parts.\n"
372
- " Do not use em dash (– or —) symbols; use semicolons instead.\n\n"
373
- "Now, please summarize the following text into bullet points according to these rules:"
 
 
374
  )
 
375
  else:
376
- logger.error(f"Invalid summary_type '{summary_type}'.");
377
- return f"Error: Invalid summary type ('{summary_type}')."
378
  # --- END UPDATED PROMPTS ---
379
 
380
- MAX_INPUT_LENGTH = 500000 # Keep practical limit
381
- if len(text) > MAX_INPUT_LENGTH: logger.warning(f"Input text ({len(text)}) > limit ({MAX_INPUT_LENGTH}). Truncating."); text = text[:MAX_INPUT_LENGTH] + "... (Truncated)"
382
- full_prompt = f"{prompt}\n\n--- Start of Text ---\n\n{text}\n\n--- End of Text ---"
383
 
384
- space_host = os.environ.get("SPACE_HOST", "huggingface.co/spaces/YOUR_SPACE_NAME") # Replace with actual space if needed
385
- referer_url = f"https://{space_host}" if not space_host.startswith("http") else space_host
386
- headers = {"Authorization": f"Bearer {api_key}","Content-Type": "application/json","HTTP-Referer": referer_url,"X-Title": "Telegram URL Summarizer Bot"}
387
- payload = json.dumps({"model": model_name, "messages": [{"role": "user", "content": full_prompt}]})
388
 
389
  try:
390
- logger.debug(f"Sending request to OpenRouter (Model: {model_name})...")
391
- response = await asyncio.to_thread(requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=90)
 
392
  logger.debug(f"Received status {response.status_code} from OpenRouter.")
 
393
  if response.status_code == 200:
394
  try:
395
  data = response.json()
396
- if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0:
397
- message = data["choices"][0].get("message")
398
- if message and message.get("content"):
399
- summary = message["content"].strip()
400
- if summary: logger.info(f"Success generating summary. Len: {len(summary)}"); return summary
401
- else: logger.warning(f"OpenRouter success but empty content. Resp: {data}"); return "Sorry, AI model returned empty summary."
402
- else: logger.warning(f"OpenRouter success but missing content field. Resp: {data}"); return "Sorry, could not parse AI response (content)."
403
- elif data.get("error"): logger.error(f"OpenRouter API Error: {data['error']}"); return f"Sorry, AI service error: {data['error'].get('message', 'Unknown')}"
404
- else: logger.error(f"Unexpected OpenRouter choices structure. Resp: {data}"); return "Sorry, could not parse AI response (choices)."
405
- except json.JSONDecodeError: logger.error(f"Failed JSON decode from OpenRouter. Status: {response.status_code}. Resp: {response.text[:500]}..."); return "Sorry, failed to understand AI response format."
406
- except Exception as e: logger.error(f"Error processing OpenRouter success resp: {e}", exc_info=True); return "Sorry, error processing AI response."
407
- elif response.status_code == 401: logger.error("OpenRouter key invalid (401)."); return "Error: AI model config key invalid."
408
- elif response.status_code == 402: logger.error("OpenRouter Payment Required (402)."); return "Sorry, issue with AI service limits/payment."
409
- elif response.status_code == 429: logger.warning("OpenRouter Rate Limit (429)."); return "Sorry, AI model busy. Try again."
410
- elif response.status_code >= 500: logger.error(f"OpenRouter Internal Error ({response.status_code}). Resp: {response.text[:500]}..."); return "Sorry, AI model service error. Try again later."
411
  else:
412
- logger.error(f"Unexpected status {response.status_code} from OpenRouter. Resp: {response.text[:500]}...")
413
- try: error_data = response.json(); error_msg = error_data.get("error", {}).get("message", response.text[:100]); return f"Sorry, AI service error ({response.status_code}): {error_msg}"
414
- except: return f"Sorry, AI service returned status {response.status_code}."
415
- except requests.exceptions.Timeout: logger.error("Timeout connecting to OpenRouter."); return "Sorry, request to AI model timed out."
416
- except requests.exceptions.RequestException as e: logger.error(f"Request error connecting to OpenRouter: {e}"); return "Sorry, error connecting to AI model service."
417
- except Exception as e: logger.error(f"Unexpected error in generate_summary: {e}", exc_info=True); return "Sorry, unexpected error generating summary."
418
 
419
 
420
  # --- Telegram Bot Handlers ---
421
 
422
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
423
- user = update.effective_user;
424
- if not user: return
425
- logger.info(f"User {user.id} ({user.username or 'NoUsername'}) /start.")
 
 
426
  mention = user.mention_html() if user.username else user.first_name
427
- await update.message.reply_html(f"👋 Hello {mention}! Send a URL to summarize.")
428
 
429
  async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
430
- user = update.effective_user; logger.info(f"User {user.id if user else '?'} /help.")
431
- help_text = ("**How:**\n1. Send URL.\n2. Choose Paragraph/Points.\n3. Get summary!\n\n"
432
- "**Notes:**\n- YT transcripts can fail.\n- Complex sites hard to scrape.\n- AI errors possible.")
433
  await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
434
 
435
  async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
 
436
  if not update.message or not update.message.text: return
437
- message_text = update.message.text.strip(); user = update.effective_user;
438
- if not user: return
439
- url_pattern = r'https?://[^\s/$.?#].[^\s]*'; match = re.search(url_pattern, message_text)
440
  if match:
441
- url = match.group(0); logger.info(f"User {user.id} sent URL: {url}")
442
- context.user_data['url_to_summarize'] = url; logger.debug(f"Stored URL '{url}' for user {user.id}")
443
- keyboard = [[InlineKeyboardButton("Paragraph", callback_data="paragraph"), InlineKeyboardButton("Points", callback_data="points")]]
444
  reply_markup = InlineKeyboardMarkup(keyboard)
445
- await update.message.reply_text(f"Link:\n{url}\n\nChoose summary type:", reply_markup=reply_markup, link_preview_options={'is_disabled': True})
446
- else: logger.debug(f"Ignoring non-URL from {user.id}: {message_text[:100]}")
447
 
448
  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
449
- query = update.callback_query; user = query.from_user
450
- try: await query.answer(); logger.debug(f"Answered query {query.id}")
451
- except Exception as e: logger.error(f"Failed answer query {query.id}: {e}")
452
- summary_type = query.data; url = context.user_data.get('url_to_summarize')
453
- logger.info(f"User {user.id} chose '{summary_type}'. URL: '{url}'.")
454
  if not url:
455
- logger.warning(f"User {user.id} pressed button, NO URL in context.");
456
- try: await query.edit_message_text(text="Context lost. Send link again.")
457
- except Exception as edit_err: logger.error(f"Failed edit on lost context: {edit_err}")
458
- return
459
- context.user_data.pop('url_to_summarize', None); logger.debug(f"Cleared URL {url} for user {user.id}")
460
- current_openrouter_key = os.environ.get('OPENROUTER_API_KEY'); current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
461
- current_supadata_key = os.environ.get('SUPADATA_API_KEY'); current_apify_token = os.environ.get('APIFY_API_TOKEN')
462
- logger.debug(f"Keys: OR={'Y' if current_openrouter_key else 'N'}, UTT={'Y' if current_urltotext_key else 'N'}, SD={'Y' if current_supadata_key else 'N'}, AP={'Y' if current_apify_token else 'N'}")
463
  if not current_openrouter_key:
464
- logger.error("OpenRouter key missing.");
465
- try: await query.edit_message_text(text="⚠️ AI service config error (key missing).")
466
- except Exception as edit_err: logger.error(f"Failed edit on missing OR key: {edit_err}")
467
  return
468
- processing_message_text = f"Working on '{summary_type}' summary for:\n{url}\n..."; message_to_delete_later_id = None
469
- try: await query.edit_message_text(text=processing_message_text); logger.debug(f"Edited message query {query.id}")
470
- except Exception as e:
471
- logger.warning(f"Could not edit message {query.message.message_id if query.message else 'N/A'}: {e}. Sending new.");
472
- try: status_message = await context.bot.send_message(chat_id=user.id, text=processing_message_text); message_to_delete_later_id = status_message.message_id; logger.debug(f"Sent new status message {message_to_delete_later_id}")
473
- except Exception as send_err: logger.error(f"Failed sending new status message: {send_err}")
474
- content = None; user_feedback_message = None; success = False
475
  try:
476
- logger.debug(f"Sending 'typing' action for chat {user.id}"); await context.bot.send_chat_action(chat_id=user.id, action='typing')
477
- is_yt = is_youtube_url(url); logger.debug(f"URL is YouTube: {is_yt}")
478
  if is_yt:
479
  video_id = extract_youtube_id(url)
480
  if video_id:
481
- logger.info(f"Fetching YT transcript: {video_id}"); content = await get_youtube_transcript(video_id, url, current_supadata_key, current_apify_token)
482
- if not content: user_feedback_message = "Sorry, couldn't get YT transcript (unavailable/private/no captions?)."
483
- logger.info(f"YT transcript fetch done. Found: {bool(content)}")
484
- else: logger.warning(f"Failed YT ID extraction: {url}"); user_feedback_message = "Sorry, couldn't parse YT video ID."
485
  else:
486
- logger.info(f"Scraping website (Requests/BS4): {url}"); content = await get_website_content_via_requests(url)
487
- if content: logger.info("Website scrape (Requests/BS4) OK."); user_feedback_message = None
488
  else:
489
- logger.warning(f"Website scrape failed for {url}. Trying URLToText API.");
490
  if current_urltotext_key:
491
- await context.bot.send_chat_action(chat_id=user.id, action='typing'); content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
492
- if content: logger.info("URLToText API scrape OK."); user_feedback_message = None
493
- else: logger.warning(f"URLToText scrape failed for {url}."); user_feedback_message = "Sorry, couldn't fetch web content (both methods)."
494
- else: logger.warning("URLToText key not configured."); user_feedback_message = "Sorry, couldn't fetch web content (fallback not configured)."
495
  if content:
496
- logger.info("Content found, generating summary."); await context.bot.send_chat_action(chat_id=user.id, action='typing')
497
  summary = await generate_summary(content, summary_type, current_openrouter_key)
498
- if summary.startswith("Error:") or summary.startswith("Sorry,"): user_feedback_message = summary; logger.warning(f"Summary generation failed: {summary}")
499
- else: logger.info("Summary generated OK. Sending."); await context.bot.send_message(chat_id=user.id, text=summary, parse_mode=ParseMode.MARKDOWN, link_preview_options={'is_disabled': True}); success = True; user_feedback_message = None
500
- elif not user_feedback_message: user_feedback_message = "Sorry, couldn't retrieve content from link."
501
- if user_feedback_message and not success: logger.warning(f"Sending failure feedback: {user_feedback_message}"); await context.bot.send_message(chat_id=user.id, text=user_feedback_message)
502
  except Exception as e:
503
- logger.error(f"Unexpected error in callback processing: {e}", exc_info=True);
504
- try: await context.bot.send_message(chat_id=user.id, text="Oops! Internal error processing request.")
505
- except Exception as final_err: logger.error(f"Failed sending final error message: {final_err}")
506
  finally:
507
- logger.debug("Cleaning up status message...");
 
508
  try:
509
- if message_to_delete_later_id: await context.bot.delete_message(chat_id=user.id, message_id=message_to_delete_later_id); logger.debug("Deleted separate status msg.")
510
- elif query.message: await query.delete_message(); logger.debug(f"Deleted original message query {query.id}.")
511
- except Exception as del_e: logger.warning(f"Could not delete status/button message: {del_e}")
512
 
513
  async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
514
  """Log Errors caused by Updates."""
515
  logger.error(f"Exception while handling an update: {context.error}", exc_info=context.error)
516
 
517
- # --- Bot Setup Function (Modified: Configure HTTPX Client) ---
518
  async def setup_bot_config() -> Application:
519
  """Configures the PTB Application with custom HTTPX settings."""
520
  logger.info("Configuring Telegram Application...")
@@ -522,44 +881,58 @@ async def setup_bot_config() -> Application:
522
  logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable not found.")
523
  raise ValueError("TELEGRAM_TOKEN environment variable not set.")
524
 
525
- # --- Configure HTTPX client settings via individual parameters ---
526
- # Increased read/pool timeouts slightly, connect timeout standard.
527
- # No direct way to set keepalive_expiry via HTTPXRequest in PTB v20.
528
- connect_timeout = 5.0
529
- read_timeout = 15.0 # Slightly increased
530
- write_timeout = 15.0 # Matching read timeout
531
- pool_timeout = 15.0 # Slightly increased pool timeout
532
 
533
- logger.info(f"Creating PTB HTTPXRequest with timeouts: "
534
- f"connect={connect_timeout}, read={read_timeout}, "
535
- f"write={write_timeout}, pool={pool_timeout}")
536
 
537
  # Create a custom request object with these settings
538
- # connection_pool_size default is 10, which is usually fine.
539
  custom_request = HTTPXRequest(
540
  connect_timeout=connect_timeout,
541
  read_timeout=read_timeout,
542
- write_timeout=write_timeout, # Added write_timeout
543
  pool_timeout=pool_timeout,
544
- http_version="1.1"
 
545
  )
546
 
547
  # Use Application.builder() and pass the custom request object
548
  application_builder = Application.builder().token(TELEGRAM_TOKEN)
549
  application_builder.request(custom_request)
550
- application_builder.get_updates_request(custom_request) # Apply same settings for getUpdates if used
551
 
552
  # Build the application instance
553
  application = application_builder.build()
554
 
555
- # --- Register Handlers (same as before) ---
556
  application.add_handler(CommandHandler("start", start))
557
  application.add_handler(CommandHandler("help", help_command))
 
558
  application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
 
559
  application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
 
560
  application.add_error_handler(error_handler)
561
 
562
- logger.info("Telegram handlers configured.")
563
  return application
564
 
565
  # --- ASGI Lifespan Context Manager ---
@@ -567,111 +940,198 @@ async def setup_bot_config() -> Application:
567
  async def lifespan(app: Starlette):
568
  """Handles PTB startup and shutdown during ASGI lifespan."""
569
  global ptb_app
570
- logger.info("ASGI Lifespan: Startup commencing...")
571
- loop = asyncio.get_running_loop()
572
 
573
  try:
 
574
  ptb_app = await setup_bot_config()
575
- logger.info("PTB App configured. Initializing...")
576
- await ptb_app.initialize()
577
- logger.info("PTB App initialized. Starting background tasks...")
 
578
  await ptb_app.start()
579
- logger.info(f"PTB App started. Bot details: {ptb_app.bot.username}")
580
 
 
 
581
  WEBHOOK_URL_BASE = os.environ.get("SPACE_HOST")
582
  if WEBHOOK_URL_BASE:
 
583
  if not WEBHOOK_URL_BASE.startswith("https://"): WEBHOOK_URL_BASE = f"https://{WEBHOOK_URL_BASE}"
584
- webhook_path = "/webhook"
585
  full_webhook_url = f"{WEBHOOK_URL_BASE.rstrip('/')}{webhook_path}"
586
- logger.info(f"Attempting to set webhook to: {full_webhook_url}")
587
  try:
588
- await asyncio.sleep(1.5) # Small delay before setting webhook
589
- await ptb_app.bot.set_webhook(url=full_webhook_url, allowed_updates=Update.ALL_TYPES)
590
- webhook_info = await ptb_app.bot.get_webhook_info()
591
- logger.info(f"Webhook set successfully! Info: {webhook_info}")
592
  except RetryAfter as e:
593
- logger.warning(f"Webhook set failed due to rate limit (RetryAfter: {e.retry_after}s). Another worker likely succeeded.")
594
  except Exception as e:
595
- logger.error(f"Failed to set webhook: {e}", exc_info=True)
596
- else: logger.warning("SPACE_HOST env variable not found. Skipping webhook setup.")
 
597
 
598
- logger.info("ASGI Lifespan: Startup complete. Application ready.")
599
- yield # Application runs here
600
 
601
  except Exception as startup_err:
602
- logger.critical(f"CRITICAL ERROR during ASGI startup: {startup_err}", exc_info=True)
 
603
  raise
604
  finally:
605
- logger.info("ASGI Lifespan: Shutdown commencing...")
606
- if ptb_app and ptb_app._running: # Use _running
607
- try:
608
- logger.info("Stopping PTB App...")
609
- await ptb_app.stop()
610
- logger.info("Shutting down PTB App...")
611
- await ptb_app.shutdown()
612
- logger.info("PTB App shut down successfully.")
613
- except Exception as shutdown_err:
614
- logger.error(f"Error during PTB shutdown: {shutdown_err}", exc_info=True)
615
- elif ptb_app: logger.warning("PTB App instance exists but was not running at shutdown.")
616
- else: logger.warning("No PTB App instance found at shutdown.")
617
  logger.info("ASGI Lifespan: Shutdown complete.")
618
 
619
 
620
- # --- Flask App Setup (for Routes) ---
 
 
621
  flask_core_app = Flask(__name__)
622
- logger.info("Core Flask app instance created (for routing via Starlette).")
623
 
624
- # --- Define Flask Routes on flask_core_app ---
625
  @flask_core_app.route('/')
626
  def index():
627
  """Basic health check endpoint."""
628
- logger.debug("Health check '/' accessed.")
629
- bot_status = "UNKNOWN"
630
- if ptb_app: bot_status = "Running" if ptb_app._running else "Initialized/Stopped/Starting/Error" # Use _running
631
- else: bot_status = "Not Initialized"
632
- return f"Telegram Bot Webhook Listener ({bot_status}) running via Starlette."
 
 
633
 
634
  @flask_core_app.route('/webhook', methods=['POST'])
635
  async def webhook() -> Response:
636
- """Webhook endpoint for Telegram updates."""
 
 
637
  if not ptb_app:
638
- logger.error("Webhook triggered, but PTB Application instance (ptb_app) is None.")
639
- return Response('Bot service not configured.', status=503)
640
- if not ptb_app._running: # Use _running
641
- logger.error("Webhook triggered, but PTB Application is Not Running.")
642
- return Response('Bot service not ready.', status=503)
643
-
644
- logger.debug("Webhook request received (POST)...")
645
- if request.is_json:
646
- try:
647
- update_data = request.get_json()
648
- update = Update.de_json(update_data, ptb_app.bot)
649
- logger.debug(f"Processing update ID: {update.update_id} via webhook")
650
- await ptb_app.process_update(update)
651
- logger.debug(f"Finished processing update ID: {update.update_id}")
652
- return Response('ok', status=200)
653
- except json.JSONDecodeError: logger.error("Failed JSON decode from Telegram."); return Response('Bad Request: Invalid JSON', status=400)
654
- except Exception as e: logger.error(f"Error processing update in webhook handler: {e}", exc_info=True); return Response('Internal Server Error processing update.', status=500)
655
- else: logger.warning("Received non-JSON request to webhook."); return Response('Bad Request: Expected JSON', status=400)
656
-
657
-
658
- # --- Create Starlette App with Lifespan & Mount Flask ---
659
  app = Starlette(
660
- lifespan=lifespan,
 
661
  routes=[
 
 
662
  Mount("/", app=WSGIMiddleware(flask_core_app))
663
  ]
664
  )
665
- logger.info("Starlette application created with lifespan and Flask app mounted at '/'.")
666
 
667
 
668
- # --- Main Execution Block (for local testing ONLY) ---
669
  if __name__ == '__main__':
670
- logger.warning("Running Flask development server directly (LOCAL TESTING ONLY).")
671
- logger.warning("NOTE: This mode does NOT initialize PTB via ASGI lifespan.")
672
- logger.warning("Use 'uvicorn main:app --reload --port 8080' for proper local ASGI testing.")
673
- if not TELEGRAM_TOKEN: logger.critical("Aborting local Flask start: TELEGRAM_TOKEN missing.")
674
  else:
 
675
  local_port = int(os.environ.get('PORT', 8080))
676
- logger.info(f"Flask dev server starting on http://0.0.0.0:{local_port}")
677
- flask_core_app.run(host='0.0.0.0', port=local_port, debug=True)
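For reference, the pattern this file uses throughout (a Starlette app whose lifespan owns the python-telegram-bot Application, with the Flask routes mounted via WSGIMiddleware) reduces to a short standalone sketch. This is illustrative only and not part of the commit; the health route, reading the token from the TELEGRAM_TOKEN environment variable, and the omission of webhook handling are assumptions.

import contextlib
import os

from flask import Flask
from starlette.applications import Starlette
from starlette.middleware.wsgi import WSGIMiddleware  # deprecated in newer Starlette; a2wsgi offers an equivalent WSGIMiddleware
from starlette.routing import Mount
from telegram.ext import Application

flask_app = Flask(__name__)

@flask_app.route("/")
def health():
    # Basic health-check route, standing in for the real Flask routes.
    return "OK"

@contextlib.asynccontextmanager
async def lifespan(app: Starlette):
    # Start PTB when the ASGI server starts; shut it down when the server stops.
    ptb = Application.builder().token(os.environ["TELEGRAM_TOKEN"]).build()
    await ptb.initialize()
    await ptb.start()
    try:
        yield
    finally:
        await ptb.stop()
        await ptb.shutdown()

app = Starlette(lifespan=lifespan, routes=[Mount("/", app=WSGIMiddleware(flask_app))])
# Local run (assumption): uvicorn sketch:app --port 8080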
1
+ # main.py (Revised: Increased Pool/Timeouts + Robust Callback Handling)
2
  import os
3
  import re
4
  import logging
 
24
  CallbackQueryHandler,
25
  )
26
  from telegram.constants import ParseMode
27
+ from telegram.error import NetworkError, RetryAfter, TimedOut # Import TimedOut
28
  from telegram.request import HTTPXRequest # Import the request class
29
 
30
  # --- Other Libraries ---
31
+ import httpx # <<<--- ADDED IMPORT for httpx.Limits
32
  from youtube_transcript_api import YouTubeTranscriptApi
33
  import requests
34
  from bs4 import BeautifulSoup
 
43
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
44
  level=logging.DEBUG
45
  )
46
+ # Reduce log spam from libraries
47
  logging.getLogger("httpx").setLevel(logging.WARNING)
48
  if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
49
  logging.getLogger("telegram.ext").setLevel(logging.INFO)
 
52
  logging.getLogger('gunicorn.error').setLevel(logging.INFO)
53
  logging.getLogger('uvicorn').setLevel(logging.INFO)
54
  logging.getLogger('starlette').setLevel(logging.INFO)
55
+ # Keep our app logger at DEBUG
56
  logger = logging.getLogger(__name__)
57
  logger.info("Logging configured.")
58
 
 
62
  # --- Environment Variable Loading ---
63
  logger.info("Attempting to load secrets...")
64
  def get_secret(secret_name):
65
+ # logger.debug(f"Attempting to read secret: {secret_name}") # Optional: Less verbose startup
66
  value = os.environ.get(secret_name)
67
  if value: logger.info(f"Secret '{secret_name}': Found (Value length: {len(value)})")
68
  else: logger.warning(f"Secret '{secret_name}': Not Found")
 
81
  # get_transcript_via_supadata, get_transcript_via_apify,
82
  # get_youtube_transcript, get_website_content_via_requests,
83
  # get_website_content_via_urltotext_api, generate_summary)
 
84
 
85
  # Helper Functions
86
  def is_youtube_url(url):
 
112
  params = {"videoId": video_id, "format": "text"}
113
  headers = {"X-API-Key": api_key}
114
  try:
115
+ # Consider removing verify=False if possible, or manage certificates properly
116
+ logger.warning("[Supadata] Making request with verify=False (Attempting to bypass SSL verification - Potential Security Risk)")
117
  response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30, verify=False)
118
  logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
119
  if response.status_code == 200:
 
255
  except Exception as e:
256
  logger.warning(f"[Primary YT] Error via library: {type(e).__name__} - {e}")
257
  if "YouTube is blocking requests" in str(e) or "HTTP Error 429" in str(e): logger.warning("[Primary YT] IP likely blocked.")
258
+ elif "No transcript found" in str(e): logger.warning("[Primary YT] No transcript in specified languages.")
259
+ elif "TranscriptsDisabled" in str(e) or "disabled" in str(e): logger.warning("[Primary YT] Transcripts disabled for this video.")
260
+ transcript_text = None # Ensure it's None on error
261
 
262
  if transcript_text is None: # Fallback 1: Supadata
263
  logger.info("[Fallback YT 1] Trying Supadata API...")
264
  if supadata_key:
265
  transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
266
  if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata. Length: {len(transcript_text)}"); return transcript_text
267
+ else: logger.warning("[Fallback YT 1] Supadata failed or no content found.")
268
+ else: logger.warning("[Fallback YT 1] Supadata key not available, skipping.")
269
 
270
  if transcript_text is None: # Fallback 2: Apify
271
  logger.info("[Fallback YT 2] Trying Apify API...")
272
  if apify_token:
273
  transcript_text = await get_transcript_via_apify(video_url, apify_token)
274
  if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify. Length: {len(transcript_text)}"); return transcript_text
275
+ else: logger.warning("[Fallback YT 2] Apify failed or no content found.")
276
+ else: logger.warning("[Fallback YT 2] Apify token not available, skipping.")
277
 
278
+ if transcript_text is None: logger.error(f"All methods failed to fetch transcript for video ID: {video_id}")
279
  return transcript_text
280
 
281
  # Website Content via Requests/BS4
282
  async def get_website_content_via_requests(url):
283
  """Attempts to scrape website content using requests/BeautifulSoup."""
284
+ if not url: logger.error("[Web Scraper - Requests/BS4] get_website_content_via_requests called with no URL"); return None
285
+ logger.info(f"[Web Scraper - Requests/BS4] Attempting fetch: {url}")
286
  try:
287
+ # Standard headers, avoid overly aggressive scraping patterns
288
+ headers = {
289
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36', # Updated UA
290
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
291
+ 'Accept-Language': 'en-US,en;q=0.9',
292
+ 'Connection': 'keep-alive',
293
+ 'DNT': '1', # Do Not Track header
294
+ 'Upgrade-Insecure-Requests': '1'
295
+ }
296
  response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
297
+ response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
298
  logger.debug(f"[Web Scraper - Requests/BS4] Status {response.status_code} for {url}")
299
+
300
  content_type = response.headers.get('content-type', '').lower()
301
  if 'html' not in content_type:
302
+ logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received: {content_type}. Attempting plain text extraction.")
303
+ # Allow plain text only if explicitly text/plain
304
+ if 'text/plain' in content_type and response.text:
305
+ logger.info(f"[Web Scraper - Requests/BS4] Extracted plain text content. Length: {len(response.text.strip())}")
306
+ return response.text.strip()
307
+ logger.warning(f"[Web Scraper - Requests/BS4] Content type '{content_type}' not suitable for parsing. Aborting.")
308
  return None
309
+
310
  soup = BeautifulSoup(response.text, 'html.parser')
311
+
312
+ # Remove common non-content tags more aggressively
313
+ tags_to_remove = ["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio", "picture", "source"]
314
+ # Also remove elements often used for ads or menus by class/id
315
+ selectors_to_remove = ['.ad', '#ad', '.ads', '#ads', '.advertisement', '#advertisement', '.banner', '#banner', '.menu', '#menu', '.navigation', '#navigation', '.sidebar', '#sidebar', '.social', '#social', '.share', '#share', '.related', '#related', '.comments', '#comments', '.cookie-consent', '#cookie-consent']
316
+
317
+ for tag in soup(tags_to_remove): tag.decompose()
318
+ for selector in selectors_to_remove:
319
+ for element in soup.select(selector): element.decompose()
320
+
321
+ # Try to find semantic main content areas first
322
+ main_content = soup.find('main') or \
323
+ soup.find('article') or \
324
+ soup.find(id='content') or \
325
+ soup.find(class_='content') or \
326
+ soup.find(id='main-content') or \
327
+ soup.find(class_='main-content') or \
328
+ soup.find(role='main')
329
+
330
  target_element = main_content if main_content else soup.body
331
+ if not target_element:
332
+ logger.warning(f"[Web Scraper - Requests/BS4] Could not find a suitable target element (main, article, body) for {url}");
333
+ return None
334
+
335
+ # Extract text, attempting to preserve paragraphs better
336
  lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
337
+ text = "\n\n".join(lines) # Join lines with double newline for paragraph separation
338
+
339
+ MIN_TEXT_LENGTH = 100 # Increased minimum length
340
+ if not text or len(text) < MIN_TEXT_LENGTH:
341
+ logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is too short (<{MIN_TEXT_LENGTH} chars) after cleaning for {url}. Length: {len(text)}. Content might be JS-rendered or blocked.")
342
+ # Optional: Log the short text for debugging: logger.debug(f"Short text: {text[:500]}")
343
+ return None # Treat very short text as failure
344
+
345
+ logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped and cleaned content from {url}. Final Length: {len(text)}")
346
  return text
347
+
348
+ except requests.exceptions.Timeout: logger.error(f"[Web Scraper - Requests/BS4] Timeout error fetching {url}"); return None
349
+ except requests.exceptions.TooManyRedirects: logger.error(f"[Web Scraper - Requests/BS4] Too many redirects error for {url}"); return None
350
+ except requests.exceptions.HTTPError as e: logger.error(f"[Web Scraper - Requests/BS4] HTTP error {e.response.status_code} for {url}"); return None
351
+ except requests.exceptions.RequestException as e: logger.error(f"[Web Scraper - Requests/BS4] General request error for {url}: {e}"); return None
352
+ except Exception as e: logger.error(f"[Web Scraper - Requests/BS4] Error during parsing or processing {url}: {e}", exc_info=True); return None
353
 
354
  # Website Content via URLToText API
355
  async def get_website_content_via_urltotext_api(url: str, api_key: str):
356
  """Fetches website content using the URLToText API."""
357
+ if not url: logger.error("[Web Scraper - URLToText API] get_website_content_via_urltotext_api called with no URL"); return None
358
+ if not api_key: logger.error("[Web Scraper - URLToText API] API key is missing."); return None
359
+ logger.info(f"[Web Scraper - URLToText API] Attempting fetch via API: {url}")
360
  api_endpoint = "https://urltotext.com/api/v1/urltotext/"
361
+ # Ensure payload includes options beneficial for scraping modern sites
362
+ payload = json.dumps({
363
+ "url": url,
364
+ "output_format": "text",
365
+ "extract_main_content": True, # Try to get just the core article/content
366
+ "render_javascript": True, # Crucial for JS-heavy sites
367
+ "residential_proxy": False, # Set to True if facing blocks, requires appropriate plan
368
+ "timeout_render": 20000, # Increase JS render timeout (in ms)
369
+ })
370
  headers = {"Authorization": f"Token {api_key}", "Content-Type": "application/json"}
371
  try:
372
+ response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=60) # Increased overall timeout
373
+ logger.debug(f"[Web Scraper - URLToText API] Received status code {response.status_code} for {url}")
374
  if response.status_code == 200:
375
  try:
376
  data = response.json()
377
+ content_data = data.get("data", {})
378
+ content = content_data.get("content")
379
+ credits = data.get("credits_used", "N/A")
380
+ warning = content_data.get("warning")
381
+ error_msg = content_data.get("error") # Check for specific error in response data
382
+
383
+ if warning: logger.warning(f"[Web Scraper - URLToText API] API Warning for {url}: {warning}")
384
+ if error_msg: logger.error(f"[Web Scraper - URLToText API] API Error reported for {url}: {error_msg}"); return None # Treat API error as failure
385
+
386
+ if content and isinstance(content, str):
387
+ logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API. Length: {len(content.strip())}. Credits Used: {credits}");
388
+ return content.strip()
389
+ else:
390
+ logger.warning(f"[Web Scraper - URLToText API] API returned status 200 but content is empty or invalid for {url}. Response: {data}");
391
+ return None
392
+ except json.JSONDecodeError: logger.error(f"[Web Scraper - URLToText API] Failed to decode JSON response from API. Status: {response.status_code}. Response Text: {response.text[:500]}..."); return None
393
+ except Exception as e: logger.error(f"[Web Scraper - URLToText API] Error processing successful API response: {e}", exc_info=True); return None
394
+ elif response.status_code == 400: logger.error(f"[Web Scraper - URLToText API] Bad Request (400) to API. Check payload/URL. Response: {response.text[:200]}...")
395
+ elif response.status_code == 401: logger.error(f"[Web Scraper - URLToText API] Unauthorized (401). Check API Key. Response: {response.text[:200]}...")
396
+ elif response.status_code == 402: logger.error(f"[Web Scraper - URLToText API] Payment Required (402). Check API credits/plan. Response: {response.text[:200]}...")
397
+ elif response.status_code == 422: logger.warning(f"[Web Scraper - URLToText API] Unprocessable URL / Fetch Error (422) reported by API for {url}. Response: {response.text[:200]}...") # Might mean the site blocked the API
398
+ elif response.status_code == 429: logger.warning(f"[Web Scraper - URLToText API] Rate Limit Hit (429). Response: {response.text[:200]}...")
399
+ elif response.status_code >= 500: logger.error(f"[Web Scraper - URLToText API] API Server Error ({response.status_code}). Response: {response.text[:200]}...")
400
+ else: logger.error(f"[Web Scraper - URLToText API] Unexpected status code {response.status_code} from API. Response: {response.text[:200]}...")
401
+ return None # Return None for all non-200 responses after logging
402
+ except requests.exceptions.Timeout: logger.error(f"[Web Scraper - URLToText API] Timeout connecting to API for {url}"); return None
403
+ except requests.exceptions.RequestException as e: logger.error(f"[Web Scraper - URLToText API] Request error connecting to API: {e}"); return None
404
+ except Exception as e: logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call: {e}", exc_info=True); return None
405
 
406
  # DeepSeek Summary Function (with updated prompts)
407
  async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
408
  """Generates summary using DeepSeek via OpenRouter API."""
409
  logger.info(f"Generating '{summary_type}' summary. Input length: {len(text)}")
410
+ if not api_key: logger.error("OpenRouter API key missing."); return "Error: AI service configuration key is missing."
411
+ if not text or not text.strip(): logger.warning("generate_summary called with empty or whitespace-only text."); return "Error: No content was provided to summarize."
412
+
413
+ openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
414
+ # Consider using a non-free model if rate limits are hit or quality needed
415
+ model_name = "deepseek/deepseek-chat:free"
416
+ # model_name = "openai/gpt-3.5-turbo" # Example alternative
417
 
418
  # --- UPDATED PROMPTS ---
419
  if summary_type == "paragraph":
420
+ system_message = (
421
+ "You are an expert summarization AI. Your goal is to provide a concise, easy-to-understand summary of the provided text. "
422
+ "Follow these instructions precisely:\n"
423
+ "1. **Language and Spelling:** Use simple British English. Ensure all spellings conform to British English (e.g., 'summarise', 'centre', 'realise').\n"
424
+ "2. **Clarity:** Write clearly so someone unfamiliar with the topic can understand.\n"
425
+ "3. **Format:** Output a single paragraph only.\n"
426
+ "4. **Conciseness:** The summary must be **no more than 85 words** long.\n"
427
+ "5. **Completeness:** Cover the main points from the entire text, not just the start.\n"
428
+ "6. **Punctuation:** Do NOT use em dashes (– or —). Use semicolons (;) if needed for complex sentence structure, but prefer simpler sentences.\n"
429
+ "7. **Tone:** Maintain a neutral and informative tone.\n"
430
+ "8. **Focus:** Extract factual information and key topics. Do not add opinions or information not present in the text."
431
  )
432
+ user_prompt_instruction = "Summarize the following text into a single paragraph adhering strictly to the rules outlined in the system message:"
433
+
434
  elif summary_type == "points":
435
+ system_message = (
436
+ "You are an expert summarization AI. Your goal is to extract the key points from the provided text and present them as a bulleted list. "
437
+ "Follow these instructions precisely:\n"
438
+ "1. **Language and Spelling:** Use simple British English. Ensure all spellings conform to British English (e.g., 'summarise', 'centre', 'realise').\n"
439
+ "2. **Clarity:** Write clearly so someone unfamiliar with the topic can understand.\n"
440
+ "3. **Format:** Output as a bulleted list. Start each point with a standard bullet character ('*' or '-'). Each point should be distinct and on a new line.\n"
441
+ "4. **Content:** Each bullet point should represent a single key finding, main topic, or significant piece of information from the text.\n"
442
+ "5. **Conciseness:** Keep each bullet point brief and to the point.\n"
443
+ "6. **Completeness:** Cover the main points from the entire text, not just the start.\n"
444
+ "7. **Punctuation:** Do NOT use em dashes (– or —) within bullet points.\n"
445
+ "8. **Tone:** Maintain a neutral and informative tone.\n"
446
+ "9. **Focus:** Extract factual information and key topics. Do not add opinions or information not present in the text."
447
  )
448
+ user_prompt_instruction = "Summarize the following text into a bulleted list adhering strictly to the rules outlined in the system message:"
449
  else:
450
+ logger.error(f"Invalid summary_type '{summary_type}' requested.")
451
+ return f"Error: Invalid summary type ('{summary_type}') requested. Please choose 'paragraph' or 'points'."
452
  # --- END UPDATED PROMPTS ---
453
 
454
+ # Practical limit for API context window / cost control
455
+ # Deepseek context might be larger, but set a reasonable app limit
456
+ MAX_INPUT_TOKENS_ESTIMATE = 28000 # Rough estimate for deepseek-chat's context limit (aim lower than max)
457
+ # Simple character length heuristic (adjust based on typical content)
458
+ AVG_CHARS_PER_TOKEN = 4
459
+ MAX_INPUT_LENGTH = MAX_INPUT_TOKENS_ESTIMATE * AVG_CHARS_PER_TOKEN
460
+
461
+ if len(text) > MAX_INPUT_LENGTH:
462
+ logger.warning(f"Input text length ({len(text)} chars) exceeds estimated limit ({MAX_INPUT_LENGTH}). Truncating.")
463
+ truncation_marker = "\n\n[... Text truncated due to length ...]"
464
+ text = text[:MAX_INPUT_LENGTH - len(truncation_marker)] + truncation_marker
465
+
466
+ # Construct the messages payload for the API
467
+ messages = [
468
+ {"role": "system", "content": system_message},
469
+ {"role": "user", "content": f"{user_prompt_instruction}\n\n--- TEXT TO SUMMARIZE ---\n\n{text}\n\n--- END OF TEXT ---"}
470
+ ]
471
 
472
+ # Referer and Title for OpenRouter identification
473
+ space_host = os.environ.get("SPACE_HOST", "huggingface.co/spaces/YOUR_SPACE_NAME") # Replace default if needed
474
+ referer_url = f"https://{space_host}" if space_host and not space_host.startswith("http") else space_host or "https://huggingface.co"
475
+ headers = {
476
+ "Authorization": f"Bearer {api_key}",
477
+ "Content-Type": "application/json",
478
+ "HTTP-Referer": referer_url,
479
+ "X-Title": "Telegram URL Summarizer Bot" # Or your bot's name
480
+ }
481
+ payload = json.dumps({"model": model_name, "messages": messages})
482
 
483
  try:
484
+ logger.debug(f"Sending request to OpenRouter (Model: {model_name}). Prompt length approx: {len(text)} chars.")
485
+ # Increased timeout for potentially long AI generation
486
+ response = await asyncio.to_thread(requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=120)
487
  logger.debug(f"Received status {response.status_code} from OpenRouter.")
488
+
489
  if response.status_code == 200:
490
  try:
491
  data = response.json()
492
+ # Check for response structure variations
493
+ choice = data.get("choices", [{}])[0]
494
+ message = choice.get("message", {})
495
+ summary = message.get("content")
496
+ finish_reason = choice.get("finish_reason")
497
+
498
+ if summary and isinstance(summary, str) and summary.strip():
499
+ summary = summary.strip()
500
+ logger.info(f"Successfully generated summary. Finish Reason: {finish_reason}. Length: {len(summary)}")
501
+ # Optional: Add post-processing checks (e.g., length for paragraph)
502
+ if summary_type == "paragraph" and len(summary.split()) > 95: # Allow slight overrun from 85 words
503
+ logger.warning(f"Generated paragraph summary slightly longer than target word count ({len(summary.split())} words).")
504
+ return summary
505
+ else:
506
+ logger.warning(f"OpenRouter returned status 200 but summary content is missing or empty. Response data: {data}")
507
+ return "Sorry, the AI model returned an empty summary. The content might have been unsuitable."
508
+
509
+ except (json.JSONDecodeError, IndexError, KeyError, AttributeError) as e:
510
+ logger.error(f"Failed to parse successful (200) response from OpenRouter. Error: {e}. Response Text: {response.text[:500]}...", exc_info=True)
511
+ return "Sorry, there was an issue parsing the response from the AI service."
512
+ except Exception as e:
513
+ logger.error(f"Unexpected error processing OpenRouter success response: {e}", exc_info=True)
514
+ return "Sorry, an unexpected error occurred while processing the AI response."
515
+
516
+ # Handle specific HTTP error codes from OpenRouter
517
+ elif response.status_code == 401: logger.error("OpenRouter API key is invalid (Unauthorized - 401)."); return "Error: AI service authentication failed. Please check the configuration."
518
+ elif response.status_code == 402: logger.error("OpenRouter Payment Required (402). Check credits/limits."); return "Sorry, there's an issue with the AI service account limits or payment."
519
+ elif response.status_code == 429: logger.warning("OpenRouter Rate Limit Hit (429)."); return "Sorry, the AI model is currently busy due to high demand. Please try again in a moment."
520
+ elif response.status_code == 400: logger.error(f"OpenRouter Bad Request (400). Likely prompt issue. Response: {response.text[:500]}..."); return "Sorry, the request to the AI service was invalid (possibly due to the content or prompt)."
521
+ elif response.status_code >= 500: logger.error(f"OpenRouter Server Error ({response.status_code}). Response: {response.text[:500]}..."); return "Sorry, the AI service is experiencing internal issues. Please try again later."
522
  else:
523
+ # Handle other unexpected errors
524
+ logger.error(f"Unexpected HTTP status {response.status_code} from OpenRouter. Response: {response.text[:500]}...")
525
+ try: # Try to extract an error message from the response body
526
+ error_data = response.json()
527
+ error_msg = error_data.get("error", {}).get("message", response.text[:100])
528
+ return f"Sorry, the AI service returned an error ({response.status_code}): {error_msg}"
529
+ except json.JSONDecodeError:
530
+ return f"Sorry, the AI service returned an unexpected error (Status: {response.status_code})."
531
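+ # For reference (a sketch, not executed): the parsing above assumes OpenRouter's
+ # OpenAI-compatible chat-completions shape, roughly:
+ #     {"choices": [{"message": {"role": "assistant", "content": "..."},
+ #                   "finish_reason": "stop"}], ...}
+ # Only "choices" -> "message" -> "content" and "finish_reason" are relied upon here.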
+
532
+ except requests.exceptions.Timeout: logger.error("Timeout connecting to OpenRouter API."); return "Sorry, the request to the AI model timed out. Please try again."
533
+ except requests.exceptions.RequestException as e: logger.error(f"Request error connecting to OpenRouter API: {e}"); return "Sorry, there was a network error connecting to the AI model service."
534
+ except Exception as e: logger.error(f"Unexpected error occurred within generate_summary function: {e}", exc_info=True); return "Sorry, an unexpected internal error occurred while generating the summary."
535
 
536
 
537
  # --- Telegram Bot Handlers ---
538
 
539
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
540
+ """Handles the /start command."""
541
+ user = update.effective_user
542
+ if not user: return # Should not happen with a command
543
+ logger.info(f"User {user.id} ({user.username or 'NoUsername'}) initiated /start.")
544
+ # Use mention_html for linking username if available, otherwise just first name
545
  mention = user.mention_html() if user.username else user.first_name
546
+ start_message = (
547
+ f"👋 Hello {mention}!\n\n"
548
+ "I can summarise YouTube videos or web articles for you.\n\n"
549
+ "Just send me a link (URL) and I'll ask you whether you want the summary as a paragraph or bullet points.\n\n"
550
+ "Type /help for more details."
551
+ )
552
+ await update.message.reply_html(start_message)
553
 
554
  async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
555
+ """Handles the /help command."""
556
+ user = update.effective_user
557
+ logger.info(f"User {user.id if user else '?'} requested /help.")
558
+ help_text = (
559
+ "**How to Use Me:**\n"
560
+ "1. Send me a direct link (URL) to a YouTube video or a web article.\n"
561
+ "2. I will ask you to choose the summary format: `Paragraph` or `Points`.\n"
562
+ "3. Click the button for your preferred format.\n"
563
+ "4. I'll fetch the content, summarise it using AI, and send it back to you!\n\n"
564
+ "**Important Notes:**\n"
565
+ "- **YouTube:** Getting transcripts can sometimes fail if they are disabled, unavailable for the video's language, or if YouTube temporarily blocks requests.\n"
566
+ "- **Websites:** I do my best to extract the main article content, but complex websites (especially those heavily reliant on JavaScript or with strong anti-scraping measures) might not work perfectly. I have a fallback service to help with tricky sites.\n"
567
+ "- **AI Summaries:** The AI tries its best to be accurate and follow the requested format, but errors or unexpected outputs are possible.\n"
568
+ "- **Length:** Very long articles or videos might be truncated before summarization to fit within processing limits.\n\n"
569
+ "Just send a link to get started!"
570
+ )
571
+ # Use MarkdownV2 for better formatting control if needed, but MARKDOWN is simpler
572
  await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
573
 
574
  async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
575
+ """Handles messages containing potential URLs."""
576
  if not update.message or not update.message.text: return
577
+ message_text = update.message.text.strip()
578
+ user = update.effective_user
579
+ if not user: return # Should not happen with a message
580
+
581
+ # Reasonably robust http/https URL regex (domain labels, optional path and query string)
582
+ # Still simple, not aiming for perfect RFC 3986 validation
583
+ url_pattern = r'https?://(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}(?:/[^\s]*)?'
584
+ match = re.search(url_pattern, message_text)
585
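+ # Illustrative behaviour of the pattern (comments only, not executed):
+ #     re.search(url_pattern, "see https://example.com/page?a=1")  -> match ("https://example.com/page?a=1")
+ #     re.search(url_pattern, "ftp://example.com")                  -> None (only http/https)
+ #     re.search(url_pattern, "no link here")                       -> None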
+
586
  if match:
587
+ url = match.group(0)
588
+ logger.info(f"User {user.id} sent potential URL: {url}")
589
+
590
+ # Store URL in user_data, associated with the user ID
591
+ context.user_data['url_to_summarize'] = url
592
+ logger.debug(f"Stored URL '{url}' in user_data for user {user.id}")
593
+
594
+ keyboard = [
595
+ [
596
+ InlineKeyboardButton("📜 Paragraph Summary", callback_data="paragraph"),
597
+ InlineKeyboardButton("🔹 Bullet Points", callback_data="points")
598
+ ]
599
+ ]
600
  reply_markup = InlineKeyboardMarkup(keyboard)
601
+
602
+ # Send message asking for summary type
603
+ await update.message.reply_text(
604
+ f"✅ Link received:\n`{url}`\n\nChoose your desired summary format:",
605
+ reply_markup=reply_markup,
606
+ parse_mode=ParseMode.MARKDOWN,
607
+ link_preview_options={'is_disabled': True} # Disable link preview for this message
608
+ )
609
+ else:
610
+ # If it doesn't look like a URL, maybe provide guidance?
611
+ # logger.debug(f"Ignoring non-URL message from {user.id}: {message_text[:100]}")
612
+ # Optional: Reply if it's not a command and not a URL
613
+ if not message_text.startswith('/'):
614
+ await update.message.reply_text("Please send me a valid URL (starting with http:// or https://) to summarize.")
615
+
616
 
617
  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
618
+ """Handles button presses for choosing summary type."""
619
+ query = update.callback_query
620
+ if not query or not query.from_user:
621
+ logger.warning("Callback query or user missing in update.")
622
+ return # Can't proceed without query/user
623
+ user = query.from_user
624
+
625
+ # --- Answer Callback Query Immediately ---
626
+ try:
627
+ await query.answer() # Acknowledge the button press
628
+ logger.debug(f"Answered callback query {query.id} for user {user.id}")
629
+ except TimedOut:
630
+ # Log timeout but proceed; the button loading indicator might just hang for the user
631
+ logger.warning(f"Timeout answering callback query {query.id} for user {user.id}. Processing continues.")
632
+ except Exception as e:
633
+ # Log other errors but proceed cautiously. The button might remain "loading".
634
+ logger.error(f"Error answering callback query {query.id} for user {user.id}: {e!r}", exc_info=True)
635
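+ # Note: the TimedOut handling here (and further down) assumes the exception class
+ # is imported at the top of the file alongside the other telegram errors, i.e.:
+ #     from telegram.error import TimedOut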
+
636
+ summary_type = query.data # 'paragraph' or 'points'
637
+ # Retrieve URL stored earlier for this user
638
+ url = context.user_data.get('url_to_summarize')
639
+ logger.info(f"User {user.id} chose summary type '{summary_type}'. Checking for stored URL.")
640
+
641
  if not url:
642
+ logger.warning(f"User {user.id} pressed button '{summary_type}', but NO URL found in user_data context.")
643
+ try:
644
+ # Inform user context was lost (e.g., bot restarted, long delay)
645
+ await query.edit_message_text(text="⚠️ Oops! I lost the context for that link. Please send the link again.")
646
+ except TimedOut:
647
+ logger.error(f"Timeout trying to edit message to inform user {user.id} about lost context.")
648
+ except Exception as edit_err:
649
+ # Log error if editing fails (message might already be gone, or other Telegram issue)
650
+ logger.error(f"Failed to edit message for lost context for user {user.id}: {edit_err}")
651
+ return # Stop processing if URL is missing
652
+
653
+ # --- URL Found - Proceed with Processing ---
654
+ logger.info(f"Processing URL '{url}' for user {user.id} with type '{summary_type}'.")
655
+ # Clear the URL from context now that we're processing it
656
+ context.user_data.pop('url_to_summarize', None)
657
+ logger.debug(f"Cleared URL from user_data for user {user.id}")
658
+
659
+ # Fetch current API keys (allows for potential runtime changes, though unlikely here)
660
+ current_openrouter_key = os.environ.get('OPENROUTER_API_KEY')
661
+ current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
662
+ current_supadata_key = os.environ.get('SUPADATA_API_KEY')
663
+ current_apify_token = os.environ.get('APIFY_API_TOKEN')
664
+ # Simple check log
665
+ keys_present = f"OR={'Y' if current_openrouter_key else 'N'}, UTT={'Y' if current_urltotext_key else 'N'}, SD={'Y' if current_supadata_key else 'N'}, AP={'Y' if current_apify_token else 'N'}"
666
+ logger.debug(f"API Key check for user {user.id} request: {keys_present}")
667
+
668
+ # Critical dependency check: AI key
669
  if not current_openrouter_key:
670
+ logger.error(f"CRITICAL: OpenRouter API key is missing. Cannot generate summary for user {user.id}.")
671
+ try:
672
+ await query.edit_message_text(text="❌ Configuration Error: The AI summarization service is not configured correctly. Please contact the administrator.")
673
+ except TimedOut:
674
+ logger.error(f"Timeout editing message to inform user {user.id} about missing AI key.")
675
+ except Exception as edit_err:
676
+ logger.error(f"Failed to edit message for missing AI key for user {user.id}: {edit_err}")
677
  return
678
+
679
+ # --- Inform User Processing Has Started ---
680
+ processing_message_text = f"⏳ Working on your '{summary_type}' summary for the link...\n_(This might take up to a minute depending on the content)_"
681
+ message_to_edit = query.message # The message with the buttons
682
+ status_message_sent = None # Will hold msg ID if we send a new status message
683
+
684
+ try:
685
+ if message_to_edit:
686
+ await query.edit_message_text(text=processing_message_text)
687
+ logger.debug(f"Edited original message {message_to_edit.message_id} to show 'Working...' status for query {query.id}")
688
+ else:
689
+ # This case should be rare if query.message exists, but handle defensively
690
+ logger.warning(f"Original message (query.message) not found for query {query.id}. Cannot edit, will send new status message.")
691
+ raise ValueError("Original message object missing") # Force fallback to sending new message
692
+ except Exception as e:  # Exception already covers TimedOut
693
+ # If editing fails (e.g., message too old, deleted, rate limit), try sending a new message
694
+ logger.warning(f"Could not edit original message {message_to_edit.message_id if message_to_edit else 'N/A'} for query {query.id}: {e!r}. Attempting to send a new status message.")
695
+ message_to_edit = None # Ensure we don't try to delete this later if editing failed
696
+ try:
697
+ status_message_sent = await context.bot.send_message(chat_id=user.id, text=processing_message_text)
698
+ logger.debug(f"Sent new status message {status_message_sent.message_id} to user {user.id}.")
699
+ except TimedOut:
700
+ logger.error(f"Timeout sending NEW 'Working...' status message to user {user.id}. Processing continues without feedback.")
701
+ # User won't know bot is working - proceed anyway, hope for the best.
702
+ except Exception as send_err:
703
+ logger.error(f"Failed sending NEW 'Working...' status message to user {user.id}: {send_err}. Processing continues without feedback.")
704
+ # As above.
705
+
706
+ # --- Main Content Fetching and Summarization ---
707
+ content = None
708
+ user_feedback_message = None # Holds error/status messages for the user
709
+ success = False # Tracks if we successfully sent a summary
710
+
711
  try:
712
+ # Send 'typing' action to indicate activity
713
+ try:
714
+ logger.debug(f"Sending 'typing' chat action to chat {user.id}")
715
+ await context.bot.send_chat_action(chat_id=user.id, action='typing')
716
+ except TimedOut: logger.warning(f"Timeout sending 'typing' action for user {user.id}.")
717
+ except Exception as ca_err: logger.warning(f"Failed sending 'typing' action for user {user.id}: {ca_err}")
718
+
719
+ # --- Determine Content Type and Fetch ---
720
+ is_yt = is_youtube_url(url)
721
+ logger.debug(f"URL ({url}) is YouTube: {is_yt} (User: {user.id})")
722
+
723
  if is_yt:
724
  video_id = extract_youtube_id(url)
725
  if video_id:
726
+ logger.info(f"Fetching YouTube transcript for video ID: {video_id} (User: {user.id})")
727
+ content = await get_youtube_transcript(video_id, url, current_supadata_key, current_apify_token)
728
+ if not content:
729
+ logger.warning(f"Failed to get YouTube transcript for {video_id} (User: {user.id}).")
730
+ user_feedback_message = "⚠️ Sorry, I couldn't retrieve the transcript for that YouTube video. It might be unavailable, private, have captions disabled, or an error occurred."
731
+ else:
732
+ logger.info(f"Successfully fetched YouTube transcript for {video_id}. Length: {len(content)} (User: {user.id})")
733
+ else:
734
+ logger.warning(f"Failed to extract YouTube video ID from URL: {url} (User: {user.id})")
735
+ user_feedback_message = "⚠️ Sorry, I couldn't identify a valid YouTube video ID in the link you provided."
736
  else:
737
+ # --- Website Scraping ---
738
+ logger.info(f"Attempting website scrape (Requests/BS4) for URL: {url} (User: {user.id})")
739
+ content = await get_website_content_via_requests(url)
740
+ if content:
741
+ logger.info(f"Website scrape successful (Requests/BS4). Length: {len(content)} (User: {user.id})")
742
+ # Content found, no need for feedback message yet
743
  else:
744
+ logger.warning(f"Primary website scrape failed for {url} (User: {user.id}). Trying fallback API.")
745
  if current_urltotext_key:
746
+ # Send typing again if first scrape failed and we try another method
747
+ try: await context.bot.send_chat_action(chat_id=user.id, action='typing'); logger.debug("Sent typing before fallback scrape.")
748
+ except: pass # Ignore if fails
749
+
750
+ logger.info(f"Attempting website scrape via URLToText API for: {url} (User: {user.id})")
751
+ content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
752
+ if content:
753
+ logger.info(f"Website scrape successful via URLToText API. Length: {len(content)} (User: {user.id})")
754
+ else:
755
+ logger.warning(f"Fallback website scrape (URLToText API) also failed for {url} (User: {user.id}).")
756
+ user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website using available methods. It might be protected or structured in a way I can't parse."
757
+ else:
758
+ # Fallback key missing
759
+ logger.warning(f"Primary scrape failed and URLToText API key not configured. Cannot fallback for {url} (User: {user.id}).")
760
+ user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website, and the fallback service isn't configured."
761
+
762
+ # --- Generate Summary if Content Was Fetched ---
763
  if content:
764
+ logger.info(f"Content fetched (Length: {len(content)}). Generating '{summary_type}' summary for user {user.id}.")
765
+ # Send typing before potentially long AI call
766
+ try: await context.bot.send_chat_action(chat_id=user.id, action='typing'); logger.debug("Sent typing before AI summary generation.")
767
+ except Exception: pass  # ignore failures sending the chat action
768
+
769
  summary = await generate_summary(content, summary_type, current_openrouter_key)
770
+
771
+ # Check if summary generation returned an error message
772
+ if summary.startswith("Error:") or summary.startswith("Sorry,"):
773
+ logger.warning(f"AI summary generation failed for user {user.id}. Reason: {summary}")
774
+ user_feedback_message = f"⚠️ {summary}" # Use the error message from generate_summary
775
+ else:
776
+ # --- Summary Success - Send to User ---
777
+ logger.info(f"Summary generated successfully for user {user.id}. Length: {len(summary)}. Sending result.")
778
+ try:
779
+ await context.bot.send_message(
780
+ chat_id=user.id,
781
+ text=summary,
782
+ parse_mode=ParseMode.MARKDOWN, # Assuming AI generates markdown for points
783
+ link_preview_options={'is_disabled': True}
784
+ )
785
+ success = True
786
+ user_feedback_message = None # Clear any previous fetching error message
787
+ logger.info(f"Successfully sent summary to user {user.id}.")
788
+ except TimedOut:
789
+ logger.error(f"Timeout sending final summary message to user {user.id}.")
790
+ user_feedback_message = "⚠️ Sorry, there was a timeout while trying to send you the final summary."
791
+ success = False # Mark as failed if sending timed out
792
+ except Exception as send_final_err:
793
+ logger.error(f"Failed sending final summary to user {user.id}: {send_final_err}", exc_info=True)
794
+ user_feedback_message = "⚠️ Sorry, an unexpected error occurred while sending the final summary."
795
+ success = False # Mark as failed
796
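+ # Caveat (sketch only): AI output containing unbalanced '*' or '_' can make Telegram
+ # reject the message with a "can't parse entities" BadRequest. One mitigation would
+ # be a plain-text retry, e.g.:
+ #     except BadRequest:
+ #         await context.bot.send_message(chat_id=user.id, text=summary)
+ # (BadRequest lives in telegram.error; it is not wired into this handler.)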
+
797
+ elif not user_feedback_message:
798
+ # If content is None, but no specific error message was set above, set a generic one.
799
+ logger.warning(f"Content retrieval resulted in None, but no specific user feedback message was set. URL: {url} (User: {user.id})")
800
+ user_feedback_message = "⚠️ Sorry, I couldn't retrieve any usable content from the link provided."
801
+
802
+ # --- Send Final Feedback Message if Processing Failed ---
803
+ if user_feedback_message and not success:
804
+ logger.warning(f"Processing failed or summary sending failed for user {user.id}. Sending feedback: {user_feedback_message}")
805
+ try:
806
+ await context.bot.send_message(chat_id=user.id, text=user_feedback_message)
807
+ except TimedOut:
808
+ logger.error(f"Timeout sending final FAILURE feedback message to user {user.id}.")
809
+ except Exception as send_feedback_err:
810
+ logger.error(f"Failed sending final FAILURE feedback message to user {user.id}: {send_feedback_err}")
811
+
812
  except Exception as e:
813
+ # Catch-all for unexpected errors during the main processing block
814
+ logger.error(f"Unexpected critical error during callback processing for user {user.id}, URL {url}: {e}", exc_info=True)
815
+ try:
816
+ # Send a generic error message to the user
817
+ await context.bot.send_message(chat_id=user.id, text="❌ Oops! An unexpected internal error occurred while processing your request. The issue has been logged.")
818
+ except TimedOut:
819
+ logger.error(f"Timeout sending CRITICAL internal error feedback message to user {user.id}.")
820
+ except Exception as final_err:
821
+ # If even sending the error message fails, log it.
822
+ logger.error(f"Failed sending CRITICAL internal error feedback message to user {user.id}: {final_err}")
823
+ # Ensure success is False if we hit this block
824
+ success = False
825
+
826
  finally:
827
+ # --- Clean up Status Message(s) ---
828
+ logger.debug(f"Cleaning up status message(s) for user {user.id}, query {query.id}. Success={success}")
829
  try:
830
+ if status_message_sent:
831
+ # If we sent a separate "Working..." message, delete it regardless of success/failure
832
+ # as the final result or error message has been (or attempted to be) sent.
833
+ await context.bot.delete_message(chat_id=user.id, message_id=status_message_sent.message_id)
834
+ logger.debug(f"Deleted separate status message {status_message_sent.message_id} for user {user.id}.")
835
+ elif message_to_edit:
836
+ # If we edited the original message with the buttons...
837
+ if success:
838
+ # If processing succeeded, delete the "Working..." message.
839
+ await query.delete_message()
840
+ logger.debug(f"Processing succeeded. Deleted original (edited) message {message_to_edit.message_id} for query {query.id}.")
841
+ else:
842
+ # If processing failed, *don't* delete the message.
843
+ # It either still shows "Working..." (if sending final error failed)
844
+ # or it might show an error message if edit_message_text was used for that.
845
+ # Let's try to edit it one last time to show a generic failure if no specific feedback was sent.
846
+ # This is complex, maybe just leave it as is for simplicity.
847
+ logger.debug(f"Processing failed. Leaving edited message {message_to_edit.message_id} in place for query {query.id}.")
848
+ # Optional: Try one last edit to show failure if needed, but might be overkill
849
+ # if not user_feedback_message: # Only if no other error was sent
850
+ # try: await query.edit_message_text("❌ Processing failed.")
851
+ # except: pass # Ignore errors here
852
+
853
+ # If message_to_edit was None (original edit failed) and status_message_sent was None (sending new status failed), there's nothing to delete here.
854
+
855
+ except TimedOut:
856
+ logger.warning(f"Timeout attempting to delete status/button message for user {user.id}, query {query.id}.")
857
+ except Exception as del_e:
858
+ # Log deletion errors as warnings, not critical if cleanup fails.
859
+ # Common error: message already deleted or trying to delete too late.
860
+ logger.warning(f"Could not delete status/button message for user {user.id}, query {query.id}: {del_e!r}")
861
+
862
+ # Log the completion of the callback handling
863
+ logger.info(f"Finished handling callback query {query.id} for user {user.id}. Overall Success: {success}")
864
+
865
 
866
  async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
867
  """Log Errors caused by Updates."""
868
  logger.error(f"Exception while handling an update: {context.error}", exc_info=context.error)
869
+ # Add specific error type handling if needed (e.g., NetworkError, TimedOut)
870
+ if isinstance(context.error, TimedOut):
871
+ logger.warning("A timeout error occurred in PTB communication.")
872
+ elif isinstance(context.error, NetworkError):
873
+ logger.warning(f"A network error occurred: {context.error}")
874
+ # Consider notifying admin or user for specific critical errors if appropriate
875
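+ # One possible admin notification (sketch; ADMIN_CHAT_ID is a hypothetical extra
+ # environment variable, not configured in this file):
+ #     if ADMIN_CHAT_ID:
+ #         await context.bot.send_message(chat_id=ADMIN_CHAT_ID,
+ #                                        text=f"Bot error: {context.error!r}")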
 
876
+ # --- Bot Setup Function (Modified: Increased Pool/Timeouts) ---
877
  async def setup_bot_config() -> Application:
878
  """Configures the PTB Application with custom HTTPX settings."""
879
  logger.info("Configuring Telegram Application...")
 
881
  logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable not found.")
882
  raise ValueError("TELEGRAM_TOKEN environment variable not set.")
883
 
884
+ # --- Configure HTTPX client settings ---
885
+ connect_timeout = 10.0 # Slightly higher connect timeout
886
+ # --- INCREASED TIMEOUTS AND POOL SIZE ---
887
+ read_timeout = 30.0 # Increased timeout for reading response
888
+ write_timeout = 30.0 # Increased timeout for sending request
889
+ pool_timeout = 30.0 # Increased timeout for getting connection from pool
890
+ connection_pool_size = 50 # Significantly increased pool size
891
 
892
+ logger.info(f"Creating PTB HTTPXRequest with settings: "
893
+ f"connect_timeout={connect_timeout}, read_timeout={read_timeout}, "
894
+ f"write_timeout={write_timeout}, pool_timeout={pool_timeout}, "
895
+ f"pool_size={connection_pool_size}")
896
+
897
+ # Note: no explicit httpx.Limits object is needed — PTB's HTTPXRequest builds
+ # its own limits internally from the `connection_pool_size` argument passed
+ # below (httpx itself is not imported in this module).
903
 
904
  # Create a custom request object with these settings
 
905
  custom_request = HTTPXRequest(
906
  connect_timeout=connect_timeout,
907
  read_timeout=read_timeout,
908
+ write_timeout=write_timeout,
909
  pool_timeout=pool_timeout,
910
+ connection_pool_size=connection_pool_size, # pool size is passed directly; HTTPXRequest has no 'limits' kwarg
911
+ http_version="1.1" # HTTP/1.1 is usually fine, HTTP/2 might be slightly faster if supported end-to-end
912
  )
913
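+ # Rough equivalence (sketch, not executed): from these arguments PTB builds an
+ # httpx.AsyncClient approximately like:
+ #     httpx.AsyncClient(
+ #         timeout=httpx.Timeout(connect=10.0, read=30.0, write=30.0, pool=30.0),
+ #         limits=httpx.Limits(max_connections=50, max_keepalive_connections=50),
+ #     )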
 
914
  # Use Application.builder() and pass the custom request object
915
  application_builder = Application.builder().token(TELEGRAM_TOKEN)
916
  application_builder.request(custom_request)
917
+ # Also apply to get_updates if you were using polling (webhook doesn't use this heavily)
918
+ # application_builder.get_updates_request(custom_request)
919
+ # Apply connection pool settings globally if needed (less common now with direct request object)
920
+ # application_builder.pool_timeout(pool_timeout) # This might be redundant if set in HTTPXRequest
921
 
922
  # Build the application instance
923
  application = application_builder.build()
924
 
925
+ # --- Register Handlers ---
926
  application.add_handler(CommandHandler("start", start))
927
  application.add_handler(CommandHandler("help", help_command))
928
+ # Handles non-command text messages that might contain a URL
929
  application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
930
+ # Handles the button clicks ('paragraph' or 'points')
931
  application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
932
+ # Global error handler
933
  application.add_error_handler(error_handler)
934
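+ # Alternative (sketch): Telegram already marks URLs as message entities, so the
+ # text handler above could be narrowed instead of relying purely on the regex, e.g.:
+ #     filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link"))
+ # The simpler regex-based handler is kept as-is here.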
 
935
+ logger.info("Telegram application handlers configured.")
936
  return application
937
 
938
  # --- ASGI Lifespan Context Manager ---
 
940
  async def lifespan(app: Starlette):
941
  """Handles PTB startup and shutdown during ASGI lifespan."""
942
  global ptb_app
943
+ logger.info("ASGI Lifespan: Startup sequence initiated...")
944
+ # loop = asyncio.get_running_loop() # Not usually needed directly
945
 
946
  try:
947
+ # --- Setup and Initialize PTB Application ---
948
  ptb_app = await setup_bot_config()
949
+ logger.info("PTB Application object configured. Initializing...")
950
+ await ptb_app.initialize() # Initialize application components (e.g., bot instance)
951
+ logger.info("PTB Application initialized. Starting background tasks (e.g., job queue)...")
952
+ # Start PTB's internal tasks but not polling (we use webhook)
953
  await ptb_app.start()
954
+ if ptb_app.updater and ptb_app.updater.running: await ptb_app.updater.stop() # Stop polling only if it was somehow started (Updater.stop() is a coroutine)
955
+ bot_instance = ptb_app.bot
956
+ bot_info = await bot_instance.get_me()
957
+ logger.info(f"PTB Application started successfully. Bot ID: {bot_info.id}, Username: @{bot_info.username}")
958
 
959
+ # --- Set Webhook ---
960
+ # Ensure SPACE_HOST is correctly set in Hugging Face Space secrets
961
  WEBHOOK_URL_BASE = os.environ.get("SPACE_HOST")
962
  if WEBHOOK_URL_BASE:
963
+ # Ensure it's a proper HTTPS URL
964
  if not WEBHOOK_URL_BASE.startswith("https://"): WEBHOOK_URL_BASE = f"https://{WEBHOOK_URL_BASE}"
965
+ webhook_path = "/webhook" # Must match the route defined later
966
  full_webhook_url = f"{WEBHOOK_URL_BASE.rstrip('/')}{webhook_path}"
967
+
968
+ logger.info(f"Attempting to set Telegram webhook to: {full_webhook_url}")
969
+ # Short delay can sometimes help prevent race conditions on startup
970
+ await asyncio.sleep(2.0)
971
  try:
972
+ # Set the webhook, specifying allowed updates can reduce load
973
+ await bot_instance.set_webhook(
974
+ url=full_webhook_url,
975
+ allowed_updates=Update.ALL_TYPES, # Or specify like [Update.MESSAGE, Update.CALLBACK_QUERY]
976
+ # secret_token="YOUR_SECRET_TOKEN" # Recommended for security if possible
977
+ # drop_pending_updates=True # Optional: Ignore updates sent while bot was down
978
+ )
979
+ # Verify webhook setup
980
+ webhook_info = await bot_instance.get_webhook_info()
981
+ if webhook_info.url == full_webhook_url:
982
+ logger.info(f"Telegram webhook set successfully! Current info: {webhook_info}")
983
+ else:
984
+ logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got: {webhook_info.url}. Info: {webhook_info}")
985
  except RetryAfter as e:
986
+ # This can happen if multiple workers try to set the webhook simultaneously
987
+ logger.warning(f"Webhook setting throttled by Telegram (RetryAfter: {e.retry_after}s). Another instance likely succeeded or try again later.")
988
+ # Optionally check info again after delay
989
+ await asyncio.sleep(e.retry_after or 2)
990
+ webhook_info = await bot_instance.get_webhook_info()
991
+ logger.info(f"Webhook info after RetryAfter delay: {webhook_info}")
992
  except Exception as e:
993
+ logger.error(f"Failed to set Telegram webhook to {full_webhook_url}: {e}", exc_info=True)
994
+ else:
995
+ logger.warning("SPACE_HOST environment variable not found. Cannot set webhook automatically. Bot will not receive updates via webhook.")
996
 
997
+ logger.info("ASGI Lifespan: Startup complete. Application is ready to yield.")
998
+ yield # --- Application runs here ---
999
 
1000
  except Exception as startup_err:
1001
+ logger.critical(f"CRITICAL ERROR during ASGI application startup: {startup_err}", exc_info=True)
1002
+ # Re-raise the exception to potentially stop the ASGI server from starting improperly
1003
  raise
1004
  finally:
1005
+ # --- Shutdown Sequence ---
1006
+ logger.info("ASGI Lifespan: Shutdown sequence initiated...")
1007
+ if ptb_app:
1008
+ bot_username = ptb_app.bot.username if ptb_app.bot else "N/A"
1009
+ logger.info(f"PTB App instance found for @{bot_username}. Checking if running...")
1010
+ # Prefer the public `running` flag (available on recent PTB Application versions),
+ # falling back to the private `_running` attribute for older releases.
1012
+ is_running = getattr(ptb_app, '_running', False) or getattr(ptb_app, 'running', False)
1013
+ if is_running:
1014
+ try:
1015
+ logger.info("Stopping PTB Application's background tasks...")
1016
+ await ptb_app.stop() # Stop internal tasks like JobQueue
1017
+ logger.info("Shutting down PTB Application connections and resources...")
1018
+ await ptb_app.shutdown() # Clean up resources (e.g., close HTTPX client)
1019
+ logger.info("PTB Application shut down gracefully.")
1020
+ except Exception as shutdown_err:
1021
+ logger.error(f"Error during PTB Application shutdown: {shutdown_err}", exc_info=True)
1022
+ else:
1023
+ logger.warning("PTB Application instance exists but was not marked as running at shutdown.")
1024
+ # Attempt shutdown anyway just in case resources need cleaning
1025
+ try: await ptb_app.shutdown()
1026
+ except Exception: logger.error("Error during shutdown of non-running PTB app.", exc_info=True)
1027
+ else:
1028
+ logger.warning("No PTB Application instance (ptb_app) found during ASGI shutdown.")
1029
  logger.info("ASGI Lifespan: Shutdown complete.")
1030
 
1031
 
1032
+ # --- Flask App Setup (for Webhook Route) ---
1033
+ # We use Flask just for its familiarity in defining the route,
1034
+ # but it runs within Starlette's ASGI context via WSGIMiddleware.
1035
  flask_core_app = Flask(__name__)
1036
+ logger.info("Core Flask app instance created (used by Starlette for routing).")
1037
 
1038
+ # --- Define Flask Routes ---
1039
  @flask_core_app.route('/')
1040
  def index():
1041
  """Basic health check endpoint."""
1042
+ logger.debug("Health check endpoint '/' accessed.")
1043
+ bot_status = "Unknown / Not Initialized"
1044
+ if ptb_app and ptb_app.bot:
1045
+ # Check internal state again (might have changed)
1046
+ is_running = getattr(ptb_app, '_running', False) or getattr(ptb_app, 'running', False)
1047
+ bot_status = f"Running (@{ptb_app.bot.username})" if is_running else f"Initialized/Stopped (@{ptb_app.bot.username})"
1048
+ return f"Telegram Bot Summarizer - Status: {bot_status} - Listening via Starlette/Uvicorn."
1049
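+ # Quick manual check (sketch): `curl -s https://<your-space-host>/` should return
+ # the status string above; <your-space-host> is a placeholder for the deployed host.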
 
1050
  @flask_core_app.route('/webhook', methods=['POST'])
1051
  async def webhook() -> Response:
1052
+ """Webhook endpoint called by Telegram."""
1053
+ global ptb_app # Ensure we're using the global instance initialized by lifespan
1054
+
1055
  if not ptb_app:
1056
+ logger.error("Webhook triggered, but PTB Application instance (ptb_app) is None. Lifespan likely failed.")
1057
+ # Return 503 Service Unavailable
1058
+ return Response('Bot service is not configured or failed during startup.', status=503)
1059
+
1060
+ # Check internal state (safer than assuming ptb_app implies running)
1061
+ is_running = getattr(ptb_app, '_running', False) or getattr(ptb_app, 'running', False)
1062
+ if not is_running:
1063
+ logger.error("Webhook triggered, but PTB Application is not currently running.")
1064
+ # Return 503 Service Unavailable
1065
+ return Response('Bot service is initialized but not actively running.', status=503)
1066
+
1067
+ # Proceed with processing the update
1068
+ logger.debug("Webhook endpoint received POST request from Telegram.")
1069
+ try:
1070
+ # Use Flask's request object to get JSON data
1071
+ update_data = request.get_json(silent=True) # Flask's get_json() is synchronous; returns None on missing/invalid JSON
1072
+ if not update_data:
1073
+ logger.warning("Received empty or non-JSON data on webhook.")
1074
+ return Response('Bad Request: Expected JSON payload.', status=400)
1075
+
1076
+ # Deserialize JSON into a Telegram Update object
1077
+ update = Update.de_json(update_data, ptb_app.bot)
1078
+ logger.debug(f"Processing update_id: {update.update_id} via webhook route.")
1079
+
1080
+ # Process the update using PTB's internal mechanisms
1081
+ # This will dispatch it to the correct handler (CommandHandler, MessageHandler, etc.)
1082
+ await ptb_app.process_update(update)
1083
+
1084
+ logger.debug(f"Finished processing update_id: {update.update_id}")
1085
+ # Return 200 OK to Telegram to acknowledge receipt
1086
+ return Response('ok', status=200)
1087
+
1088
+ except json.JSONDecodeError:
1089
+ logger.error("Failed to decode JSON from Telegram webhook request.", exc_info=True)
1090
+ return Response('Bad Request: Invalid JSON format.', status=400)
1091
+ except Exception as e:
1092
+ # Catch potential errors during Update.de_json or ptb_app.process_update
1093
+ logger.error(f"Error processing update in webhook handler: {e}", exc_info=True)
1094
+ # Return 500 Internal Server Error to Telegram
1095
+ # Telegram will likely retry sending the update later
1096
+ return Response('Internal Server Error processing update.', status=500)
1097
+
1098
+
1099
+ # --- Create Starlette ASGI Application ---
1100
+ # This is the main application object that Uvicorn/Gunicorn will run.
1101
  app = Starlette(
1102
+ debug=False, # Set debug based on environment if needed, but generally False in prod
1103
+ lifespan=lifespan, # Hook into the lifespan context manager for startup/shutdown
1104
  routes=[
1105
+ # Mount the Flask app under the root path. Starlette handles requests
1106
+ # and forwards relevant ones ('/') and ('/webhook') to the Flask app.
1107
  Mount("/", app=WSGIMiddleware(flask_core_app))
1108
  ]
1109
  )
1110
+ logger.info("Starlette ASGI application created, configured with lifespan and Flask app mounted at '/'.")
1111
 
1112
 
1113
+ # --- Development Server Execution Block ---
1114
+ # This block is ONLY for running the Flask app directly for basic testing
1115
+ # WITHOUT the proper ASGI lifespan management (PTB won't start correctly here).
1116
+ # DO NOT use this for deployment. Use `gunicorn main:app` or `uvicorn main:app`.
1117
  if __name__ == '__main__':
1118
+ logger.warning("=" * 50)
1119
+ logger.warning(" RUNNING SCRIPT DIRECTLY (using __main__) ".center(50, "="))
1120
+ logger.warning("=" * 50)
1121
+ logger.warning("This mode starts the Flask development server.")
1122
+ logger.warning("!!! IT DOES **NOT** RUN THE ASGI LIFESPAN !!!")
1123
+ logger.warning("!!! The Telegram Bot (PTB Application) WILL NOT INITIALIZE OR RUN !!!")
1124
+ logger.warning("This is suitable ONLY for verifying Flask routes locally.")
1125
+ logger.warning("For proper testing/deployment, use: uvicorn main:app --reload --port 8080")
1126
+ logger.warning("or via Gunicorn: gunicorn -c gunicorn.conf.py main:app")
1127
+ logger.warning("=" * 50)
1128
+
1129
+ if not TELEGRAM_TOKEN:
1130
+ logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable missing. Aborting direct Flask start.")
1131
  else:
1132
+ # Get port from environment or default to 8080 for local dev
1133
  local_port = int(os.environ.get('PORT', 8080))
1134
+ logger.info(f"Starting Flask development server on http://0.0.0.0:{local_port}")
1135
+ # Run the Flask app directly (no Starlette, no lifespan, no PTB)
1136
+ # use_reloader=False is important if debugging PTB setup elsewhere
1137
+ flask_core_app.run(host='0.0.0.0', port=local_port, debug=True, use_reloader=False)