fmab777 committed on
Commit f55e243 · verified · 1 Parent(s): 9f2782c

Update main.py

Files changed (1):
  1. main.py +333 -298

main.py CHANGED
@@ -1,4 +1,4 @@
1
- # main.py (Revised with Starlette-only routes and fixes)
2
  import os
3
  import re
4
  import logging
@@ -10,11 +10,10 @@ import traceback
10
  from typing import Optional, Dict, Any
11
 
12
  # --- Frameworks ---
13
- # Removed Flask imports
14
  from starlette.applications import Starlette
15
- from starlette.routing import Route # Changed from Mount
16
- from starlette.responses import PlainTextResponse, JSONResponse, Response # Starlette responses
17
- from starlette.requests import Request # Starlette request object
18
 
19
  # --- Telegram Bot ---
20
  from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot
@@ -28,27 +27,37 @@ from telegram.ext import (
28
  )
29
  from telegram.constants import ParseMode
30
  from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest, TelegramError
31
- from telegram.request import HTTPXRequest
32
 
33
  # --- Other Libraries ---
34
  import httpx
35
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
36
- import requests
 
37
  from bs4 import BeautifulSoup
38
  from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, before_sleep_log
39
 
40
  _apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
41
  if _apify_token_exists:
42
  from apify_client import ApifyClient
43
  from apify_client.consts import ActorJobStatus
 
44
  else:
45
- ApifyClient = None # type: ignore # Make type checkers happy
 
46
 
47
 
48
  # --- Logging Setup ---
49
  logging.basicConfig(
50
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
51
- level=logging.INFO # Changed default to INFO, DEBUG is very verbose
52
  )
53
  logging.getLogger("httpx").setLevel(logging.WARNING)
54
  if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
@@ -59,7 +68,7 @@ logging.getLogger('gunicorn.error').setLevel(logging.INFO)
59
  logging.getLogger('uvicorn').setLevel(logging.INFO)
60
  logging.getLogger('starlette').setLevel(logging.INFO)
61
  logger = logging.getLogger(__name__)
62
- logger.info("Logging configured.")
63
 
64
  # --- Global variable for PTB app ---
65
  ptb_app: Optional[Application] = None
@@ -68,8 +77,10 @@ ptb_app: Optional[Application] = None
68
  logger.info("Attempting to load secrets...")
69
  def get_secret(secret_name):
70
  value = os.environ.get(secret_name)
71
- if value: logger.info(f"Secret '{secret_name}': Found (Value length: {len(value)})")
72
- else: logger.warning(f"Secret '{secret_name}': Not Found")
73
  return value
74
 
75
  TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
@@ -77,48 +88,57 @@ OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY')
77
  URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')
78
  SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
79
  APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
80
  logger.info("Secret loading attempt finished.")
 
 
 
 
81
 
82
  # --- Retry Decorator for Bot Operations ---
83
- # Increased attempts slightly and added logging before sleep
84
- # Explicitly catch TelegramError subclasses for retries
85
  @retry(
86
- stop=stop_after_attempt(4), # Increased slightly
87
- wait=wait_exponential(multiplier=1, min=2, max=15), # Adjusted wait
88
- retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)), # Specific Telegram errors
89
- before_sleep=before_sleep_log(logger, logging.WARNING), # Use tenacity logger
90
- reraise=True # Reraise the exception if all retries fail
91
  )
92
  async def retry_bot_operation(func, *args, **kwargs):
93
  """Wrapper to retry bot operations with exponential backoff."""
94
- # Note: This is now a wrapper function, not a decorator factory
95
- # Call it like: await retry_bot_operation(bot.send_message, chat_id=..., text=...)
96
  try:
97
  return await func(*args, **kwargs)
98
  except BadRequest as e:
99
- # Don't retry on bad requests that are likely permanent (e.g., chat not found)
100
- # unless it's a common transient issue like message not modified
101
- if "message is not modified" in str(e).lower():
102
- logger.warning(f"Ignoring 'message not modified' error: {e}")
103
- return None # Or indicate success/no action needed
104
- logger.error(f"BadRequest during bot operation (will not retry unless specific): {e}")
105
- raise # Reraise non-retryable BadRequests
106
  except TelegramError as e:
107
- logger.warning(f"TelegramError during bot operation (will retry): {e}")
108
- raise # Reraise retryable errors for tenacity
109
  except Exception as e:
110
  logger.error(f"Unexpected error during bot operation: {e}", exc_info=True)
111
- raise # Reraise unexpected errors
 
112
 
113
  # --- Helper Functions ---
114
  def is_youtube_url(url):
115
  """Checks if the URL is a valid YouTube video or shorts URL."""
116
- # More robust regex to handle various youtube domain variations and query params
117
  youtube_regex = re.compile(
118
  r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/'
119
  r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?'
120
- r'([\w-]{11})' # Group 1: Video ID
121
- r'(?:\S+)?', # Optional non-whitespace characters after ID
122
  re.IGNORECASE)
123
  match = youtube_regex.search(url)
124
  logger.debug(f"is_youtube_url check for '{url}': {'Match found' if match else 'No match'}")
@@ -126,11 +146,10 @@ def is_youtube_url(url):
126
 
127
  def extract_youtube_id(url):
128
  """Extracts the YouTube video ID from a URL."""
129
- # Use the same robust regex as is_youtube_url
130
  youtube_regex = re.compile(
131
  r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/'
132
  r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?'
133
- r'([\w-]{11})' # Group 1: Video ID
134
  r'(?:\S+)?',
135
  re.IGNORECASE)
136
  match = youtube_regex.search(url)
@@ -148,23 +167,45 @@ def extract_youtube_id(url):
148
  async def fetch_url_content(url: str, timeout: int = 20) -> Optional[str]:
149
  """Fetches content from a URL using httpx asynchronously."""
150
  headers = {
151
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
152
  }
153
  try:
154
- async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
155
  response = await client.get(url)
156
- response.raise_for_status() # Raise an exception for bad status codes
157
- # Detect encoding, fallback to UTF-8
158
- response.encoding = response.apparent_encoding or 'utf-8'
159
- return response.text
 
160
  except httpx.HTTPStatusError as e:
161
  logger.error(f"HTTP error fetching {url}: {e.response.status_code} - {e}")
 
 
 
 
 
162
  except httpx.RequestError as e:
163
  logger.error(f"Request error fetching {url}: {e}")
164
  except Exception as e:
165
  logger.error(f"Unexpected error fetching {url}: {e}", exc_info=True)
166
  return None
167
 
 
168
  async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
169
  """Fetches YouTube transcript using Supadata API."""
170
  if not api_key: return None
@@ -172,6 +213,8 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
172
  headers = {'X-API-Key': api_key, 'Accept': 'application/json'}
173
  logger.info(f"Attempting transcript fetch via Supadata for {video_id}")
174
  try:
175
  async with httpx.AsyncClient(timeout=30.0) as client:
176
  response = await client.get(api_url, headers=headers)
177
  response.raise_for_status()
@@ -183,6 +226,12 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
183
  else:
184
  logger.warning(f"Supadata response format unexpected or empty for {video_id}: {data}")
185
  return None
186
  except httpx.HTTPStatusError as e:
187
  logger.error(f"Supadata API HTTP error for {video_id}: {e.response.status_code} - {e}")
188
  except Exception as e:
@@ -191,39 +240,68 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
191
 
192
  async def get_transcript_via_apify(video_id: str, api_token: str) -> Optional[str]:
193
  """Fetches YouTube transcript using Apify YouTube Scraper Actor."""
 
194
  if not ApifyClient or not api_token: return None
195
- logger.info(f"Attempting transcript fetch via Apify for {video_id}")
196
  try:
197
  client = ApifyClient(api_token)
198
- actor_call = client.actor("clockworks/youtube-scraper").call(
199
- run_input={
200
- "startUrls": [f"https://www.youtube.com/watch?v={video_id}"],
201
- "maxResultStreams": 0,
202
- "maxResults": 0,
203
- "maxResultCommentStreams": 0,
204
- "proxyConfiguration": {"useApifyProxy": True},
 
 
 
 
 
 
 
205
  "subtitles": True, # Explicitly request subtitles/transcript
206
- "extendOutputFunction": 'async ({ data, item, page, request, customData, Apify }) => { return item; }',
207
- }
208
  )
209
 
210
- # Wait for the actor run to complete and fetch results
211
- dataset_items = client.dataset(actor_call["defaultDatasetId"]).list_items().items
212
- if dataset_items:
213
- # Look for transcript data within the results
214
- for item in dataset_items:
215
- if 'subtitles' in item and isinstance(item['subtitles'], list) and len(item['subtitles']) > 0:
216
- # Combine transcript lines, assuming standard format
217
- transcript = " ".join(line.get('text', '') for line in item['subtitles'][0].get('lines', []))
218
- if transcript.strip():
219
- logger.info(f"Apify transcript fetched successfully for {video_id} (length: {len(transcript)})")
220
- return transcript.strip()
221
- logger.warning(f"Apify run completed for {video_id}, but no transcript found in results.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  else:
223
- logger.warning(f"Apify run completed for {video_id}, but dataset was empty.")
224
225
  except Exception as e:
226
- logger.error(f"Error fetching transcript via Apify for {video_id}: {e}", exc_info=True)
227
  return None
228
 
229
 
@@ -239,7 +317,6 @@ async def get_youtube_transcript(video_id: str, url: str, supadata_key: Optional
239
  # 2. Try youtube-transcript-api (Direct method)
240
  logger.info(f"Attempting transcript fetch via youtube-transcript-api for {video_id}")
241
  try:
242
- # Run in executor to avoid blocking async loop
243
  transcript_list = await asyncio.to_thread(YouTubeTranscriptApi.get_transcript, video_id)
244
  transcript = " ".join([item['text'] for item in transcript_list])
245
  logger.info(f"youtube-transcript-api transcript fetched successfully for {video_id} (length: {len(transcript)})")
@@ -265,14 +342,16 @@ async def get_website_content_via_requests(url: str) -> Optional[str]:
265
  return None
266
 
267
  try:
268
- # Run BeautifulSoup parsing in executor thread
269
  def parse_html(content):
270
- soup = BeautifulSoup(content, 'html.parser')
271
- # Remove script and style elements
272
- for script_or_style in soup(["script", "style", "nav", "footer", "aside"]):
273
  script_or_style.decompose()
274
- # Get text, strip whitespace, join lines
275
- text = soup.get_text(separator='\n', strip=True)
276
  lines = (line.strip() for line in text.splitlines())
277
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
278
  text = '\n'.join(chunk for chunk in chunks if chunk)
@@ -280,7 +359,7 @@ async def get_website_content_via_requests(url: str) -> Optional[str]:
280
 
281
  text_content = await asyncio.to_thread(parse_html, html_content)
282
 
283
- if text_content and len(text_content) > 100: # Basic check for meaningful content
284
  logger.info(f"Successfully scraped content via requests/BeautifulSoup for {url} (length: {len(text_content)})")
285
  return text_content
286
  else:
@@ -295,11 +374,11 @@ async def get_website_content_via_urltotext_api(url: str, api_key: str) -> Optio
295
  if not api_key: return None
296
  api_endpoint = "https://api.urltotext.ai/text"
297
  headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
298
- payload = {"url": url, "text_only": True} # Requesting only text content
299
  logger.info(f"Attempting website content fetch via UrlToText API for: {url}")
300
 
301
  try:
302
- async with httpx.AsyncClient(timeout=45.0) as client: # Increased timeout
303
  response = await client.post(api_endpoint, headers=headers, json=payload)
304
  response.raise_for_status()
305
  data = response.json()
@@ -310,6 +389,9 @@ async def get_website_content_via_urltotext_api(url: str, api_key: str) -> Optio
310
  else:
311
  logger.warning(f"UrlToText API response did not contain text for {url}. Response: {data}")
312
  return None
313
  except httpx.HTTPStatusError as e:
314
  logger.error(f"UrlToText API HTTP error for {url}: {e.response.status_code} - {e}")
315
  except Exception as e:
@@ -319,25 +401,22 @@ async def get_website_content_via_urltotext_api(url: str, api_key: str) -> Optio
319
  # --- Summarization Function ---
320
  async def generate_summary(content: str, summary_type: str, api_key: Optional[str]) -> str:
321
  """Generates a summary using OpenRouter API."""
 
322
  if not api_key:
323
  return "Error: OpenRouter API key is not configured."
324
  if not content:
325
  return "Error: No content provided to summarize."
326
 
327
- # Basic check for content length
328
  if len(content) < 50:
329
  return "The provided content is too short to summarize effectively."
330
 
331
- # Truncate content if too long (adjust limit as needed based on model context window)
332
- # This limit depends heavily on the chosen model. Claude Sonnet 3.5 has 200k tokens.
333
- # Let's aim for roughly 100k characters as a safe-ish limit for many models.
334
  max_chars = 100000
335
  if len(content) > max_chars:
336
  logger.warning(f"Content length ({len(content)}) exceeds max_chars ({max_chars}), truncating.")
337
  content = content[:max_chars]
338
 
339
  prompt_template = """
340
- Please summarize the following text.
341
  Provide the summary in {format_style} format.
342
 
343
  Text to summarize:
@@ -348,26 +427,26 @@ Text to summarize:
348
  Summary ({format_style}):
349
  """
350
  format_style = "a concise paragraph" if summary_type == "paragraph" else "bullet points (using * or - for each point)"
351
-
352
  prompt = prompt_template.format(text=content, format_style=format_style)
353
 
354
- # Recommended model: claude-3.5-sonnet-20240620, but allow others
355
- model_to_use = os.environ.get("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet") # Default to Claude 3.5 Sonnet
356
-
357
- logger.info(f"Sending request to OpenRouter (Model: {model_to_use}) for {summary_type} summary.")
358
 
359
  try:
360
- async with httpx.AsyncClient(timeout=120.0) as client: # Increased timeout for generation
361
  response = await client.post(
362
  url="https://openrouter.ai/api/v1/chat/completions",
363
  headers={
364
  "Authorization": f"Bearer {api_key}",
365
- "Content-Type": "application/json"
 
 
 
366
  },
367
  json={
368
- "model": model_to_use,
369
  "messages": [{"role": "user", "content": prompt}],
370
- "max_tokens": 1024, # Adjust as needed
 
371
  },
372
  )
373
  response.raise_for_status()
@@ -377,22 +456,25 @@ Summary ({format_style}):
377
  summary = data["choices"][0].get("message", {}).get("content", "").strip()
378
  if summary:
379
  logger.info(f"Summary generated successfully (length: {len(summary)})")
380
- # Simple Markdown sanitization (escape potential conflicts)
381
  summary = summary.replace('_', r'\_').replace('*', r'\*').replace('[', r'\[').replace('`', r'\`')
382
  return summary
383
  else:
384
  logger.error("OpenRouter response successful, but summary content is empty.")
385
  return "Sorry, the AI generated an empty summary. Please try again."
386
  else:
387
- logger.error(f"OpenRouter response format unexpected: {data}")
388
- return "Sorry, I received an unexpected response from the summarization service."
 
 
389
 
390
  except httpx.HTTPStatusError as e:
391
  error_body = ""
392
- try:
393
- error_body = e.response.text
394
- except Exception:
395
- pass # Ignore if reading body fails
396
  logger.error(f"OpenRouter API HTTP error: {e.response.status_code} - {e}. Response body: {error_body}")
397
  return f"Sorry, there was an error communicating with the summarization service (HTTP {e.response.status_code})."
398
  except Exception as e:
@@ -404,7 +486,7 @@ Summary ({format_style}):
404
  async def process_summary_task(
405
  user_id: int,
406
  chat_id: int,
407
- message_id_to_edit: Optional[int], # Can be None if original message couldn't be edited
408
  url: str,
409
  summary_type: str,
410
  bot_token: str
@@ -413,94 +495,82 @@ async def process_summary_task(
413
  task_id = f"{user_id}-{message_id_to_edit or 'new'}"
414
  logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
415
 
416
- # Create a new bot instance for this task
417
- # Use longer timeouts for background tasks which might involve heavy network I/O
418
- custom_request = HTTPXRequest(
419
- connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0, http_version="1.1"
420
- )
421
- bot = Bot(token=bot_token, request=custom_request)
422
 
423
  content = None
424
  user_feedback_message = None
425
  success = False
426
  final_summary = ""
427
- status_message_id = message_id_to_edit # Start assuming we can edit the original
428
 
429
  try:
430
- # --- Inform User Processing Has Started (Edit original or send new) ---
431
  processing_message_text = f"⏳ Working on your '{summary_type}' summary for:\n`{url}`\n\n_(Fetching & summarizing...)_"
432
-
433
  if status_message_id:
434
  try:
435
  await retry_bot_operation(
436
- bot.edit_message_text,
437
- chat_id=chat_id,
438
- message_id=status_message_id,
439
- text=processing_message_text,
440
- parse_mode=ParseMode.MARKDOWN,
441
- reply_markup=None # Remove buttons
442
  )
443
  logger.debug(f"[Task {task_id}] Successfully edited message {status_message_id} to 'Processing'")
444
  except Exception as e:
445
  logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Will send a new status message.")
446
- status_message_id = None # Flag to send a new message
447
-
448
- if not status_message_id: # If editing failed or wasn't possible
449
  try:
450
  status_message = await retry_bot_operation(
451
- bot.send_message,
452
- chat_id=chat_id,
453
- text=processing_message_text,
454
- parse_mode=ParseMode.MARKDOWN
455
  )
456
- status_message_id = status_message.message_id
457
- logger.debug(f"[Task {task_id}] Sent new status message {status_message_id}")
 
 
 
 
458
  except Exception as e:
459
  logger.error(f"[Task {task_id}] Failed to send new status message: {e}")
460
- # Cannot proceed without informing user
461
  raise RuntimeError("Failed to send initial status message") from e
462
 
463
  # --- Main Content Fetching and Summarization ---
464
  try:
465
  await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
466
-
467
- # --- Determine Content Type and Fetch ---
468
  is_yt = is_youtube_url(url)
469
  logger.debug(f"[Task {task_id}] URL is YouTube: {is_yt}")
470
-
471
  if is_yt:
472
  video_id = extract_youtube_id(url)
473
  if video_id:
474
  logger.info(f"[Task {task_id}] Fetching YouTube transcript for {video_id}")
475
  content = await get_youtube_transcript(video_id, url, SUPADATA_API_KEY, APIFY_API_TOKEN)
476
- if not content:
477
- user_feedback_message = "⚠️ Sorry, I couldn't retrieve the transcript for that YouTube video. It might be disabled or unavailable."
478
- else:
479
- user_feedback_message = "⚠️ Couldn't extract a valid YouTube video ID from the link."
480
  else:
481
  logger.info(f"[Task {task_id}] Attempting website scrape for: {url}")
482
  content = await get_website_content_via_requests(url)
483
  if not content and URLTOTEXT_API_KEY:
484
- logger.info(f"[Task {task_id}] Basic scrape failed or insufficient, trying UrlToText API...")
485
  await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
486
  content = await get_website_content_via_urltotext_api(url, URLTOTEXT_API_KEY)
 
487
 
488
- if not content:
489
- user_feedback_message = "⚠️ Sorry, I couldn't fetch or extract meaningful content from that website."
490
-
491
- # --- Generate Summary if Content Was Fetched ---
492
  if content:
493
  logger.info(f"[Task {task_id}] Content fetched (length: {len(content)}). Generating '{summary_type}' summary.")
494
  await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
495
-
496
  final_summary = await generate_summary(content, summary_type, OPENROUTER_API_KEY)
497
-
498
  if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
499
- user_feedback_message = f"⚠️ {final_summary}" # Use the error from generate_summary
500
- else:
501
- success = True # Summary generated successfully
502
-
503
- # If content fetching failed initially, user_feedback_message is already set
504
 
505
  except Exception as e:
506
  logger.error(f"[Task {task_id}] Error during content fetching or summarization: {e}", exc_info=True)
@@ -508,86 +578,75 @@ async def process_summary_task(
508
 
509
  # --- Send Final Result or Error ---
510
  if success and final_summary:
511
- # Split summary if it's too long for one message
512
  max_length = 4096
513
  summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
514
-
515
- # Send the first part (or the only part)
516
  await retry_bot_operation(
517
- bot.send_message,
518
- chat_id=chat_id,
519
- text=summary_parts[0],
520
- parse_mode=ParseMode.MARKDOWN,
521
- link_preview_options={'is_disabled': True} # Changed to dict format for PTB v21+
522
  )
523
- # Send subsequent parts if any
524
  for part in summary_parts[1:]:
525
- await asyncio.sleep(0.5) # Small delay between parts
526
  await retry_bot_operation(
527
- bot.send_message,
528
- chat_id=chat_id,
529
- text=part,
530
- parse_mode=ParseMode.MARKDOWN,
531
- link_preview_options={'is_disabled': True}
532
  )
533
  logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
534
-
535
- elif user_feedback_message: # Handle errors (either from fetching or summarization)
536
  logger.warning(f"[Task {task_id}] Sending feedback/error message: {user_feedback_message}")
537
  await retry_bot_operation(
538
- bot.send_message,
539
- chat_id=chat_id,
540
- text=user_feedback_message,
541
- link_preview_options={'is_disabled': True}
542
  )
543
-
544
- else: # Should not happen, but safety net
545
  logger.error(f"[Task {task_id}] Reached end of task without success or specific error message.")
546
  await retry_bot_operation(
547
- bot.send_message,
548
- chat_id=chat_id,
549
- text="❓ Something went wrong, but no specific error was identified.",
550
  link_preview_options={'is_disabled': True}
551
  )
552
 
553
-
554
  except Exception as e:
555
  logger.critical(f"[Task {task_id}] Critical error within task processing: {e}", exc_info=True)
556
  try:
557
- await retry_bot_operation(
558
- bot.send_message,
559
- chat_id=chat_id,
560
- text="❌ A critical internal error occurred. Please report this if it persists."
561
- )
562
  except Exception:
563
  logger.exception(f"[Task {task_id}] Failed even to send critical error message.")
564
  finally:
565
  # --- Clean up Status Message ---
566
- if status_message_id:
567
  try:
568
  await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=status_message_id)
569
  logger.debug(f"[Task {task_id}] Deleted status message {status_message_id}")
570
  except Exception as e:
 
571
  logger.warning(f"[Task {task_id}] Failed to delete status message {status_message_id}: {e}")
572
 
573
- # Ensure bot session is closed
574
- if bot and bot.session:
 
575
  try:
576
- await bot.shutdown() # Use bot.shutdown() which includes closing the session
577
- logger.debug(f"[Task {task_id}] Background bot instance shut down.")
578
  except Exception as e:
579
- logger.warning(f"[Task {task_id}] Error shutting down background bot instance: {e}")
 
 
580
 
581
  logger.info(f"[Task {task_id}] Task completed. Success: {success}")
582
 
583
 
584
- # --- Telegram Bot Handlers ---
585
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
586
  """Handles the /start command."""
587
  user = update.effective_user
588
  if not user or not update.message: return
589
  logger.info(f"User {user.id} initiated /start.")
590
- mention = user.mention_html() # mention_html is safer
591
  start_message = (
592
  f"👋 Hello {mention}!\n\n"
593
  "I can summarise YouTube videos or web articles for you.\n\n"
@@ -608,13 +667,12 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
608
  "3. Click the button for your preferred format.\n"
609
  "4. I'll fetch the content, summarise it using AI, and send it back to you!\n\n"
610
  "**Important Notes:**\n"
611
- "- **YouTube:** Getting transcripts can sometimes fail if they are disabled or unavailable.\n"
612
- "- **Websites:** I try my best, but complex or JavaScript-heavy sites might not work perfectly.\n"
613
- "- **AI Summaries:** The AI aims for accuracy but may occasionally miss nuances or make errors.\n"
614
- "- **Length Limits:** Very long videos or articles might be truncated before summarization.\n\n"
615
  "Just send a link to get started!"
616
- )
617
- # Use reply_markdown_v2 for safer Markdown parsing if needed, but MARKDOWN should suffice here
618
  await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
619
 
620
  async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
@@ -624,23 +682,21 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
624
  user = update.effective_user
625
  if not user: return
626
 
627
- # Improved URL regex (simplified for matching, not validation)
628
  url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
629
  match = re.search(url_pattern, message_text)
630
 
631
  if match:
632
  url = match.group(0)
633
- # Basic cleanup: remove trailing characters like periods or commas if they likely aren't part of URL
634
- url = re.sub(r'[.,!?)\]>]+$', '', url)
635
  logger.info(f"User {user.id} sent potential URL: {url}")
636
 
637
- # Store URL and the original message ID for potential editing
638
  context.user_data['url_to_summarize'] = url
639
- context.user_data['original_message_id'] = update.message.message_id # Store for potential edit
640
 
641
  keyboard = [
642
  [
643
- InlineKeyboardButton("📜 Paragraph", callback_data="paragraph"), # Shortened labels
644
  InlineKeyboardButton("🔹 Bullet Points", callback_data="points")
645
  ]
646
  ]
@@ -652,39 +708,30 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
652
  link_preview_options={'is_disabled': True}
653
  )
654
  elif not message_text.startswith('/'):
655
- # Avoid replying to non-URL, non-command text to prevent noise
656
  logger.debug(f"User {user.id} sent non-URL, non-command text: '{message_text[:50]}...'")
657
- # Optional: Send a hint if it looks like they *tried* to send a URL
658
- if "http" in message_text or "www." in message_text:
659
  await update.message.reply_text("Hmm, that looks like it might be a link, but please ensure it starts with `http://` or `https://` and is a valid URL.")
660
- # else: do nothing
661
 
662
  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
663
  """Handles button presses for summary type selection."""
664
  query = update.callback_query
665
  if not query or not query.message or not query.from_user:
666
  logger.warning("Callback query received without essential data.")
667
- if query: await query.answer() # Try to answer anyway
668
  return
669
 
670
  user = query.from_user
671
  summary_type = query.data
672
  query_id = query.id
673
 
674
- # --- Acknowledge button press ---
675
- # This is where the original error occurred. Trying this early.
676
  try:
677
  await query.answer()
678
  logger.debug(f"Acknowledged callback query {query_id} from user {user.id}")
679
  except Exception as e:
680
- # Log the error but try to continue, as acknowledging isn't strictly critical
681
  logger.error(f"Error answering callback query {query_id} from user {user.id}: {e}", exc_info=True)
682
- # The original NetworkError might still happen here in problematic environments.
683
- # If it persists, it points to deeper issues with httpx/anyio/uvloop/PTB interaction.
684
 
685
- # --- Retrieve URL and Original Message ID ---
686
  url = context.user_data.get('url_to_summarize')
687
- # Use the message ID from the callback query's message - this IS the message with buttons
688
  message_id_to_edit = query.message.message_id
689
 
690
  logger.info(f"User {user.id} chose summary type '{summary_type}' for URL associated with message {message_id_to_edit}")
@@ -692,14 +739,14 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
692
  if not url:
693
  logger.warning(f"No URL found in user_data for user {user.id} (callback query {query_id}). Editing message.")
694
  try:
 
695
  await query.edit_message_text(text="⚠️ Oops! I couldn't find the link associated with this request. Please send the link again.")
696
  except Exception as e:
697
  logger.error(f"Failed to edit message to show 'URL not found' error: {e}")
698
  return
699
 
700
- # --- Clear user_data and schedule task ---
701
  context.user_data.pop('url_to_summarize', None)
702
- context.user_data.pop('original_message_id', None) # Clear stored ID
703
 
704
  if not TELEGRAM_TOKEN:
705
  logger.critical("TELEGRAM_TOKEN is missing, cannot start background task!")
@@ -713,27 +760,34 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
713
  process_summary_task(
714
  user_id=user.id,
715
  chat_id=query.message.chat_id,
716
- message_id_to_edit=message_id_to_edit, # Pass the ID of the message with buttons
717
  url=url,
718
  summary_type=summary_type,
719
- bot_token=TELEGRAM_TOKEN # Pass token
720
  ),
721
  name=f"SummaryTask-{user.id}-{message_id_to_edit}"
722
  )
723
 
724
 
725
  async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
726
- """Log Errors caused by Updates or background tasks."""
 
 
 
 
 
727
  logger.error("Exception while handling an update:", exc_info=context.error)
728
 
729
- # Optionally, inform the user about backend errors if appropriate
730
- # Be careful not to spam users if the error is persistent
731
  # if isinstance(context.error, (NetworkError, TimedOut)):
732
- # # Potentially inform user about temporary issues
733
- # pass
734
- # elif isinstance(context.error, TelegramError):
735
- # # Handle specific Telegram API errors if needed
736
- # pass
737
 
738
 
739
  # --- Bot Setup Function ---
@@ -743,43 +797,31 @@ async def setup_bot_config() -> Application:
743
  if not TELEGRAM_TOKEN:
744
  raise ValueError("TELEGRAM_TOKEN environment variable not set.")
745
 
746
- # Configure httpx client for PTB (timeouts are important)
747
  custom_request = HTTPXRequest(
748
- connect_timeout=10.0,
749
- read_timeout=30.0, # Read timeout for API calls like getMe, getUpdates etc.
750
- write_timeout=30.0,
751
- pool_timeout=60.0, # Allow keeping connections open longer
752
- http_version="1.1" # Stick to 1.1 for wider compatibility
753
  )
754
 
755
  application = (
756
  Application.builder()
757
  .token(TELEGRAM_TOKEN)
758
  .request(custom_request)
759
- # Consider connection_pool_size if expecting high concurrency
760
- # .connection_pool_size(10)
761
  .build()
762
  )
763
 
764
- # Add handlers
765
  application.add_handler(CommandHandler("start", start))
766
  application.add_handler(CommandHandler("help", help_command))
767
- # Use TEXT filter combined with URL entity/filter for better targeting?
768
- # For simplicity, keeping the regex check inside the handler for now.
769
  application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
770
  application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
771
-
772
- # Add error handler
773
  application.add_error_handler(error_handler)
774
 
775
  logger.info("Telegram application handlers configured.")
776
  return application
777
 
778
- # --- ASGI Lifespan Context Manager (Mostly Unchanged) ---
779
  @contextlib.asynccontextmanager
780
  async def lifespan(app: Starlette):
781
  """Handles PTB startup and shutdown during ASGI lifespan."""
782
- global ptb_app
783
  logger.info("ASGI Lifespan: Startup sequence initiated...")
784
 
785
  if not TELEGRAM_TOKEN:
@@ -789,83 +831,76 @@ async def lifespan(app: Starlette):
789
  bot_info_text = "Bot info not available yet."
790
  try:
791
  ptb_app = await setup_bot_config()
792
- await ptb_app.initialize() # Initialize application resources
793
-
794
  bot_info = await ptb_app.bot.get_me()
795
  bot_info_text = f"@{bot_info.username} (ID: {bot_info.id})"
796
  logger.info(f"Bot initialized: {bot_info_text}")
797
 
798
- # Check for existing webhook and potentially delete it before setting a new one
799
  current_webhook_info = await ptb_app.bot.get_webhook_info()
800
  if current_webhook_info and current_webhook_info.url:
801
  logger.info(f"Found existing webhook: {current_webhook_info.url}. Attempting to delete it.")
802
  try:
803
- await ptb_app.bot.delete_webhook(drop_pending_updates=True)
804
- logger.info("Existing webhook deleted successfully.")
 
 
 
805
  except Exception as e:
806
  logger.warning(f"Could not delete existing webhook: {e}")
807
- await asyncio.sleep(1) # Short pause before setting new one
808
 
809
-
810
- # Determine Webhook URL from Hugging Face environment variables
811
  space_host = os.environ.get("SPACE_HOST")
812
- webhook_path = "/webhook" # Keep it simple
813
  full_webhook_url = None
814
-
815
  if space_host:
816
- # Ensure it starts with https://
817
- if not space_host.startswith("https://"):
818
- # HF usually provides full host like 'user-repo.hf.space'
819
- # Prepend https:// if missing
820
- if not space_host.startswith("http"):
821
- full_webhook_url = f"https://{space_host.rstrip('/')}{webhook_path}"
822
- else: # If it starts with http://, replace it (HF uses https)
823
- full_webhook_url = f"https://{space_host.split('://')[1].rstrip('/')}{webhook_path}"
824
 
825
  if full_webhook_url:
826
  logger.info(f"Attempting to set webhook to: {full_webhook_url}")
827
- await asyncio.sleep(2.0) # Give network time to settle
 
828
  try:
829
- await ptb_app.bot.set_webhook(
830
- url=full_webhook_url,
831
- allowed_updates=Update.ALL_TYPES, # Or specify types like ['message', 'callback_query']
832
- drop_pending_updates=True,
833
- secret_token=os.environ.get("WEBHOOK_SECRET") # Optional: Add a secret token for security
834
- )
835
  webhook_info = await ptb_app.bot.get_webhook_info()
836
- logger.info(f"Webhook successfully set: URL='{webhook_info.url}', Pending={webhook_info.pending_update_count}")
837
- # Start PTB's internal runners *after* setting the webhook
 
838
  await ptb_app.start()
839
  logger.info("PTB Application started (webhook mode). Ready for updates.")
840
  except Exception as e:
841
  logger.error(f"FATAL: Failed to set webhook to {full_webhook_url}: {e}", exc_info=True)
842
- # Decide how to handle failure: maybe try polling or just exit?
843
- # For HF Spaces webhook is preferred. Raising error might be best.
844
  raise RuntimeError(f"Failed to set webhook: {e}") from e
845
  else:
846
- logger.warning("Could not construct valid HTTPS webhook URL from SPACE_HOST.")
847
- # Fallback or error needed here? For HF, webhook is essential.
848
  raise RuntimeError("Webhook URL could not be determined.")
849
  else:
850
- logger.warning("SPACE_HOST environment variable not found. Cannot set webhook.")
851
- # Maybe fall back to polling for local dev? Not ideal for HF.
852
- # logger.info("Starting PTB in polling mode (no SPACE_HOST found).")
853
- # await ptb_app.start() # Start runners
854
- # application.run_polling(drop_pending_updates=True) # Start polling - BLOCKING! Not suitable for ASGI app.
855
  raise RuntimeError("SPACE_HOST env var missing, cannot run in webhook mode.")
856
 
857
-
858
  logger.info("ASGI Lifespan: Startup complete.")
859
  yield # Application runs here
860
 
861
  except Exception as startup_err:
862
  logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
863
- # Ensure cleanup happens even if startup fails partially
864
- if ptb_app and ptb_app.running:
865
- await ptb_app.stop()
866
  if ptb_app:
 
867
  await ptb_app.shutdown()
868
- raise # Reraise the error to stop the ASGI server
869
  finally:
870
  logger.info("ASGI Lifespan: Shutdown sequence initiated...")
871
  if ptb_app:
@@ -873,7 +908,7 @@ async def lifespan(app: Starlette):
873
  logger.info("Stopping PTB application...")
874
  await ptb_app.stop()
875
  logger.info("Shutting down PTB application...")
876
- await ptb_app.shutdown()
877
  logger.info("PTB Application shut down gracefully.")
878
  else:
879
  logger.info("PTB application was not initialized or startup failed.")
@@ -886,55 +921,54 @@ async def health_check(request: Request) -> PlainTextResponse:
886
  bot_status = "Not Initialized"
887
  if ptb_app and ptb_app.bot:
888
  try:
889
- # Perform a lightweight check like get_me again, or use a flag
890
  if ptb_app.running:
891
- bot_info = await ptb_app.bot.get_me() # Cached usually
 
892
  bot_status = f"Running (@{bot_info.username})"
893
  else:
894
  bot_status = "Initialized but not running"
895
  except Exception as e:
896
  bot_status = f"Error checking status: {e}"
 
897
 
898
- return PlainTextResponse(f"Telegram Bot Summarizer - Status: {bot_status}")
899
 
900
  async def telegram_webhook(request: Request) -> Response:
901
  """Webhook endpoint called by Telegram."""
 
902
  if not ptb_app:
903
  logger.error("Webhook received but PTB application not initialized.")
904
  return PlainTextResponse('Bot not initialized', status_code=503)
905
  if not ptb_app.running:
906
  logger.warning("Webhook received but PTB application not running.")
907
- # Maybe return 200 OK anyway so Telegram doesn't retry too much?
908
- # Or 503 Service Unavailable
909
  return PlainTextResponse('Bot initialized but not running', status_code=503)
910
 
911
  try:
912
- # Check secret token if configured
913
- secret_token = os.environ.get("WEBHOOK_SECRET")
914
- if secret_token:
915
- if request.headers.get("X-Telegram-Bot-Api-Secret-Token") != secret_token:
916
- logger.warning("Webhook received with invalid secret token.")
917
- return Response(status_code=403) # Forbidden
918
-
919
- # Process the update
920
  update_data = await request.json()
921
  update = Update.de_json(data=update_data, bot=ptb_app.bot)
922
- logger.debug(f"Processing update_id: {update.update_id}")
 
923
  await ptb_app.process_update(update)
924
- return Response(status_code=200) # OK
 
925
 
926
  except json.JSONDecodeError:
927
  logger.error("Webhook received invalid JSON.")
928
  return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
929
  except Exception as e:
930
  logger.error(f"Error processing webhook update: {e}", exc_info=True)
931
- # Return 200 OK even on errors to prevent Telegram retries if the error is in processing logic
932
- # Alternatively return 500 if it's a server-side issue. 200 is often safer for webhooks.
933
- return Response(status_code=200) # OK despite error
934
 
935
  # --- Create Starlette ASGI Application ---
936
  app = Starlette(
937
- debug=False, # Set debug based on environment if needed
938
  lifespan=lifespan,
939
  routes=[
940
  Route("/", endpoint=health_check, methods=["GET"]),
@@ -944,10 +978,11 @@ app = Starlette(
944
  logger.info("Starlette ASGI application created with native routes.")
945
 
946
  # --- Development Server Execution Block (Optional) ---
947
- # This block is generally NOT used when deploying with Gunicorn/Uvicorn
948
- # Keep it for potential local testing without gunicorn
949
  if __name__ == '__main__':
950
  import uvicorn
951
  logger.warning("Running in development mode using Uvicorn directly (not for production)")
952
- local_port = int(os.environ.get('PORT', 8080)) # Use PORT if set, else 8080
953
- uvicorn.run(app, host='0.0.0.0', port=local_port, log_level="info")
 
 
 
 
1
+ # main.py (Applying fixes for apparent_encoding, bot cleanup, and Apify actor name)
2
  import os
3
  import re
4
  import logging
 
10
  from typing import Optional, Dict, Any
11
 
12
  # --- Frameworks ---
 
13
  from starlette.applications import Starlette
14
+ from starlette.routing import Route
15
+ from starlette.responses import PlainTextResponse, JSONResponse, Response
16
+ from starlette.requests import Request
17
 
18
  # --- Telegram Bot ---
19
  from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot
 
27
  )
28
  from telegram.constants import ParseMode
29
  from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest, TelegramError
30
+ from telegram.request import HTTPXRequest, BaseRequest # Import BaseRequest for type hinting
31
 
32
  # --- Other Libraries ---
33
  import httpx
34
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
35
+ # Make requests optional if only used for sync fallback (currently not used)
36
+ # import requests
37
  from bs4 import BeautifulSoup
38
  from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, before_sleep_log
39
+ # Optional: Import lxml if installed (usually faster parsing)
40
+ try:
41
+ import lxml
42
+ DEFAULT_PARSER = 'lxml'
43
+ except ImportError:
44
+ DEFAULT_PARSER = 'html.parser'
45
+
46
 
47
  _apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
48
  if _apify_token_exists:
49
  from apify_client import ApifyClient
50
  from apify_client.consts import ActorJobStatus
51
+ from apify_client.errors import ApifyApiError # Import specific error
52
  else:
53
+ ApifyClient = None # type: ignore
54
+ ApifyApiError = None # type: ignore
55
 
56
 
57
  # --- Logging Setup ---
58
  logging.basicConfig(
59
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
60
+ level=logging.INFO
61
  )
62
  logging.getLogger("httpx").setLevel(logging.WARNING)
63
  if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
 
68
  logging.getLogger('uvicorn').setLevel(logging.INFO)
69
  logging.getLogger('starlette').setLevel(logging.INFO)
70
  logger = logging.getLogger(__name__)
71
+ logger.info(f"Logging configured. Using BS4 parser: {DEFAULT_PARSER}")
72
 
73
  # --- Global variable for PTB app ---
74
  ptb_app: Optional[Application] = None
 
77
  logger.info("Attempting to load secrets...")
78
  def get_secret(secret_name):
79
  value = os.environ.get(secret_name)
80
+ # Avoid logging full length of very long secrets like Supabase keys
81
+ log_length = min(len(value), 8) if value else 0
82
+ status = "Found" if value else "Not Found"
83
+ logger.info(f"Secret '{secret_name}': {status} (Value starts with: {value[:log_length]}...)")
84
  return value
85
 
86
  TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
 
88
  URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')
89
  SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
90
  APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
91
+ WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET') # Added for webhook security
92
+ OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet")
93
+ APIFY_ACTOR_NAME = os.environ.get("APIFY_ACTOR_NAME", "pocesar/youtube-scraper") # Use env var or default
94
  logger.info("Secret loading attempt finished.")
95
+ logger.info(f"Using OpenRouter Model: {OPENROUTER_MODEL}")
96
+ if _apify_token_exists:
97
+ logger.info(f"Using Apify Actor: {APIFY_ACTOR_NAME}")
98
+
99
 
100
  # --- Retry Decorator for Bot Operations ---
101
  @retry(
102
+ stop=stop_after_attempt(4),
103
+ wait=wait_exponential(multiplier=1, min=2, max=15),
104
+ retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)),
105
+ before_sleep=before_sleep_log(logger, logging.WARNING),
106
+ reraise=True
107
  )
108
  async def retry_bot_operation(func, *args, **kwargs):
109
  """Wrapper to retry bot operations with exponential backoff."""
 
 
110
  try:
111
  return await func(*args, **kwargs)
112
  except BadRequest as e:
113
+ # Added specific check for common, non-fatal BadRequests
114
+ ignore_errors = [
115
+ "message is not modified",
116
+ "query is too old",
117
+ "message to edit not found",
118
+ "chat not found", # Might indicate user blocked bot, non-retryable
119
+ "bot was blocked by the user",
120
+ ]
121
+ if any(err in str(e).lower() for err in ignore_errors):
122
+ logger.warning(f"Ignoring non-critical BadRequest during bot operation: {e}")
123
+ return None # Indicate no action needed or failed benignly
124
+ logger.error(f"Potentially critical BadRequest during bot operation: {e}")
125
+ raise # Reraise other BadRequests (might be retryable by tenacity)
126
  except TelegramError as e:
127
+ logger.warning(f"TelegramError during bot operation (will retry if applicable): {e}")
128
+ raise
129
  except Exception as e:
130
  logger.error(f"Unexpected error during bot operation: {e}", exc_info=True)
131
+ raise
132
+
133
 
134
  # --- Helper Functions ---
135
  def is_youtube_url(url):
136
  """Checks if the URL is a valid YouTube video or shorts URL."""
 
137
  youtube_regex = re.compile(
138
  r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/'
139
  r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?'
140
+ r'([\w-]{11})'
141
+ r'(?:\S+)?',
142
  re.IGNORECASE)
143
  match = youtube_regex.search(url)
144
  logger.debug(f"is_youtube_url check for '{url}': {'Match found' if match else 'No match'}")
 
146
 
147
  def extract_youtube_id(url):
148
  """Extracts the YouTube video ID from a URL."""
 
149
  youtube_regex = re.compile(
150
  r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/'
151
  r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?'
152
+ r'([\w-]{11})'
153
  r'(?:\S+)?',
154
  re.IGNORECASE)
155
  match = youtube_regex.search(url)
 
167
  async def fetch_url_content(url: str, timeout: int = 20) -> Optional[str]:
168
  """Fetches content from a URL using httpx asynchronously."""
169
  headers = {
170
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36', # Updated UA
171
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
172
+ 'Accept-Language': 'en-US,en;q=0.9',
173
+ 'Connection': 'keep-alive',
174
  }
175
  try:
176
+ async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers, http2=True) as client: # Enable HTTP/2
177
  response = await client.get(url)
178
+ response.raise_for_status()
179
+ # *** FIX: Use response.encoding or response.charset_encoding ***
180
+ # response.encoding will try to decode based on headers/content
181
+ # If it fails, default to utf-8
182
+ try:
183
+ # Accessing .text forces encoding detection
184
+ content = response.text
185
+ logger.debug(f"Detected encoding for {url}: {response.encoding}")
186
+ return content
187
+ except UnicodeDecodeError:
188
+ logger.warning(f"UnicodeDecodeError for {url} with encoding {response.encoding}. Trying raw bytes with utf-8.")
189
+ # Fallback: read bytes and decode utf-8 ignoring errors
190
+ return response.content.decode('utf-8', errors='ignore')
191
+ except Exception as e:
192
+ logger.error(f"Error decoding response for {url}: {e}")
193
+ return None # Cannot decode reliably
194
+
195
  except httpx.HTTPStatusError as e:
196
  logger.error(f"HTTP error fetching {url}: {e.response.status_code} - {e}")
197
+ except httpx.ConnectError as e:
198
+ # Catch specific connection errors like DNS failures
199
+ logger.error(f"Connection error fetching {url}: {e}")
200
+ except httpx.TimeoutException as e:
201
+ logger.error(f"Timeout error fetching {url}: {e}")
202
  except httpx.RequestError as e:
203
  logger.error(f"Request error fetching {url}: {e}")
204
  except Exception as e:
205
  logger.error(f"Unexpected error fetching {url}: {e}", exc_info=True)
206
  return None
207
 
208
+
209
  async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
210
  """Fetches YouTube transcript using Supadata API."""
211
  if not api_key: return None
 
213
  headers = {'X-API-Key': api_key, 'Accept': 'application/json'}
214
  logger.info(f"Attempting transcript fetch via Supadata for {video_id}")
215
  try:
216
+ # Note: If CERTIFICATE_VERIFY_FAILED persists, it's an issue with api.supadata.net's cert.
217
+ # Do NOT disable verification (verify=False) unless absolutely necessary and understood.
218
  async with httpx.AsyncClient(timeout=30.0) as client:
219
  response = await client.get(api_url, headers=headers)
220
  response.raise_for_status()
 
226
  else:
227
  logger.warning(f"Supadata response format unexpected or empty for {video_id}: {data}")
228
  return None
229
+ except httpx.ConnectError as e:
230
+ # Log SSL verification errors specifically if they occur
231
+ if "CERTIFICATE_VERIFY_FAILED" in str(e):
232
+ logger.error(f"Supadata API SSL certificate verification failed for {video_id}: {e}. This is likely an issue with api.supadata.net's certificate.")
233
+ else:
234
+ logger.error(f"Supadata API connection error for {video_id}: {e}")
235
  except httpx.HTTPStatusError as e:
236
  logger.error(f"Supadata API HTTP error for {video_id}: {e.response.status_code} - {e}")
237
  except Exception as e:
 
240
 
241
  async def get_transcript_via_apify(video_id: str, api_token: str) -> Optional[str]:
242
  """Fetches YouTube transcript using Apify YouTube Scraper Actor."""
243
+ global APIFY_ACTOR_NAME # Use the globally configured/default actor name
244
  if not ApifyClient or not api_token: return None
245
+ logger.info(f"Attempting transcript fetch via Apify (Actor: {APIFY_ACTOR_NAME}) for {video_id}")
246
  try:
247
  client = ApifyClient(api_token)
248
+ # *** FIX: Use the correct actor name ***
249
+ actor = client.actor(APIFY_ACTOR_NAME)
250
+ if not actor:
251
+ logger.error(f"Could not find Apify actor: {APIFY_ACTOR_NAME}")
252
+ return None
253
+
254
+ actor_run = await asyncio.to_thread(
255
+ actor.call, # Run blocking call in thread
256
+ run_input={
257
+ "startUrls": [{"url": f"https://www.youtube.com/watch?v={video_id}"}], # Use correct input format if needed
258
+ "maxResultStreams": 0,
259
+ "maxResults": 1, # Only need info for one video
260
+ "maxResultCommentStreams": 0,
261
+ "proxyConfiguration": {"useApifyProxy": True},
262
  "subtitles": True, # Explicitly request subtitles/transcript
263
+ "maxDurationMinutes": 0, # No duration limit
264
+ "skipComments": True,
265
+ # Check actor docs for exact input schema
266
+ },
267
+ timeout_secs=120, # Timeout for the call itself
268
+ wait_secs=120 # Timeout for waiting for run completion
269
  )
270
 
271
+ if not actor_run or 'defaultDatasetId' not in actor_run:
272
+ logger.warning(f"Apify actor run did not return expected dataset ID for {video_id}. Run details: {actor_run}")
273
+ return None
274
+
275
+ logger.info(f"Apify actor run started/retrieved for {video_id}. Dataset ID: {actor_run['defaultDatasetId']}")
276
+
277
+ # Fetch results from the dataset
278
+ dataset = client.dataset(actor_run["defaultDatasetId"])
279
+ # Run list_items in thread as it can be blocking I/O
280
+ dataset_page = await asyncio.to_thread(dataset.list_items, limit=5) # Limit items fetched initially
281
+
282
+ if dataset_page and dataset_page.items:
283
+ for item in dataset_page.items:
284
+ # Apify output structure can vary; adapt as needed
285
+ transcript_text = item.get('transcript') # Check common keys
286
+ if not transcript_text and 'subtitles' in item: # Check alternative
287
+ if isinstance(item['subtitles'], list) and len(item['subtitles']) > 0:
288
+ transcript_text = " ".join(line.get('text', '') for line in item['subtitles'][0].get('lines', []))
289
+ elif isinstance(item['subtitles'], str): # Sometimes it's just a string
290
+ transcript_text = item['subtitles']
291
+
292
+ if transcript_text and isinstance(transcript_text, str) and transcript_text.strip():
293
+ logger.info(f"Apify transcript fetched successfully for {video_id} (length: {len(transcript_text)})")
294
+ return transcript_text.strip()
295
+
296
+ logger.warning(f"Apify run completed for {video_id}, but no transcript found in dataset items.")
297
  else:
298
+ logger.warning(f"Apify run completed for {video_id}, but dataset was empty or inaccessible.")
299
 
300
+ except ApifyApiError as e:
301
+ # Catch specific Apify errors like "Actor not found"
302
+ logger.error(f"Apify API error fetching transcript for {video_id} (Actor: {APIFY_ACTOR_NAME}): {e}")
303
  except Exception as e:
304
+ logger.error(f"Unexpected error fetching transcript via Apify for {video_id}: {e}", exc_info=True)
305
  return None
306
 
307
 
 
317
  # 2. Try youtube-transcript-api (Direct method)
318
  logger.info(f"Attempting transcript fetch via youtube-transcript-api for {video_id}")
319
  try:
 
320
  transcript_list = await asyncio.to_thread(YouTubeTranscriptApi.get_transcript, video_id)
321
  transcript = " ".join([item['text'] for item in transcript_list])
322
  logger.info(f"youtube-transcript-api transcript fetched successfully for {video_id} (length: {len(transcript)})")
 
342
  return None
343
 
344
  try:
 
345
  def parse_html(content):
346
+ # Use lxml if available, otherwise html.parser
347
+ soup = BeautifulSoup(content, DEFAULT_PARSER)
348
+ for script_or_style in soup(["script", "style", "nav", "footer", "aside", "header", "form", "button", "iframe"]):
349
  script_or_style.decompose()
350
+ # Consider targeting specific elements like <article>, <main>, .post-content etc.
351
+ main_content = soup.find('article') or soup.find('main') or soup.body
352
+ if not main_content: main_content = soup # Fallback to whole soup if no main tags
353
+
354
+ text = main_content.get_text(separator='\n', strip=True)
355
  lines = (line.strip() for line in text.splitlines())
356
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
357
  text = '\n'.join(chunk for chunk in chunks if chunk)
 
359
 
360
  text_content = await asyncio.to_thread(parse_html, html_content)
361
 
362
+ if text_content and len(text_content) > 100:
363
  logger.info(f"Successfully scraped content via requests/BeautifulSoup for {url} (length: {len(text_content)})")
364
  return text_content
365
  else:
 
374
  if not api_key: return None
375
  api_endpoint = "https://api.urltotext.ai/text"
376
  headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
377
+ payload = {"url": url, "text_only": True}
378
  logger.info(f"Attempting website content fetch via UrlToText API for: {url}")
379
 
380
  try:
381
+ async with httpx.AsyncClient(timeout=45.0) as client:
382
  response = await client.post(api_endpoint, headers=headers, json=payload)
383
  response.raise_for_status()
384
  data = response.json()
 
389
  else:
390
  logger.warning(f"UrlToText API response did not contain text for {url}. Response: {data}")
391
  return None
392
+ except httpx.ConnectError as e:
393
+ # Catch DNS error specifically if needed, but general ConnectError covers it
394
+ logger.error(f"UrlToText API connection error for {url}: {e}. Check network/DNS.")
395
  except httpx.HTTPStatusError as e:
396
  logger.error(f"UrlToText API HTTP error for {url}: {e.response.status_code} - {e}")
397
  except Exception as e:
 
401
  # --- Summarization Function ---
402
  async def generate_summary(content: str, summary_type: str, api_key: Optional[str]) -> str:
403
  """Generates a summary using OpenRouter API."""
404
+ global OPENROUTER_MODEL # Use the globally configured/default model
405
  if not api_key:
406
  return "Error: OpenRouter API key is not configured."
407
  if not content:
408
  return "Error: No content provided to summarize."
409
 
 
410
  if len(content) < 50:
411
  return "The provided content is too short to summarize effectively."
412
 
 
 
 
413
  max_chars = 100000
414
  if len(content) > max_chars:
415
  logger.warning(f"Content length ({len(content)}) exceeds max_chars ({max_chars}), truncating.")
416
  content = content[:max_chars]
417
 
418
  prompt_template = """
419
+ Please summarize the following text. The summary should capture the key points and main ideas accurately and concisely.
420
  Provide the summary in {format_style} format.
421
 
422
  Text to summarize:
 
427
  Summary ({format_style}):
428
  """
429
  format_style = "a concise paragraph" if summary_type == "paragraph" else "bullet points (using * or - for each point)"
 
430
  prompt = prompt_template.format(text=content, format_style=format_style)
431
 
432
+ logger.info(f"Sending request to OpenRouter (Model: {OPENROUTER_MODEL}) for {summary_type} summary.")
 
 
 
433
 
434
  try:
435
+ async with httpx.AsyncClient(timeout=120.0) as client:
436
  response = await client.post(
437
  url="https://openrouter.ai/api/v1/chat/completions",
438
  headers={
439
  "Authorization": f"Bearer {api_key}",
440
+ "Content-Type": "application/json",
441
+ # Optional: Add custom site identifier
442
+ # "HTTP-Referer": "YOUR_SITE_URL",
443
+ # "X-Title": "Telegram Summarizer Bot"
444
  },
445
  json={
446
+ "model": OPENROUTER_MODEL,
447
  "messages": [{"role": "user", "content": prompt}],
448
+ "max_tokens": 1024, # Adjust based on expected summary length
449
+ # Optional: Add temperature, top_p etc. if needed
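+ # e.g. (illustrative, untuned values): "temperature": 0.3, "top_p": 0.9,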
450
  },
451
  )
452
  response.raise_for_status()
 
456
  summary = data["choices"][0].get("message", {}).get("content", "").strip()
457
  if summary:
458
  logger.info(f"Summary generated successfully (length: {len(summary)})")
459
+ # More robust Markdown escaping needed for PTB's MarkdownV2
460
+ # For simple Markdown, basic escaping might suffice, but V2 is safer
461
+ # summary = escape_markdown(summary) # Implement or import escape_markdown if using V2
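+ # e.g. PTB ships a helper for this: from telegram.helpers import escape_markdown
+ # summary = escape_markdown(summary, version=2)  # pair with ParseMode.MARKDOWN_V2 when sending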
462
+ # Basic escaping for ParseMode.MARKDOWN:
463
  summary = summary.replace('_', r'\_').replace('*', r'\*').replace('[', r'\[').replace('`', r'\`')
464
  return summary
465
  else:
466
  logger.error("OpenRouter response successful, but summary content is empty.")
467
  return "Sorry, the AI generated an empty summary. Please try again."
468
  else:
469
+ # Log the error details if available in the response
470
+ error_details = data.get("error")
471
+ logger.error(f"OpenRouter response format unexpected or error: {error_details or data}")
472
+ return f"Sorry, I received an unexpected response or error from the summarization service: {error_details}"
473
 
474
  except httpx.HTTPStatusError as e:
475
  error_body = ""
476
+ try: error_body = e.response.text
477
+ except Exception: pass
 
 
478
  logger.error(f"OpenRouter API HTTP error: {e.response.status_code} - {e}. Response body: {error_body}")
479
  return f"Sorry, there was an error communicating with the summarization service (HTTP {e.response.status_code})."
480
  except Exception as e:
 
486
  async def process_summary_task(
487
  user_id: int,
488
  chat_id: int,
489
+ message_id_to_edit: Optional[int],
490
  url: str,
491
  summary_type: str,
492
  bot_token: str
 
495
  task_id = f"{user_id}-{message_id_to_edit or 'new'}"
496
  logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
497
 
498
+ # Create a new request handler and bot instance for this task
499
+ background_request: Optional[BaseRequest] = None
500
+ bot: Optional[Bot] = None
501
+ try:
502
+ background_request = HTTPXRequest(
503
+ connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0, http_version="1.1"
504
+ )
505
+ bot = Bot(token=bot_token, request=background_request)
506
+ except Exception as e:
507
+ logger.critical(f"[Task {task_id}] Failed to create background bot instance: {e}", exc_info=True)
508
+ # Cannot proceed without a bot instance
509
+ return # Or raise? Silently failing might hide issues.
510
 
511
  content = None
512
  user_feedback_message = None
513
  success = False
514
  final_summary = ""
515
+ status_message_id = message_id_to_edit
516
 
517
  try:
518
+ # --- Inform User Processing Has Started ---
519
  processing_message_text = f"⏳ Working on your '{summary_type}' summary for:\n`{url}`\n\n_(Fetching & summarizing...)_"
 
520
  if status_message_id:
521
  try:
522
  await retry_bot_operation(
523
+ bot.edit_message_text, chat_id=chat_id, message_id=status_message_id,
524
+ text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None
 
 
 
 
525
  )
526
  logger.debug(f"[Task {task_id}] Successfully edited message {status_message_id} to 'Processing'")
527
  except Exception as e:
528
  logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Will send a new status message.")
529
+ status_message_id = None
530
+ if not status_message_id:
 
531
  try:
532
  status_message = await retry_bot_operation(
533
+ bot.send_message, chat_id=chat_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN
 
 
 
534
  )
535
+ if status_message: # Check if message was actually sent (retry might return None on ignore)
536
+ status_message_id = status_message.message_id
537
+ logger.debug(f"[Task {task_id}] Sent new status message {status_message_id}")
538
+ else:
539
+ logger.error(f"[Task {task_id}] Failed to send new status message after retries.")
540
+ raise RuntimeError("Failed to send initial status message")
541
  except Exception as e:
542
  logger.error(f"[Task {task_id}] Failed to send new status message: {e}")
 
543
  raise RuntimeError("Failed to send initial status message") from e
544
 
545
  # --- Main Content Fetching and Summarization ---
546
  try:
547
  await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
 
 
548
  is_yt = is_youtube_url(url)
549
  logger.debug(f"[Task {task_id}] URL is YouTube: {is_yt}")
 
550
  if is_yt:
551
  video_id = extract_youtube_id(url)
552
  if video_id:
553
  logger.info(f"[Task {task_id}] Fetching YouTube transcript for {video_id}")
554
  content = await get_youtube_transcript(video_id, url, SUPADATA_API_KEY, APIFY_API_TOKEN)
555
+ if not content: user_feedback_message = "⚠️ Sorry, I couldn't retrieve the transcript for that YouTube video. It might be disabled or unavailable."
556
+ else: user_feedback_message = "⚠️ Couldn't extract a valid YouTube video ID from the link."
 
 
557
  else:
558
  logger.info(f"[Task {task_id}] Attempting website scrape for: {url}")
559
  content = await get_website_content_via_requests(url)
560
  if not content and URLTOTEXT_API_KEY:
561
+ logger.info(f"[Task {task_id}] Basic scrape failed/insufficient, trying UrlToText API...")
562
  await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
563
  content = await get_website_content_via_urltotext_api(url, URLTOTEXT_API_KEY)
564
+ if not content: user_feedback_message = "⚠️ Sorry, I couldn't fetch or extract meaningful content from that website."
565
 
 
 
 
 
566
  if content:
567
  logger.info(f"[Task {task_id}] Content fetched (length: {len(content)}). Generating '{summary_type}' summary.")
568
  await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
 
569
  final_summary = await generate_summary(content, summary_type, OPENROUTER_API_KEY)
 
570
  if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
571
+ user_feedback_message = f"⚠️ {final_summary}"
572
+ else: success = True
573
+ # If content fetching failed, user_feedback_message is already set
 
 
574
 
575
  except Exception as e:
576
  logger.error(f"[Task {task_id}] Error during content fetching or summarization: {e}", exc_info=True)
 
578
 
579
  # --- Send Final Result or Error ---
580
  if success and final_summary:
 
581
  max_length = 4096
582
  summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
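+ # (note) 4096 is Telegram's hard limit for message text; recent PTB versions also expose it via telegram.constants.MessageLimit (e.g. MAX_TEXT_LENGTH)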
 
 
583
  await retry_bot_operation(
584
+ bot.send_message, chat_id=chat_id, text=summary_parts[0],
585
+ parse_mode=ParseMode.MARKDOWN, link_preview_options={'is_disabled': True}
 
 
 
586
  )
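+ # (note) on PTB >= 20.8 the typed telegram.LinkPreviewOptions(is_disabled=True) can be passed instead of the plain dict used above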
 
587
  for part in summary_parts[1:]:
588
+ await asyncio.sleep(0.5)
589
  await retry_bot_operation(
590
+ bot.send_message, chat_id=chat_id, text=part,
591
+ parse_mode=ParseMode.MARKDOWN, link_preview_options={'is_disabled': True}
 
 
 
592
  )
593
  logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
594
+ elif user_feedback_message:
 
595
  logger.warning(f"[Task {task_id}] Sending feedback/error message: {user_feedback_message}")
596
  await retry_bot_operation(
597
+ bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True}
 
 
 
598
  )
599
+ else:
 
600
  logger.error(f"[Task {task_id}] Reached end of task without success or specific error message.")
601
  await retry_bot_operation(
602
+ bot.send_message, chat_id=chat_id, text="❓ Something went wrong, but no specific error was identified.",
 
 
603
  link_preview_options={'is_disabled': True}
604
  )
605
 
 
606
  except Exception as e:
607
  logger.critical(f"[Task {task_id}] Critical error within task processing: {e}", exc_info=True)
608
  try:
609
+ # Use the bot instance created at the start of the task if available
610
+ if bot:
611
+ await retry_bot_operation(
612
+ bot.send_message, chat_id=chat_id,
613
+ text="❌ A critical internal error occurred. Please report this if it persists."
614
+ )
615
+ else:
616
+ logger.error("[Task ??] Cannot send critical error message: Bot instance not available.")
617
  except Exception:
618
  logger.exception(f"[Task {task_id}] Failed even to send critical error message.")
619
  finally:
620
  # --- Clean up Status Message ---
621
+ if status_message_id and bot: # Ensure bot exists before trying to delete
622
  try:
623
  await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=status_message_id)
624
  logger.debug(f"[Task {task_id}] Deleted status message {status_message_id}")
625
  except Exception as e:
626
+ # Log benignly if deletion fails (e.g., message already deleted)
627
  logger.warning(f"[Task {task_id}] Failed to delete status message {status_message_id}: {e}")
628
 
629
+ # --- Clean up Background Bot's HTTPX Client ---
630
+ # HTTPXRequest exposes a public shutdown() coroutine; prefer it over reaching into the private _client attribute
631
+ if background_request:
632
  try:
633
+ await background_request.shutdown()
634
+ logger.debug(f"[Task {task_id}] Background bot's request client shut down.")
635
  except Exception as e:
636
+ logger.warning(f"[Task {task_id}] Error shutting down background bot's request client: {e}")
637
+ else:
638
+ logger.debug(f"[Task {task_id}] No background request client to shut down.")
639
 
640
  logger.info(f"[Task {task_id}] Task completed. Success: {success}")
641
 
642
 
643
+ # --- Telegram Bot Handlers (Mostly Unchanged) ---
644
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
645
  """Handles the /start command."""
646
  user = update.effective_user
647
  if not user or not update.message: return
648
  logger.info(f"User {user.id} initiated /start.")
649
+ mention = user.mention_html()
650
  start_message = (
651
  f"👋 Hello {mention}!\n\n"
652
  "I can summarise YouTube videos or web articles for you.\n\n"
 
667
  "3. Click the button for your preferred format.\n"
668
  "4. I'll fetch the content, summarise it using AI, and send it back to you!\n\n"
669
  "**Important Notes:**\n"
670
+ "- **YouTube:** Transcript availability varies. I try multiple methods.\n"
671
+ "- **Websites:** I attempt basic scraping and can use UrlToText API (if configured) for complex sites.\n"
672
+ "- **AI Summaries:** Provided by OpenRouter (using model: `{model}`). Accuracy may vary.\n"
673
+ "- **Length Limits:** Very long content might be truncated.\n\n"
674
  "Just send a link to get started!"
675
+ ).format(model=OPENROUTER_MODEL) # Show the model being used
 
676
  await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
677
 
678
  async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
 
682
  user = update.effective_user
683
  if not user: return
684
 
 
685
  url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
686
  match = re.search(url_pattern, message_text)
687
 
688
  if match:
689
  url = match.group(0)
690
+ url = re.sub(r'[.,!?)\]>]+$', '', url) # Basic cleanup
691
+ # Further clean URL if needed, e.g., removing tracking params (complex)
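+ # (sketch, stdlib only) e.g. dropping utm_* tracking parameters:
+ # from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode
+ # parts = urlsplit(url)
+ # kept = [(k, v) for k, v in parse_qsl(parts.query) if not k.lower().startswith("utm_")]
+ # url = urlunsplit(parts._replace(query=urlencode(kept)))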
692
  logger.info(f"User {user.id} sent potential URL: {url}")
693
 
 
694
  context.user_data['url_to_summarize'] = url
695
+ context.user_data['original_message_id'] = update.message.message_id
696
 
697
  keyboard = [
698
  [
699
+ InlineKeyboardButton("📜 Paragraph", callback_data="paragraph"),
700
  InlineKeyboardButton("🔹 Bullet Points", callback_data="points")
701
  ]
702
  ]
 
708
  link_preview_options={'is_disabled': True}
709
  )
710
  elif not message_text.startswith('/'):
 
711
  logger.debug(f"User {user.id} sent non-URL, non-command text: '{message_text[:50]}...'")
712
+ if "http" in message_text or "www." in message_text or ".com" in message_text or ".org" in message_text or ".net" in message_text:
 
713
  await update.message.reply_text("Hmm, that looks like it might be a link, but please ensure it starts with `http://` or `https://` and is a valid URL.")
714
+
715
 
716
  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
717
  """Handles button presses for summary type selection."""
718
  query = update.callback_query
719
  if not query or not query.message or not query.from_user:
720
  logger.warning("Callback query received without essential data.")
721
+ if query: await query.answer()
722
  return
723
 
724
  user = query.from_user
725
  summary_type = query.data
726
  query_id = query.id
727
 
 
 
728
  try:
729
  await query.answer()
730
  logger.debug(f"Acknowledged callback query {query_id} from user {user.id}")
731
  except Exception as e:
 
732
  logger.error(f"Error answering callback query {query_id} from user {user.id}: {e}", exc_info=True)
 
 
733
 
 
734
  url = context.user_data.get('url_to_summarize')
 
735
  message_id_to_edit = query.message.message_id
736
 
737
  logger.info(f"User {user.id} chose summary type '{summary_type}' for URL associated with message {message_id_to_edit}")
 
739
  if not url:
740
  logger.warning(f"No URL found in user_data for user {user.id} (callback query {query_id}). Editing message.")
741
  try:
742
+ # Edit the message the button was attached to
743
  await query.edit_message_text(text="⚠️ Oops! I couldn't find the link associated with this request. Please send the link again.")
744
  except Exception as e:
745
  logger.error(f"Failed to edit message to show 'URL not found' error: {e}")
746
  return
747
 
 
748
  context.user_data.pop('url_to_summarize', None)
749
+ context.user_data.pop('original_message_id', None)
750
 
751
  if not TELEGRAM_TOKEN:
752
  logger.critical("TELEGRAM_TOKEN is missing, cannot start background task!")
 
760
  process_summary_task(
761
  user_id=user.id,
762
  chat_id=query.message.chat_id,
763
+ message_id_to_edit=message_id_to_edit,
764
  url=url,
765
  summary_type=summary_type,
766
+ bot_token=TELEGRAM_TOKEN
767
  ),
768
  name=f"SummaryTask-{user.id}-{message_id_to_edit}"
769
  )
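+ # (note) asyncio keeps only weak references to tasks; consider storing this task
+ # (e.g. in a module-level set, discarded on completion) so it cannot be garbage-collected mid-run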
770
 
771
 
772
  async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
773
+ """Log Errors caused by Updates."""
774
+ # Ignore errors related to background task exceptions that were already handled/logged
775
+ if isinstance(context.error, AttributeError) and "'Bot' object has no attribute 'session'" in str(context.error):
776
+ logger.debug(f"Ignoring known cleanup error in error_handler: {context.error}")
777
+ return
778
+
779
  logger.error("Exception while handling an update:", exc_info=context.error)
780
 
781
+ # Example: Inform user on specific, potentially temporary errors
 
782
  # if isinstance(context.error, (NetworkError, TimedOut)):
783
+ # try:
784
+ # if update and isinstance(update, Update) and update.effective_chat:
785
+ # await context.bot.send_message(
786
+ # chat_id=update.effective_chat.id,
787
+ # text="I'm having temporary network issues. Please try again in a moment."
788
+ # )
789
+ # except Exception as e:
790
+ # logger.error(f"Failed to send error notification to user: {e}")
791
 
792
 
793
  # --- Bot Setup Function ---
 
797
  if not TELEGRAM_TOKEN:
798
  raise ValueError("TELEGRAM_TOKEN environment variable not set.")
799
 
 
800
  custom_request = HTTPXRequest(
801
+ connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0, http_version="1.1"
 
 
 
 
802
  )
803
 
804
  application = (
805
  Application.builder()
806
  .token(TELEGRAM_TOKEN)
807
  .request(custom_request)
 
 
808
  .build()
809
  )
810
 
 
811
  application.add_handler(CommandHandler("start", start))
812
  application.add_handler(CommandHandler("help", help_command))
 
 
813
  application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
814
  application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
 
 
815
  application.add_error_handler(error_handler)
816
 
817
  logger.info("Telegram application handlers configured.")
818
  return application
819
 
820
+ # --- ASGI Lifespan Context Manager ---
821
  @contextlib.asynccontextmanager
822
  async def lifespan(app: Starlette):
823
  """Handles PTB startup and shutdown during ASGI lifespan."""
824
+ global ptb_app, WEBHOOK_SECRET # Make secret global for access in webhook handler
825
  logger.info("ASGI Lifespan: Startup sequence initiated...")
826
 
827
  if not TELEGRAM_TOKEN:
 
831
  bot_info_text = "Bot info not available yet."
832
  try:
833
  ptb_app = await setup_bot_config()
834
+ await ptb_app.initialize()
 
835
  bot_info = await ptb_app.bot.get_me()
836
  bot_info_text = f"@{bot_info.username} (ID: {bot_info.id})"
837
  logger.info(f"Bot initialized: {bot_info_text}")
838
 
 
839
  current_webhook_info = await ptb_app.bot.get_webhook_info()
840
  if current_webhook_info and current_webhook_info.url:
841
  logger.info(f"Found existing webhook: {current_webhook_info.url}. Attempting to delete it.")
842
  try:
843
+ # Use drop_pending_updates=False if you want to process updates accumulated while down
844
+ if await ptb_app.bot.delete_webhook(drop_pending_updates=True):
845
+ logger.info("Existing webhook deleted successfully.")
846
+ else:
847
+ logger.warning("Failed to delete existing webhook (API returned False).")
848
  except Exception as e:
849
  logger.warning(f"Could not delete existing webhook: {e}")
850
+ await asyncio.sleep(1)
851
 
 
 
852
  space_host = os.environ.get("SPACE_HOST")
853
+ webhook_path = "/webhook"
854
  full_webhook_url = None
 
855
  if space_host:
856
+ protocol = "https://" # Assume HTTPS for HF Spaces
857
+ host = space_host.split('://')[-1] # Get host part regardless of existing protocol
858
+ full_webhook_url = f"{protocol}{host.rstrip('/')}{webhook_path}"
 
 
 
 
 
859
 
860
  if full_webhook_url:
861
  logger.info(f"Attempting to set webhook to: {full_webhook_url}")
862
+ # Use secret token if configured
863
+ set_webhook_args = {
864
+ "url": full_webhook_url,
865
+ "allowed_updates": Update.ALL_TYPES,
866
+ "drop_pending_updates": True,
867
+ }
868
+ if WEBHOOK_SECRET:
869
+ set_webhook_args["secret_token"] = WEBHOOK_SECRET
870
+ logger.info("Webhook will be set with a secret token.")
871
+
872
+ await asyncio.sleep(1.0) # Slightly shorter wait
873
  try:
874
+ await ptb_app.bot.set_webhook(**set_webhook_args)
 
 
 
 
 
875
  webhook_info = await ptb_app.bot.get_webhook_info()
876
+ # Check if the URL and secret status match expectations
877
+ if webhook_info.url == full_webhook_url:
878
+ logger.info(f"Webhook successfully set: URL='{webhook_info.url}', Pending={webhook_info.pending_update_count}, Secret={bool(WEBHOOK_SECRET)}")
879
+ else:
880
+ logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got '{webhook_info.url}'")
881
+ raise RuntimeError("Webhook URL mismatch after setting.")
882
+
883
  await ptb_app.start()
884
  logger.info("PTB Application started (webhook mode). Ready for updates.")
885
  except Exception as e:
886
  logger.error(f"FATAL: Failed to set webhook to {full_webhook_url}: {e}", exc_info=True)
 
 
887
  raise RuntimeError(f"Failed to set webhook: {e}") from e
888
  else:
889
+ logger.critical("Could not construct valid HTTPS webhook URL from SPACE_HOST.")
 
890
  raise RuntimeError("Webhook URL could not be determined.")
891
  else:
892
+ logger.critical("SPACE_HOST environment variable not found. Cannot set webhook for HF Space.")
 
 
 
 
893
  raise RuntimeError("SPACE_HOST env var missing, cannot run in webhook mode.")
894
 
 
895
  logger.info("ASGI Lifespan: Startup complete.")
896
  yield # Application runs here
897
 
898
  except Exception as startup_err:
899
  logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
 
 
 
900
  if ptb_app:
901
+ if ptb_app.running: await ptb_app.stop()
902
  await ptb_app.shutdown()
903
+ raise
904
  finally:
905
  logger.info("ASGI Lifespan: Shutdown sequence initiated...")
906
  if ptb_app:
 
908
  logger.info("Stopping PTB application...")
909
  await ptb_app.stop()
910
  logger.info("Shutting down PTB application...")
911
+ await ptb_app.shutdown() # This closes the main bot's request client
912
  logger.info("PTB Application shut down gracefully.")
913
  else:
914
  logger.info("PTB application was not initialized or startup failed.")
 
921
  bot_status = "Not Initialized"
922
  if ptb_app and ptb_app.bot:
923
  try:
 
924
  if ptb_app.running:
925
+ # bot.username is cached by initialize()/get_me() at startup, so no extra API call is needed here
926
+ bot_status = f"Running (@{ptb_app.bot.username})"
928
  else:
929
  bot_status = "Initialized but not running"
930
  except Exception as e:
931
  bot_status = f"Error checking status: {e}"
932
+ return PlainTextResponse(f"Telegram Bot Summarizer - Status: {bot_status}\nModel: {OPENROUTER_MODEL}\nApify Actor: {APIFY_ACTOR_NAME if _apify_token_exists else 'N/A'}")
933
 
 
934
 
935
  async def telegram_webhook(request: Request) -> Response:
936
  """Webhook endpoint called by Telegram."""
937
+ global WEBHOOK_SECRET # Access the global secret
938
  if not ptb_app:
939
  logger.error("Webhook received but PTB application not initialized.")
940
  return PlainTextResponse('Bot not initialized', status_code=503)
941
  if not ptb_app.running:
942
  logger.warning("Webhook received but PTB application not running.")
 
 
943
  return PlainTextResponse('Bot initialized but not running', status_code=503)
944
 
945
  try:
946
+ # Verify secret token if configured
947
+ if WEBHOOK_SECRET:
948
+ token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
949
+ if token_header != WEBHOOK_SECRET:
950
+ logger.warning(f"Webhook received with invalid secret token. Header: '{token_header}'")
951
+ return Response(content="Invalid secret token", status_code=403) # Forbidden
952
+
 
953
  update_data = await request.json()
954
  update = Update.de_json(data=update_data, bot=ptb_app.bot)
955
+ logger.debug(f"Processing update_id: {update.update_id} via webhook")
956
+ # PTB's process_update runs the handlers
957
  await ptb_app.process_update(update)
958
+ # Return 200 OK quickly to Telegram
959
+ return Response(status_code=200)
960
 
961
  except json.JSONDecodeError:
962
  logger.error("Webhook received invalid JSON.")
963
  return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
964
  except Exception as e:
965
  logger.error(f"Error processing webhook update: {e}", exc_info=True)
966
+ # Return 200 OK to Telegram even if processing failed, to avoid retries for app logic errors
967
+ return Response(status_code=200)
 
968
 
969
  # --- Create Starlette ASGI Application ---
970
  app = Starlette(
971
+ debug=False,
972
  lifespan=lifespan,
973
  routes=[
974
  Route("/", endpoint=health_check, methods=["GET"]),
 
978
  logger.info("Starlette ASGI application created with native routes.")
979
 
980
  # --- Development Server Execution Block (Optional) ---
 
 
981
  if __name__ == '__main__':
982
  import uvicorn
983
  logger.warning("Running in development mode using Uvicorn directly (not for production)")
984
+ # Use LOGGING_LEVEL env var or default to info
985
+ log_level = os.environ.get("LOGGING_LEVEL", "info").lower()
986
+ local_port = int(os.environ.get('PORT', 8080))
987
+ # Run Uvicorn with an import string so that reload=True can re-import the app ("__main__:app" would not survive the reloader)
988
+ uvicorn.run("main:app", host='0.0.0.0', port=local_port, log_level=log_level, reload=True)  # reload is for local development only