fmab777 commited on
Commit
b3e1b64
·
verified ·
1 Parent(s): cd72c3f

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +662 -220
main.py CHANGED
@@ -1,4 +1,4 @@
1
- # main.py (Revised with background task connection fixes)
2
  import os
3
  import re
4
  import logging
@@ -7,13 +7,14 @@ import json
7
  import html
8
  import contextlib
9
  import traceback
10
- from typing import Optional
11
 
12
  # --- Frameworks ---
13
- from flask import Flask, request, Response
14
  from starlette.applications import Starlette
15
- from starlette.routing import Mount
16
- from starlette.middleware.wsgi import WSGIMiddleware
 
17
 
18
  # --- Telegram Bot ---
19
  from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot
@@ -26,26 +27,28 @@ from telegram.ext import (
26
  CallbackQueryHandler,
27
  )
28
  from telegram.constants import ParseMode
29
- from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest
30
  from telegram.request import HTTPXRequest
31
 
32
  # --- Other Libraries ---
33
  import httpx
34
- from youtube_transcript_api import YouTubeTranscriptApi
35
  import requests
36
  from bs4 import BeautifulSoup
37
- from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
38
 
39
  _apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
40
  if _apify_token_exists:
41
  from apify_client import ApifyClient
 
42
  else:
43
- ApifyClient = None
 
44
 
45
  # --- Logging Setup ---
46
  logging.basicConfig(
47
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
48
- level=logging.DEBUG
49
  )
50
  logging.getLogger("httpx").setLevel(logging.WARNING)
51
  if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
@@ -77,36 +80,60 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
77
  logger.info("Secret loading attempt finished.")
78
 
79
  # --- Retry Decorator for Bot Operations ---
80
- def retry_bot_operation(func):
81
- @retry(
82
- stop=stop_after_attempt(3),
83
- wait=wait_exponential(multiplier=1, min=1, max=10),
84
- retry=retry_if_exception_type((NetworkError, RuntimeError)),
85
- before_sleep=lambda retry_state: logger.warning(
86
- f"Retrying bot operation due to {retry_state.outcome.exception()}. "
87
- f"Attempt {retry_state.attempt_number}/3"
88
- )
89
- )
90
- async def wrapper(*args, **kwargs):
91
- try:
92
- return await func(*args, **kwargs)
93
- except Exception as e:
94
- logger.error(f"Operation failed after retries: {e}")
95
- raise
96
- return wrapper
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
- # --- Helper Functions (unchanged from your original) ---
99
  def is_youtube_url(url):
100
  """Checks if the URL is a valid YouTube video or shorts URL."""
101
- youtube_regex = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
102
- match = re.search(youtube_regex, url)
 
 
 
 
 
 
103
  logger.debug(f"is_youtube_url check for '{url}': {'Match found' if match else 'No match'}")
104
  return bool(match)
105
 
106
  def extract_youtube_id(url):
107
  """Extracts the YouTube video ID from a URL."""
108
- youtube_id_regex = r'(?:youtube\.com/(?:watch\?v=|shorts/)|youtu\.be/)([\w-]{11})(?:\?|&|\s|$)'
109
- match = re.search(youtube_id_regex, url)
 
 
 
 
 
 
110
  if match:
111
  video_id = match.group(1)
112
  logger.debug(f"Extracted YouTube ID '{video_id}' from URL: {url}")
@@ -115,66 +142,327 @@ def extract_youtube_id(url):
115
  logger.warning(f"Could not extract YouTube ID from URL: {url}")
116
  return None
117
 
118
- # --- Content Fetching Functions (unchanged from your original) ---
119
- # [Keep all your existing content fetching functions exactly as they were]
120
- # get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript,
121
- # get_website_content_via_requests, get_website_content_via_urltotext_api, generate_summary
122
 
123
- # --- Revised Background Task Processing ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  async def process_summary_task(
125
  user_id: int,
126
  chat_id: int,
127
- message_id_to_edit: int,
128
  url: str,
129
  summary_type: str,
130
- bot_token: str # Now receiving token instead of bot instance
131
  ) -> None:
132
  """Handles the actual fetching and summarization in a background task."""
133
- task_id = f"{user_id}-{message_id_to_edit}"
134
  logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
135
-
136
  # Create a new bot instance for this task
137
- bot = Bot(token=bot_token)
138
-
 
 
 
 
 
 
 
 
 
 
139
  try:
140
- # --- Inform User Processing Has Started ---
141
- processing_message_text = f"⏳ Working on your '{summary_type}' summary for the link...\n_(This might take up to a minute depending on the content)_"
142
- status_message_sent_id = None
143
-
144
- @retry_bot_operation
145
- async def edit_or_send_status():
146
- nonlocal status_message_sent_id, message_id_to_edit
147
  try:
148
- await bot.edit_message_text(
149
- chat_id=chat_id,
150
- message_id=message_id_to_edit,
151
- text=processing_message_text
152
- )
153
- logger.debug(f"[Task {task_id}] Successfully edited message {message_id_to_edit}")
154
- except (TimedOut, NetworkError, BadRequest) as e:
155
- logger.warning(f"[Task {task_id}] Could not edit original message: {e}. Sending new status message.")
156
- message_id_to_edit = None
157
- status_message = await bot.send_message(
158
  chat_id=chat_id,
159
- text=processing_message_text
 
 
 
160
  )
161
- status_message_sent_id = status_message.message_id
162
- logger.debug(f"[Task {task_id}] Sent new status message {status_message_sent_id}")
163
-
164
- await edit_or_send_status()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
  # --- Main Content Fetching and Summarization ---
167
- content = None
168
- user_feedback_message = None
169
- success = False
170
-
171
  try:
172
- # Send 'typing' action
173
- @retry_bot_operation
174
- async def send_typing():
175
- await bot.send_chat_action(chat_id=chat_id, action='typing')
176
-
177
- await send_typing()
178
 
179
  # --- Determine Content Type and Fetch ---
180
  is_yt = is_youtube_url(url)
@@ -184,98 +472,122 @@ async def process_summary_task(
184
  video_id = extract_youtube_id(url)
185
  if video_id:
186
  logger.info(f"[Task {task_id}] Fetching YouTube transcript for {video_id}")
187
- content = await get_youtube_transcript(
188
- video_id,
189
- url,
190
- SUPADATA_API_KEY,
191
- APIFY_API_TOKEN
192
- )
193
  if not content:
194
- user_feedback_message = "⚠️ Sorry, I couldn't retrieve the transcript for that YouTube video."
 
 
195
  else:
196
  logger.info(f"[Task {task_id}] Attempting website scrape for: {url}")
197
  content = await get_website_content_via_requests(url)
198
  if not content and URLTOTEXT_API_KEY:
199
- await send_typing()
 
200
  content = await get_website_content_via_urltotext_api(url, URLTOTEXT_API_KEY)
201
- if not content:
202
- user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website."
 
203
 
204
  # --- Generate Summary if Content Was Fetched ---
205
  if content:
206
- logger.info(f"[Task {task_id}] Generating '{summary_type}' summary")
207
- await send_typing()
208
-
209
- summary = await generate_summary(content, summary_type, OPENROUTER_API_KEY)
210
 
211
- if summary.startswith("Error:") or summary.startswith("Sorry,"):
212
- user_feedback_message = f"⚠️ {summary}"
 
 
213
  else:
214
- @retry_bot_operation
215
- async def send_summary():
216
- await bot.send_message(
217
- chat_id=chat_id,
218
- text=summary,
219
- parse_mode=ParseMode.MARKDOWN,
220
- link_preview_options={'is_disabled': True}
221
- )
222
-
223
- await send_summary()
224
- success = True
225
 
226
  except Exception as e:
227
- logger.error(f"[Task {task_id}] Error during processing: {e}", exc_info=True)
228
  user_feedback_message = "❌ An unexpected error occurred while processing your request."
229
 
230
- # --- Send Final Feedback Message if Processing Failed ---
231
- if user_feedback_message and not success:
232
- @retry_bot_operation
233
- async def send_feedback():
234
- await bot.send_message(chat_id=chat_id, text=user_feedback_message)
235
-
236
- await send_feedback()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
  except Exception as e:
239
- logger.error(f"[Task {task_id}] Critical error in task: {e}", exc_info=True)
240
  try:
241
- await bot.send_message(
 
242
  chat_id=chat_id,
243
- text="❌ A critical error occurred. Please try again later."
244
  )
245
  except Exception:
246
- pass
247
  finally:
248
- # --- Clean up Status Message(s) ---
249
- try:
250
- if status_message_sent_id:
251
- await bot.delete_message(chat_id=chat_id, message_id=status_message_sent_id)
252
- elif message_id_to_edit and success:
253
- await bot.delete_message(chat_id=chat_id, message_id=message_id_to_edit)
254
- elif message_id_to_edit and not success:
255
- final_error_text = user_feedback_message or "❌ An error occurred."
256
- await bot.edit_message_text(
257
- chat_id=chat_id,
258
- message_id=message_id_to_edit,
259
- text=final_error_text[:4090]
260
- )
261
- except Exception as e:
262
- logger.warning(f"[Task {task_id}] Cleanup error: {e}")
263
-
264
  # Ensure bot session is closed
265
- try:
266
- await bot.session.close()
267
- except Exception:
268
- pass
 
 
269
 
270
  logger.info(f"[Task {task_id}] Task completed. Success: {success}")
271
 
 
272
  # --- Telegram Bot Handlers ---
273
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
274
  """Handles the /start command."""
275
  user = update.effective_user
276
- if not user: return
277
  logger.info(f"User {user.id} initiated /start.")
278
- mention = user.mention_html() if user.username else user.first_name
279
  start_message = (
280
  f"👋 Hello {mention}!\n\n"
281
  "I can summarise YouTube videos or web articles for you.\n\n"
@@ -287,7 +599,8 @@ async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
287
  async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
288
  """Handles the /help command."""
289
  user = update.effective_user
290
- logger.info(f"User {user.id if user else '?'} requested /help.")
 
291
  help_text = (
292
  "**How to Use Me:**\n"
293
  "1. Send me a direct link (URL) to a YouTube video or a web article.\n"
@@ -296,10 +609,12 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
296
  "4. I'll fetch the content, summarise it using AI, and send it back to you!\n\n"
297
  "**Important Notes:**\n"
298
  "- **YouTube:** Getting transcripts can sometimes fail if they are disabled or unavailable.\n"
299
- "- **Websites:** Complex websites might not work perfectly.\n"
300
- "- **AI Summaries:** The AI tries its best to be accurate.\n\n"
 
301
  "Just send a link to get started!"
302
  )
 
303
  await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
304
 
305
  async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
@@ -309,17 +624,23 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
309
  user = update.effective_user
310
  if not user: return
311
 
312
- url_pattern = r'https?://(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}(?:/[^\s]*)?'
 
313
  match = re.search(url_pattern, message_text)
314
 
315
  if match:
316
  url = match.group(0)
317
- logger.info(f"User {user.id} sent URL: {url}")
 
 
 
 
318
  context.user_data['url_to_summarize'] = url
 
319
 
320
  keyboard = [
321
  [
322
- InlineKeyboardButton("📜 Paragraph Summary", callback_data="paragraph"),
323
  InlineKeyboardButton("🔹 Bullet Points", callback_data="points")
324
  ]
325
  ]
@@ -331,55 +652,89 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
331
  link_preview_options={'is_disabled': True}
332
  )
333
  elif not message_text.startswith('/'):
334
- await update.message.reply_text("Please send me a valid URL (starting with http:// or https://) to summarize.")
 
 
 
 
 
335
 
336
  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
337
  """Handles button presses for summary type selection."""
338
  query = update.callback_query
339
- if not query or not query.from_user or not query.message:
340
- try:
341
- await query.answer()
342
- except:
343
- pass
344
  return
345
 
346
- await query.answer() # Acknowledge the button press immediately
347
-
348
  user = query.from_user
349
  summary_type = query.data
350
- url = context.user_data.get('url_to_summarize')
351
  query_id = query.id
352
 
353
- logger.info(f"User {user.id} chose summary type '{summary_type}'")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
 
355
  if not url:
356
- logger.warning(f"No URL found for user {user.id}")
357
  try:
358
- await query.edit_message_text(text="⚠️ Oops! I lost the context for that link. Please send the link again.")
359
  except Exception as e:
360
- logger.error(f"Failed to edit message: {e}")
361
  return
362
 
363
- # Clear the URL from context
364
  context.user_data.pop('url_to_summarize', None)
 
365
 
366
- # Schedule background task with token instead of bot instance
 
 
 
 
 
 
 
367
  asyncio.create_task(
368
  process_summary_task(
369
  user_id=user.id,
370
  chat_id=query.message.chat_id,
371
- message_id_to_edit=query.message.message_id,
372
  url=url,
373
  summary_type=summary_type,
374
- bot_token=TELEGRAM_TOKEN
375
  ),
376
- name=f"SummaryTask-{user.id}-{query.message.message_id}"
377
  )
378
 
 
379
  async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
380
  """Log Errors caused by Updates or background tasks."""
381
- if context.error:
382
- logger.error(f"Exception while handling an update: {context.error}", exc_info=context.error)
 
 
 
 
 
 
 
 
 
383
 
384
  # --- Bot Setup Function ---
385
  async def setup_bot_config() -> Application:
@@ -388,124 +743,211 @@ async def setup_bot_config() -> Application:
388
  if not TELEGRAM_TOKEN:
389
  raise ValueError("TELEGRAM_TOKEN environment variable not set.")
390
 
 
391
  custom_request = HTTPXRequest(
392
  connect_timeout=10.0,
393
- read_timeout=30.0,
394
  write_timeout=30.0,
395
- pool_timeout=30.0,
396
- http_version="1.1"
397
  )
398
 
399
  application = (
400
  Application.builder()
401
  .token(TELEGRAM_TOKEN)
402
  .request(custom_request)
 
 
403
  .build()
404
  )
405
 
 
406
  application.add_handler(CommandHandler("start", start))
407
  application.add_handler(CommandHandler("help", help_command))
 
 
408
  application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
409
  application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
 
 
410
  application.add_error_handler(error_handler)
411
 
412
  logger.info("Telegram application handlers configured.")
413
  return application
414
 
415
- # --- ASGI Lifespan Context Manager ---
416
  @contextlib.asynccontextmanager
417
  async def lifespan(app: Starlette):
418
  """Handles PTB startup and shutdown during ASGI lifespan."""
419
  global ptb_app
420
  logger.info("ASGI Lifespan: Startup sequence initiated...")
421
 
 
 
 
 
 
422
  try:
423
  ptb_app = await setup_bot_config()
424
- await ptb_app.initialize()
425
- await ptb_app.start()
426
 
427
  bot_info = await ptb_app.bot.get_me()
428
- logger.info(f"Bot started: @{bot_info.username}")
 
429
 
430
- WEBHOOK_URL_BASE = os.environ.get("SPACE_HOST")
431
- if WEBHOOK_URL_BASE:
432
- if not WEBHOOK_URL_BASE.startswith("https://"):
433
- WEBHOOK_URL_BASE = f"https://{WEBHOOK_URL_BASE}"
434
- webhook_path = "/webhook"
435
- full_webhook_url = f"{WEBHOOK_URL_BASE.rstrip('/')}{webhook_path}"
436
-
437
- logger.info(f"Setting webhook to: {full_webhook_url}")
438
- await asyncio.sleep(2.0)
439
  try:
440
- await ptb_app.bot.set_webhook(
441
- url=full_webhook_url,
442
- allowed_updates=Update.ALL_TYPES,
443
- drop_pending_updates=True
444
- )
445
- webhook_info = await ptb_app.bot.get_webhook_info()
446
- logger.info(f"Webhook set: {webhook_info}")
447
  except Exception as e:
448
- logger.error(f"Failed to set webhook: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
 
450
  logger.info("ASGI Lifespan: Startup complete.")
451
- yield
452
 
453
  except Exception as startup_err:
454
- logger.critical(f"Startup error: {startup_err}", exc_info=True)
455
- raise
 
 
 
 
 
456
  finally:
457
  logger.info("ASGI Lifespan: Shutdown sequence initiated...")
458
  if ptb_app:
459
- try:
460
- await ptb_app.stop()
461
- await ptb_app.shutdown()
462
- logger.info("PTB Application shut down gracefully.")
463
- except Exception as shutdown_err:
464
- logger.error(f"Shutdown error: {shutdown_err}")
 
 
465
  logger.info("ASGI Lifespan: Shutdown complete.")
466
 
467
- # --- Flask App Setup ---
468
- flask_core_app = Flask(__name__)
469
 
470
- @flask_core_app.route('/')
471
- def index():
472
  """Basic health check endpoint."""
473
- bot_status = "Unknown"
474
  if ptb_app and ptb_app.bot:
475
- bot_status = f"Running (@{ptb_app.bot.username})"
476
- return f"Telegram Bot Summarizer - Status: {bot_status}"
 
 
 
 
 
 
 
 
 
477
 
478
- @flask_core_app.route('/webhook', methods=['POST'])
479
- async def webhook() -> Response:
480
  """Webhook endpoint called by Telegram."""
481
  if not ptb_app:
482
- return Response('Bot not initialized', status=503)
 
 
 
 
 
 
483
 
484
  try:
485
- update_data = request.get_json()
486
- if not update_data:
487
- return Response('Bad Request', status=400)
488
-
489
- update = Update.de_json(update_data, ptb_app.bot)
 
 
 
 
 
 
490
  await ptb_app.process_update(update)
491
- return Response('ok', status=200)
492
 
 
 
 
493
  except Exception as e:
494
- logger.error(f"Webhook error: {e}")
495
- return Response('Internal Server Error', status=500)
 
 
496
 
497
  # --- Create Starlette ASGI Application ---
498
  app = Starlette(
499
- debug=False,
500
  lifespan=lifespan,
501
  routes=[
502
- Mount("/", app=WSGIMiddleware(flask_core_app))
 
503
  ]
504
  )
505
- logger.info("Starlette ASGI application created.")
506
 
507
- # --- Development Server Execution Block ---
 
 
508
  if __name__ == '__main__':
509
- logger.warning("Running in development mode (Flask server only)")
510
- local_port = int(os.environ.get('PORT', 8080))
511
- flask_core_app.run(host='0.0.0.0', port=local_port, debug=True, use_reloader=False)
 
 
1
+ # main.py (Revised with Starlette-only routes and fixes)
2
  import os
3
  import re
4
  import logging
 
7
  import html
8
  import contextlib
9
  import traceback
10
+ from typing import Optional, Dict, Any
11
 
12
  # --- Frameworks ---
13
+ # Removed Flask imports
14
  from starlette.applications import Starlette
15
+ from starlette.routing import Route # Changed from Mount
16
+ from starlette.responses import PlainTextResponse, JSONResponse, Response # Starlette responses
17
+ from starlette.requests import Request # Starlette request object
18
 
19
  # --- Telegram Bot ---
20
  from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot
 
27
  CallbackQueryHandler,
28
  )
29
  from telegram.constants import ParseMode
30
+ from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest, TelegramError
31
  from telegram.request import HTTPXRequest
32
 
33
  # --- Other Libraries ---
34
  import httpx
35
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
36
  import requests
37
  from bs4 import BeautifulSoup
38
+ from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, before_sleep_log
39
 
40
  _apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
41
  if _apify_token_exists:
42
  from apify_client import ApifyClient
43
+ from apify_client.consts import ActorJobStatus
44
  else:
45
+ ApifyClient = None # type: ignore # Make type checkers happy
46
+
47
 
48
  # --- Logging Setup ---
49
  logging.basicConfig(
50
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
51
+ level=logging.INFO # Changed default to INFO, DEBUG is very verbose
52
  )
53
  logging.getLogger("httpx").setLevel(logging.WARNING)
54
  if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
 
80
  logger.info("Secret loading attempt finished.")
81
 
82
  # --- Retry Decorator for Bot Operations ---
83
+ # Increased attempts slightly and added logging before sleep
84
+ # Explicitly catch TelegramError subclasses for retries
85
+ @retry(
86
+ stop=stop_after_attempt(4), # Increased slightly
87
+ wait=wait_exponential(multiplier=1, min=2, max=15), # Adjusted wait
88
+ retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)), # Specific Telegram errors
89
+ before_sleep=before_sleep_log(logger, logging.WARNING), # Use tenacity logger
90
+ reraise=True # Reraise the exception if all retries fail
91
+ )
92
+ async def retry_bot_operation(func, *args, **kwargs):
93
+ """Wrapper to retry bot operations with exponential backoff."""
94
+ # Note: This is now a wrapper function, not a decorator factory
95
+ # Call it like: await retry_bot_operation(bot.send_message, chat_id=..., text=...)
96
+ try:
97
+ return await func(*args, **kwargs)
98
+ except BadRequest as e:
99
+ # Don't retry on bad requests that are likely permanent (e.g., chat not found)
100
+ # unless it's a common transient issue like message not modified
101
+ if "message is not modified" in str(e).lower():
102
+ logger.warning(f"Ignoring 'message not modified' error: {e}")
103
+ return None # Or indicate success/no action needed
104
+ logger.error(f"BadRequest during bot operation (will not retry unless specific): {e}")
105
+ raise # Reraise non-retryable BadRequests
106
+ except TelegramError as e:
107
+ logger.warning(f"TelegramError during bot operation (will retry): {e}")
108
+ raise # Reraise retryable errors for tenacity
109
+ except Exception as e:
110
+ logger.error(f"Unexpected error during bot operation: {e}", exc_info=True)
111
+ raise # Reraise unexpected errors
112
 
113
+ # --- Helper Functions ---
114
  def is_youtube_url(url):
115
  """Checks if the URL is a valid YouTube video or shorts URL."""
116
+ # More robust regex to handle various youtube domain variations and query params
117
+ youtube_regex = re.compile(
118
+ r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/'
119
+ r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?'
120
+ r'([\w-]{11})' # Group 1: Video ID
121
+ r'(?:\S+)?', # Optional non-whitespace characters after ID
122
+ re.IGNORECASE)
123
+ match = youtube_regex.search(url)
124
  logger.debug(f"is_youtube_url check for '{url}': {'Match found' if match else 'No match'}")
125
  return bool(match)
126
 
127
  def extract_youtube_id(url):
128
  """Extracts the YouTube video ID from a URL."""
129
+ # Use the same robust regex as is_youtube_url
130
+ youtube_regex = re.compile(
131
+ r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/'
132
+ r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?'
133
+ r'([\w-]{11})' # Group 1: Video ID
134
+ r'(?:\S+)?',
135
+ re.IGNORECASE)
136
+ match = youtube_regex.search(url)
137
  if match:
138
  video_id = match.group(1)
139
  logger.debug(f"Extracted YouTube ID '{video_id}' from URL: {url}")
 
142
  logger.warning(f"Could not extract YouTube ID from URL: {url}")
143
  return None
144
 
145
+ # --- Content Fetching Functions ---
 
 
 
146
 
147
+ # Using httpx for async requests
148
+ async def fetch_url_content(url: str, timeout: int = 20) -> Optional[str]:
149
+ """Fetches content from a URL using httpx asynchronously."""
150
+ headers = {
151
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
152
+ }
153
+ try:
154
+ async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
155
+ response = await client.get(url)
156
+ response.raise_for_status() # Raise an exception for bad status codes
157
+ # Detect encoding, fallback to UTF-8
158
+ response.encoding = response.apparent_encoding or 'utf-8'
159
+ return response.text
160
+ except httpx.HTTPStatusError as e:
161
+ logger.error(f"HTTP error fetching {url}: {e.response.status_code} - {e}")
162
+ except httpx.RequestError as e:
163
+ logger.error(f"Request error fetching {url}: {e}")
164
+ except Exception as e:
165
+ logger.error(f"Unexpected error fetching {url}: {e}", exc_info=True)
166
+ return None
167
+
168
+ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
169
+ """Fetches YouTube transcript using Supadata API."""
170
+ if not api_key: return None
171
+ api_url = f"https://api.supadata.net/youtube/transcript?video_id={video_id}"
172
+ headers = {'X-API-Key': api_key, 'Accept': 'application/json'}
173
+ logger.info(f"Attempting transcript fetch via Supadata for {video_id}")
174
+ try:
175
+ async with httpx.AsyncClient(timeout=30.0) as client:
176
+ response = await client.get(api_url, headers=headers)
177
+ response.raise_for_status()
178
+ data = response.json()
179
+ if data and isinstance(data, list) and data[0].get("text"):
180
+ transcript = " ".join([item["text"] for item in data if "text" in item])
181
+ logger.info(f"Supadata transcript fetched successfully for {video_id} (length: {len(transcript)})")
182
+ return transcript
183
+ else:
184
+ logger.warning(f"Supadata response format unexpected or empty for {video_id}: {data}")
185
+ return None
186
+ except httpx.HTTPStatusError as e:
187
+ logger.error(f"Supadata API HTTP error for {video_id}: {e.response.status_code} - {e}")
188
+ except Exception as e:
189
+ logger.error(f"Error fetching transcript via Supadata for {video_id}: {e}", exc_info=True)
190
+ return None
191
+
192
+ async def get_transcript_via_apify(video_id: str, api_token: str) -> Optional[str]:
193
+ """Fetches YouTube transcript using Apify YouTube Scraper Actor."""
194
+ if not ApifyClient or not api_token: return None
195
+ logger.info(f"Attempting transcript fetch via Apify for {video_id}")
196
+ try:
197
+ client = ApifyClient(api_token)
198
+ actor_call = client.actor("clockworks/youtube-scraper").call(
199
+ run_input={
200
+ "startUrls": [f"https://www.youtube.com/watch?v={video_id}"],
201
+ "maxResultStreams": 0,
202
+ "maxResults": 0,
203
+ "maxResultCommentStreams": 0,
204
+ "proxyConfiguration": {"useApifyProxy": True},
205
+ "subtitles": True, # Explicitly request subtitles/transcript
206
+ "extendOutputFunction": 'async ({ data, item, page, request, customData, Apify }) => { return item; }',
207
+ }
208
+ )
209
+
210
+ # Wait for the actor run to complete and fetch results
211
+ dataset_items = client.dataset(actor_call["defaultDatasetId"]).list_items().items
212
+ if dataset_items:
213
+ # Look for transcript data within the results
214
+ for item in dataset_items:
215
+ if 'subtitles' in item and isinstance(item['subtitles'], list) and len(item['subtitles']) > 0:
216
+ # Combine transcript lines, assuming standard format
217
+ transcript = " ".join(line.get('text', '') for line in item['subtitles'][0].get('lines', []))
218
+ if transcript.strip():
219
+ logger.info(f"Apify transcript fetched successfully for {video_id} (length: {len(transcript)})")
220
+ return transcript.strip()
221
+ logger.warning(f"Apify run completed for {video_id}, but no transcript found in results.")
222
+ else:
223
+ logger.warning(f"Apify run completed for {video_id}, but dataset was empty.")
224
+
225
+ except Exception as e:
226
+ logger.error(f"Error fetching transcript via Apify for {video_id}: {e}", exc_info=True)
227
+ return None
228
+
229
+
230
+ async def get_youtube_transcript(video_id: str, url: str, supadata_key: Optional[str], apify_token: Optional[str]) -> Optional[str]:
231
+ """Tries different methods to get a YouTube transcript."""
232
+ transcript = None
233
+
234
+ # 1. Try Supadata API (if key exists)
235
+ if supadata_key:
236
+ transcript = await get_transcript_via_supadata(video_id, supadata_key)
237
+ if transcript: return transcript
238
+
239
+ # 2. Try youtube-transcript-api (Direct method)
240
+ logger.info(f"Attempting transcript fetch via youtube-transcript-api for {video_id}")
241
+ try:
242
+ # Run in executor to avoid blocking async loop
243
+ transcript_list = await asyncio.to_thread(YouTubeTranscriptApi.get_transcript, video_id)
244
+ transcript = " ".join([item['text'] for item in transcript_list])
245
+ logger.info(f"youtube-transcript-api transcript fetched successfully for {video_id} (length: {len(transcript)})")
246
+ return transcript
247
+ except (TranscriptsDisabled, NoTranscriptFound):
248
+ logger.warning(f"Transcripts disabled or unavailable via youtube-transcript-api for {video_id}.")
249
+ except Exception as e:
250
+ logger.error(f"Error using youtube-transcript-api for {video_id}: {e}")
251
+
252
+ # 3. Try Apify (if token exists and other methods failed)
253
+ if not transcript and apify_token:
254
+ transcript = await get_transcript_via_apify(video_id, apify_token)
255
+ if transcript: return transcript
256
+
257
+ logger.warning(f"Failed to retrieve transcript for YouTube video {video_id} using all available methods.")
258
+ return None
259
+
260
+ async def get_website_content_via_requests(url: str) -> Optional[str]:
261
+ """Fetches and extracts main text content from a website using BeautifulSoup."""
262
+ logger.info(f"Attempting website scrape via requests/BeautifulSoup for: {url}")
263
+ html_content = await fetch_url_content(url)
264
+ if not html_content:
265
+ return None
266
+
267
+ try:
268
+ # Run BeautifulSoup parsing in executor thread
269
+ def parse_html(content):
270
+ soup = BeautifulSoup(content, 'html.parser')
271
+ # Remove script and style elements
272
+ for script_or_style in soup(["script", "style", "nav", "footer", "aside"]):
273
+ script_or_style.decompose()
274
+ # Get text, strip whitespace, join lines
275
+ text = soup.get_text(separator='\n', strip=True)
276
+ lines = (line.strip() for line in text.splitlines())
277
+ chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
278
+ text = '\n'.join(chunk for chunk in chunks if chunk)
279
+ return text
280
+
281
+ text_content = await asyncio.to_thread(parse_html, html_content)
282
+
283
+ if text_content and len(text_content) > 100: # Basic check for meaningful content
284
+ logger.info(f"Successfully scraped content via requests/BeautifulSoup for {url} (length: {len(text_content)})")
285
+ return text_content
286
+ else:
287
+ logger.warning(f"Scraping via requests/BeautifulSoup for {url} yielded minimal content (length: {len(text_content) if text_content else 0}).")
288
+ return None
289
+ except Exception as e:
290
+ logger.error(f"Error parsing website content with BeautifulSoup for {url}: {e}", exc_info=True)
291
+ return None
292
+
293
+ async def get_website_content_via_urltotext_api(url: str, api_key: str) -> Optional[str]:
294
+ """Fetches website content using the UrlToText API."""
295
+ if not api_key: return None
296
+ api_endpoint = "https://api.urltotext.ai/text"
297
+ headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
298
+ payload = {"url": url, "text_only": True} # Requesting only text content
299
+ logger.info(f"Attempting website content fetch via UrlToText API for: {url}")
300
+
301
+ try:
302
+ async with httpx.AsyncClient(timeout=45.0) as client: # Increased timeout
303
+ response = await client.post(api_endpoint, headers=headers, json=payload)
304
+ response.raise_for_status()
305
+ data = response.json()
306
+ if "text" in data and data["text"]:
307
+ content = data["text"]
308
+ logger.info(f"Successfully fetched content via UrlToText API for {url} (length: {len(content)})")
309
+ return content
310
+ else:
311
+ logger.warning(f"UrlToText API response did not contain text for {url}. Response: {data}")
312
+ return None
313
+ except httpx.HTTPStatusError as e:
314
+ logger.error(f"UrlToText API HTTP error for {url}: {e.response.status_code} - {e}")
315
+ except Exception as e:
316
+ logger.error(f"Error fetching content via UrlToText API for {url}: {e}", exc_info=True)
317
+ return None
318
+
319
+ # --- Summarization Function ---
320
+ async def generate_summary(content: str, summary_type: str, api_key: Optional[str]) -> str:
321
+ """Generates a summary using OpenRouter API."""
322
+ if not api_key:
323
+ return "Error: OpenRouter API key is not configured."
324
+ if not content:
325
+ return "Error: No content provided to summarize."
326
+
327
+ # Basic check for content length
328
+ if len(content) < 50:
329
+ return "The provided content is too short to summarize effectively."
330
+
331
+ # Truncate content if too long (adjust limit as needed based on model context window)
332
+ # This limit depends heavily on the chosen model. Claude Sonnet 3.5 has 200k tokens.
333
+ # Let's aim for roughly 100k characters as a safe-ish limit for many models.
334
+ max_chars = 100000
335
+ if len(content) > max_chars:
336
+ logger.warning(f"Content length ({len(content)}) exceeds max_chars ({max_chars}), truncating.")
337
+ content = content[:max_chars]
338
+
339
+ prompt_template = """
340
+ Please summarize the following text.
341
+ Provide the summary in {format_style} format.
342
+
343
+ Text to summarize:
344
+ ---
345
+ {text}
346
+ ---
347
+
348
+ Summary ({format_style}):
349
+ """
350
+ format_style = "a concise paragraph" if summary_type == "paragraph" else "bullet points (using * or - for each point)"
351
+
352
+ prompt = prompt_template.format(text=content, format_style=format_style)
353
+
354
+ # Recommended model: claude-3.5-sonnet-20240620, but allow others
355
+ model_to_use = os.environ.get("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet") # Default to Claude 3.5 Sonnet
356
+
357
+ logger.info(f"Sending request to OpenRouter (Model: {model_to_use}) for {summary_type} summary.")
358
+
359
+ try:
360
+ async with httpx.AsyncClient(timeout=120.0) as client: # Increased timeout for generation
361
+ response = await client.post(
362
+ url="https://openrouter.ai/api/v1/chat/completions",
363
+ headers={
364
+ "Authorization": f"Bearer {api_key}",
365
+ "Content-Type": "application/json"
366
+ },
367
+ json={
368
+ "model": model_to_use,
369
+ "messages": [{"role": "user", "content": prompt}],
370
+ "max_tokens": 1024, # Adjust as needed
371
+ },
372
+ )
373
+ response.raise_for_status()
374
+ data = response.json()
375
+
376
+ if data.get("choices") and len(data["choices"]) > 0:
377
+ summary = data["choices"][0].get("message", {}).get("content", "").strip()
378
+ if summary:
379
+ logger.info(f"Summary generated successfully (length: {len(summary)})")
380
+ # Simple Markdown sanitization (escape potential conflicts)
381
+ summary = summary.replace('_', r'\_').replace('*', r'\*').replace('[', r'\[').replace('`', r'\`')
382
+ return summary
383
+ else:
384
+ logger.error("OpenRouter response successful, but summary content is empty.")
385
+ return "Sorry, the AI generated an empty summary. Please try again."
386
+ else:
387
+ logger.error(f"OpenRouter response format unexpected: {data}")
388
+ return "Sorry, I received an unexpected response from the summarization service."
389
+
390
+ except httpx.HTTPStatusError as e:
391
+ error_body = ""
392
+ try:
393
+ error_body = e.response.text
394
+ except Exception:
395
+ pass # Ignore if reading body fails
396
+ logger.error(f"OpenRouter API HTTP error: {e.response.status_code} - {e}. Response body: {error_body}")
397
+ return f"Sorry, there was an error communicating with the summarization service (HTTP {e.response.status_code})."
398
+ except Exception as e:
399
+ logger.error(f"Error generating summary via OpenRouter: {e}", exc_info=True)
400
+ return "Sorry, an unexpected error occurred while generating the summary."
401
+
402
+
403
+ # --- Background Task Processing ---
404
  async def process_summary_task(
405
  user_id: int,
406
  chat_id: int,
407
+ message_id_to_edit: Optional[int], # Can be None if original message couldn't be edited
408
  url: str,
409
  summary_type: str,
410
+ bot_token: str
411
  ) -> None:
412
  """Handles the actual fetching and summarization in a background task."""
413
+ task_id = f"{user_id}-{message_id_to_edit or 'new'}"
414
  logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
415
+
416
  # Create a new bot instance for this task
417
+ # Use longer timeouts for background tasks which might involve heavy network I/O
418
+ custom_request = HTTPXRequest(
419
+ connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0, http_version="1.1"
420
+ )
421
+ bot = Bot(token=bot_token, request=custom_request)
422
+
423
+ content = None
424
+ user_feedback_message = None
425
+ success = False
426
+ final_summary = ""
427
+ status_message_id = message_id_to_edit # Start assuming we can edit the original
428
+
429
  try:
430
+ # --- Inform User Processing Has Started (Edit original or send new) ---
431
+ processing_message_text = f"⏳ Working on your '{summary_type}' summary for:\n`{url}`\n\n_(Fetching & summarizing...)_"
432
+
433
+ if status_message_id:
 
 
 
434
  try:
435
+ await retry_bot_operation(
436
+ bot.edit_message_text,
 
 
 
 
 
 
 
 
437
  chat_id=chat_id,
438
+ message_id=status_message_id,
439
+ text=processing_message_text,
440
+ parse_mode=ParseMode.MARKDOWN,
441
+ reply_markup=None # Remove buttons
442
  )
443
+ logger.debug(f"[Task {task_id}] Successfully edited message {status_message_id} to 'Processing'")
444
+ except Exception as e:
445
+ logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Will send a new status message.")
446
+ status_message_id = None # Flag to send a new message
447
+
448
+ if not status_message_id: # If editing failed or wasn't possible
449
+ try:
450
+ status_message = await retry_bot_operation(
451
+ bot.send_message,
452
+ chat_id=chat_id,
453
+ text=processing_message_text,
454
+ parse_mode=ParseMode.MARKDOWN
455
+ )
456
+ status_message_id = status_message.message_id
457
+ logger.debug(f"[Task {task_id}] Sent new status message {status_message_id}")
458
+ except Exception as e:
459
+ logger.error(f"[Task {task_id}] Failed to send new status message: {e}")
460
+ # Cannot proceed without informing user
461
+ raise RuntimeError("Failed to send initial status message") from e
462
 
463
  # --- Main Content Fetching and Summarization ---
 
 
 
 
464
  try:
465
+ await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
 
 
 
 
 
466
 
467
  # --- Determine Content Type and Fetch ---
468
  is_yt = is_youtube_url(url)
 
472
  video_id = extract_youtube_id(url)
473
  if video_id:
474
  logger.info(f"[Task {task_id}] Fetching YouTube transcript for {video_id}")
475
+ content = await get_youtube_transcript(video_id, url, SUPADATA_API_KEY, APIFY_API_TOKEN)
 
 
 
 
 
476
  if not content:
477
+ user_feedback_message = "⚠️ Sorry, I couldn't retrieve the transcript for that YouTube video. It might be disabled or unavailable."
478
+ else:
479
+ user_feedback_message = "⚠️ Couldn't extract a valid YouTube video ID from the link."
480
  else:
481
  logger.info(f"[Task {task_id}] Attempting website scrape for: {url}")
482
  content = await get_website_content_via_requests(url)
483
  if not content and URLTOTEXT_API_KEY:
484
+ logger.info(f"[Task {task_id}] Basic scrape failed or insufficient, trying UrlToText API...")
485
+ await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
486
  content = await get_website_content_via_urltotext_api(url, URLTOTEXT_API_KEY)
487
+
488
+ if not content:
489
+ user_feedback_message = "⚠️ Sorry, I couldn't fetch or extract meaningful content from that website."
490
 
491
  # --- Generate Summary if Content Was Fetched ---
492
  if content:
493
+ logger.info(f"[Task {task_id}] Content fetched (length: {len(content)}). Generating '{summary_type}' summary.")
494
+ await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
 
 
495
 
496
+ final_summary = await generate_summary(content, summary_type, OPENROUTER_API_KEY)
497
+
498
+ if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
499
+ user_feedback_message = f"⚠️ {final_summary}" # Use the error from generate_summary
500
  else:
501
+ success = True # Summary generated successfully
502
+
503
+ # If content fetching failed initially, user_feedback_message is already set
 
 
 
 
 
 
 
 
504
 
505
  except Exception as e:
506
+ logger.error(f"[Task {task_id}] Error during content fetching or summarization: {e}", exc_info=True)
507
  user_feedback_message = "❌ An unexpected error occurred while processing your request."
508
 
509
+ # --- Send Final Result or Error ---
510
+ if success and final_summary:
511
+ # Split summary if it's too long for one message
512
+ max_length = 4096
513
+ summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
514
+
515
+ # Send the first part (or the only part)
516
+ await retry_bot_operation(
517
+ bot.send_message,
518
+ chat_id=chat_id,
519
+ text=summary_parts[0],
520
+ parse_mode=ParseMode.MARKDOWN,
521
+ link_preview_options={'is_disabled': True} # Changed to dict format for PTB v21+
522
+ )
523
+ # Send subsequent parts if any
524
+ for part in summary_parts[1:]:
525
+ await asyncio.sleep(0.5) # Small delay between parts
526
+ await retry_bot_operation(
527
+ bot.send_message,
528
+ chat_id=chat_id,
529
+ text=part,
530
+ parse_mode=ParseMode.MARKDOWN,
531
+ link_preview_options={'is_disabled': True}
532
+ )
533
+ logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
534
+
535
+ elif user_feedback_message: # Handle errors (either from fetching or summarization)
536
+ logger.warning(f"[Task {task_id}] Sending feedback/error message: {user_feedback_message}")
537
+ await retry_bot_operation(
538
+ bot.send_message,
539
+ chat_id=chat_id,
540
+ text=user_feedback_message,
541
+ link_preview_options={'is_disabled': True}
542
+ )
543
+
544
+ else: # Should not happen, but safety net
545
+ logger.error(f"[Task {task_id}] Reached end of task without success or specific error message.")
546
+ await retry_bot_operation(
547
+ bot.send_message,
548
+ chat_id=chat_id,
549
+ text="❓ Something went wrong, but no specific error was identified.",
550
+ link_preview_options={'is_disabled': True}
551
+ )
552
+
553
 
554
  except Exception as e:
555
+ logger.critical(f"[Task {task_id}] Critical error within task processing: {e}", exc_info=True)
556
  try:
557
+ await retry_bot_operation(
558
+ bot.send_message,
559
  chat_id=chat_id,
560
+ text="❌ A critical internal error occurred. Please report this if it persists."
561
  )
562
  except Exception:
563
+ logger.exception(f"[Task {task_id}] Failed even to send critical error message.")
564
  finally:
565
+ # --- Clean up Status Message ---
566
+ if status_message_id:
567
+ try:
568
+ await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=status_message_id)
569
+ logger.debug(f"[Task {task_id}] Deleted status message {status_message_id}")
570
+ except Exception as e:
571
+ logger.warning(f"[Task {task_id}] Failed to delete status message {status_message_id}: {e}")
572
+
 
 
 
 
 
 
 
 
573
  # Ensure bot session is closed
574
+ if bot and bot.session:
575
+ try:
576
+ await bot.shutdown() # Use bot.shutdown() which includes closing the session
577
+ logger.debug(f"[Task {task_id}] Background bot instance shut down.")
578
+ except Exception as e:
579
+ logger.warning(f"[Task {task_id}] Error shutting down background bot instance: {e}")
580
 
581
  logger.info(f"[Task {task_id}] Task completed. Success: {success}")
582
 
583
+
584
  # --- Telegram Bot Handlers ---
585
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
586
  """Handles the /start command."""
587
  user = update.effective_user
588
+ if not user or not update.message: return
589
  logger.info(f"User {user.id} initiated /start.")
590
+ mention = user.mention_html() # mention_html is safer
591
  start_message = (
592
  f"👋 Hello {mention}!\n\n"
593
  "I can summarise YouTube videos or web articles for you.\n\n"
 
599
  async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
600
  """Handles the /help command."""
601
  user = update.effective_user
602
+ if not user or not update.message: return
603
+ logger.info(f"User {user.id} requested /help.")
604
  help_text = (
605
  "**How to Use Me:**\n"
606
  "1. Send me a direct link (URL) to a YouTube video or a web article.\n"
 
609
  "4. I'll fetch the content, summarise it using AI, and send it back to you!\n\n"
610
  "**Important Notes:**\n"
611
  "- **YouTube:** Getting transcripts can sometimes fail if they are disabled or unavailable.\n"
612
+ "- **Websites:** I try my best, but complex or JavaScript-heavy sites might not work perfectly.\n"
613
+ "- **AI Summaries:** The AI aims for accuracy but may occasionally miss nuances or make errors.\n"
614
+ "- **Length Limits:** Very long videos or articles might be truncated before summarization.\n\n"
615
  "Just send a link to get started!"
616
  )
617
+ # Use reply_markdown_v2 for safer Markdown parsing if needed, but MARKDOWN should suffice here
618
  await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
619
 
620
  async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
 
624
  user = update.effective_user
625
  if not user: return
626
 
627
+ # Improved URL regex (simplified for matching, not validation)
628
+ url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
629
  match = re.search(url_pattern, message_text)
630
 
631
  if match:
632
  url = match.group(0)
633
+ # Basic cleanup: remove trailing characters like periods or commas if they likely aren't part of URL
634
+ url = re.sub(r'[.,!?)\]>]+$', '', url)
635
+ logger.info(f"User {user.id} sent potential URL: {url}")
636
+
637
+ # Store URL and the original message ID for potential editing
638
  context.user_data['url_to_summarize'] = url
639
+ context.user_data['original_message_id'] = update.message.message_id # Store for potential edit
640
 
641
  keyboard = [
642
  [
643
+ InlineKeyboardButton("📜 Paragraph", callback_data="paragraph"), # Shortened labels
644
  InlineKeyboardButton("🔹 Bullet Points", callback_data="points")
645
  ]
646
  ]
 
652
  link_preview_options={'is_disabled': True}
653
  )
654
  elif not message_text.startswith('/'):
655
+ # Avoid replying to non-URL, non-command text to prevent noise
656
+ logger.debug(f"User {user.id} sent non-URL, non-command text: '{message_text[:50]}...'")
657
+ # Optional: Send a hint if it looks like they *tried* to send a URL
658
+ if "http" in message_text or "www." in message_text:
659
+ await update.message.reply_text("Hmm, that looks like it might be a link, but please ensure it starts with `http://` or `https://` and is a valid URL.")
660
+ # else: do nothing
661
 
662
  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
663
  """Handles button presses for summary type selection."""
664
  query = update.callback_query
665
+ if not query or not query.message or not query.from_user:
666
+ logger.warning("Callback query received without essential data.")
667
+ if query: await query.answer() # Try to answer anyway
 
 
668
  return
669
 
 
 
670
  user = query.from_user
671
  summary_type = query.data
 
672
  query_id = query.id
673
 
674
+ # --- Acknowledge button press ---
675
+ # This is where the original error occurred. Trying this early.
676
+ try:
677
+ await query.answer()
678
+ logger.debug(f"Acknowledged callback query {query_id} from user {user.id}")
679
+ except Exception as e:
680
+ # Log the error but try to continue, as acknowledging isn't strictly critical
681
+ logger.error(f"Error answering callback query {query_id} from user {user.id}: {e}", exc_info=True)
682
+ # The original NetworkError might still happen here in problematic environments.
683
+ # If it persists, it points to deeper issues with httpx/anyio/uvloop/PTB interaction.
684
+
685
+ # --- Retrieve URL and Original Message ID ---
686
+ url = context.user_data.get('url_to_summarize')
687
+ # Use the message ID from the callback query's message - this IS the message with buttons
688
+ message_id_to_edit = query.message.message_id
689
+
690
+ logger.info(f"User {user.id} chose summary type '{summary_type}' for URL associated with message {message_id_to_edit}")
691
 
692
  if not url:
693
+ logger.warning(f"No URL found in user_data for user {user.id} (callback query {query_id}). Editing message.")
694
  try:
695
+ await query.edit_message_text(text="⚠️ Oops! I couldn't find the link associated with this request. Please send the link again.")
696
  except Exception as e:
697
+ logger.error(f"Failed to edit message to show 'URL not found' error: {e}")
698
  return
699
 
700
+ # --- Clear user_data and schedule task ---
701
  context.user_data.pop('url_to_summarize', None)
702
+ context.user_data.pop('original_message_id', None) # Clear stored ID
703
 
704
+ if not TELEGRAM_TOKEN:
705
+ logger.critical("TELEGRAM_TOKEN is missing, cannot start background task!")
706
+ try:
707
+ await query.edit_message_text(text="❌ Internal configuration error. Cannot process request.")
708
+ except Exception: pass
709
+ return
710
+
711
+ logger.info(f"Scheduling background task for user {user.id}, chat {query.message.chat_id}, message {message_id_to_edit}, type {summary_type}")
712
  asyncio.create_task(
713
  process_summary_task(
714
  user_id=user.id,
715
  chat_id=query.message.chat_id,
716
+ message_id_to_edit=message_id_to_edit, # Pass the ID of the message with buttons
717
  url=url,
718
  summary_type=summary_type,
719
+ bot_token=TELEGRAM_TOKEN # Pass token
720
  ),
721
+ name=f"SummaryTask-{user.id}-{message_id_to_edit}"
722
  )
723
 
724
+
725
  async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
726
  """Log Errors caused by Updates or background tasks."""
727
+ logger.error("Exception while handling an update:", exc_info=context.error)
728
+
729
+ # Optionally, inform the user about backend errors if appropriate
730
+ # Be careful not to spam users if the error is persistent
731
+ # if isinstance(context.error, (NetworkError, TimedOut)):
732
+ # # Potentially inform user about temporary issues
733
+ # pass
734
+ # elif isinstance(context.error, TelegramError):
735
+ # # Handle specific Telegram API errors if needed
736
+ # pass
737
+
738
 
739
  # --- Bot Setup Function ---
740
  async def setup_bot_config() -> Application:
 
743
  if not TELEGRAM_TOKEN:
744
  raise ValueError("TELEGRAM_TOKEN environment variable not set.")
745
 
746
+ # Configure httpx client for PTB (timeouts are important)
747
  custom_request = HTTPXRequest(
748
  connect_timeout=10.0,
749
+ read_timeout=30.0, # Read timeout for API calls like getMe, getUpdates etc.
750
  write_timeout=30.0,
751
+ pool_timeout=60.0, # Allow keeping connections open longer
752
+ http_version="1.1" # Stick to 1.1 for wider compatibility
753
  )
754
 
755
  application = (
756
  Application.builder()
757
  .token(TELEGRAM_TOKEN)
758
  .request(custom_request)
759
+ # Consider connection_pool_size if expecting high concurrency
760
+ # .connection_pool_size(10)
761
  .build()
762
  )
763
 
764
+ # Add handlers
765
  application.add_handler(CommandHandler("start", start))
766
  application.add_handler(CommandHandler("help", help_command))
767
+ # Use TEXT filter combined with URL entity/filter for better targeting?
768
+ # For simplicity, keeping the regex check inside the handler for now.
769
  application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
770
  application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
771
+
772
+ # Add error handler
773
  application.add_error_handler(error_handler)
774
 
775
  logger.info("Telegram application handlers configured.")
776
  return application
777
 
778
+ # --- ASGI Lifespan Context Manager (Mostly Unchanged) ---
779
  @contextlib.asynccontextmanager
780
  async def lifespan(app: Starlette):
781
  """Handles PTB startup and shutdown during ASGI lifespan."""
782
  global ptb_app
783
  logger.info("ASGI Lifespan: Startup sequence initiated...")
784
 
785
+ if not TELEGRAM_TOKEN:
786
+ logger.critical("TELEGRAM_TOKEN is not set. Bot cannot start.")
787
+ raise RuntimeError("Telegram token missing.")
788
+
789
+ bot_info_text = "Bot info not available yet."
790
  try:
791
  ptb_app = await setup_bot_config()
792
+ await ptb_app.initialize() # Initialize application resources
 
793
 
794
  bot_info = await ptb_app.bot.get_me()
795
+ bot_info_text = f"@{bot_info.username} (ID: {bot_info.id})"
796
+ logger.info(f"Bot initialized: {bot_info_text}")
797
 
798
+ # Check for existing webhook and potentially delete it before setting a new one
799
+ current_webhook_info = await ptb_app.bot.get_webhook_info()
800
+ if current_webhook_info and current_webhook_info.url:
801
+ logger.info(f"Found existing webhook: {current_webhook_info.url}. Attempting to delete it.")
 
 
 
 
 
802
  try:
803
+ await ptb_app.bot.delete_webhook(drop_pending_updates=True)
804
+ logger.info("Existing webhook deleted successfully.")
 
 
 
 
 
805
  except Exception as e:
806
+ logger.warning(f"Could not delete existing webhook: {e}")
807
+ await asyncio.sleep(1) # Short pause before setting new one
808
+
809
+
810
+ # Determine Webhook URL from Hugging Face environment variables
811
+ space_host = os.environ.get("SPACE_HOST")
812
+ webhook_path = "/webhook" # Keep it simple
813
+ full_webhook_url = None
814
+
815
+ if space_host:
816
+ # Ensure it starts with https://
817
+ if not space_host.startswith("https://"):
818
+ # HF usually provides full host like 'user-repo.hf.space'
819
+ # Prepend https:// if missing
820
+ if not space_host.startswith("http"):
821
+ full_webhook_url = f"https://{space_host.rstrip('/')}{webhook_path}"
822
+ else: # If it starts with http://, replace it (HF uses https)
823
+ full_webhook_url = f"https://{space_host.split('://')[1].rstrip('/')}{webhook_path}"
824
+
825
+ if full_webhook_url:
826
+ logger.info(f"Attempting to set webhook to: {full_webhook_url}")
827
+ await asyncio.sleep(2.0) # Give network time to settle
828
+ try:
829
+ await ptb_app.bot.set_webhook(
830
+ url=full_webhook_url,
831
+ allowed_updates=Update.ALL_TYPES, # Or specify types like ['message', 'callback_query']
832
+ drop_pending_updates=True,
833
+ secret_token=os.environ.get("WEBHOOK_SECRET") # Optional: Add a secret token for security
834
+ )
835
+ webhook_info = await ptb_app.bot.get_webhook_info()
836
+ logger.info(f"Webhook successfully set: URL='{webhook_info.url}', Pending={webhook_info.pending_update_count}")
837
+ # Start PTB's internal runners *after* setting the webhook
838
+ await ptb_app.start()
839
+ logger.info("PTB Application started (webhook mode). Ready for updates.")
840
+ except Exception as e:
841
+ logger.error(f"FATAL: Failed to set webhook to {full_webhook_url}: {e}", exc_info=True)
842
+ # Decide how to handle failure: maybe try polling or just exit?
843
+ # For HF Spaces webhook is preferred. Raising error might be best.
844
+ raise RuntimeError(f"Failed to set webhook: {e}") from e
845
+ else:
846
+ logger.warning("Could not construct valid HTTPS webhook URL from SPACE_HOST.")
847
+ # Fallback or error needed here? For HF, webhook is essential.
848
+ raise RuntimeError("Webhook URL could not be determined.")
849
+ else:
850
+ logger.warning("SPACE_HOST environment variable not found. Cannot set webhook.")
851
+ # Maybe fall back to polling for local dev? Not ideal for HF.
852
+ # logger.info("Starting PTB in polling mode (no SPACE_HOST found).")
853
+ # await ptb_app.start() # Start runners
854
+ # application.run_polling(drop_pending_updates=True) # Start polling - BLOCKING! Not suitable for ASGI app.
855
+ raise RuntimeError("SPACE_HOST env var missing, cannot run in webhook mode.")
856
+
857
 
858
  logger.info("ASGI Lifespan: Startup complete.")
859
+ yield # Application runs here
860
 
861
  except Exception as startup_err:
862
+ logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
863
+ # Ensure cleanup happens even if startup fails partially
864
+ if ptb_app and ptb_app.running:
865
+ await ptb_app.stop()
866
+ if ptb_app:
867
+ await ptb_app.shutdown()
868
+ raise # Reraise the error to stop the ASGI server
869
  finally:
870
  logger.info("ASGI Lifespan: Shutdown sequence initiated...")
871
  if ptb_app:
872
+ if ptb_app.running:
873
+ logger.info("Stopping PTB application...")
874
+ await ptb_app.stop()
875
+ logger.info("Shutting down PTB application...")
876
+ await ptb_app.shutdown()
877
+ logger.info("PTB Application shut down gracefully.")
878
+ else:
879
+ logger.info("PTB application was not initialized or startup failed.")
880
  logger.info("ASGI Lifespan: Shutdown complete.")
881
 
 
 
882
 
883
+ # --- Starlette Route Handlers ---
884
+ async def health_check(request: Request) -> PlainTextResponse:
885
  """Basic health check endpoint."""
886
+ bot_status = "Not Initialized"
887
  if ptb_app and ptb_app.bot:
888
+ try:
889
+ # Perform a lightweight check like get_me again, or use a flag
890
+ if ptb_app.running:
891
+ bot_info = await ptb_app.bot.get_me() # Cached usually
892
+ bot_status = f"Running (@{bot_info.username})"
893
+ else:
894
+ bot_status = "Initialized but not running"
895
+ except Exception as e:
896
+ bot_status = f"Error checking status: {e}"
897
+
898
+ return PlainTextResponse(f"Telegram Bot Summarizer - Status: {bot_status}")
899
 
900
+ async def telegram_webhook(request: Request) -> Response:
 
901
  """Webhook endpoint called by Telegram."""
902
  if not ptb_app:
903
+ logger.error("Webhook received but PTB application not initialized.")
904
+ return PlainTextResponse('Bot not initialized', status_code=503)
905
+ if not ptb_app.running:
906
+ logger.warning("Webhook received but PTB application not running.")
907
+ # Maybe return 200 OK anyway so Telegram doesn't retry too much?
908
+ # Or 503 Service Unavailable
909
+ return PlainTextResponse('Bot initialized but not running', status_code=503)
910
 
911
  try:
912
+ # Check secret token if configured
913
+ secret_token = os.environ.get("WEBHOOK_SECRET")
914
+ if secret_token:
915
+ if request.headers.get("X-Telegram-Bot-Api-Secret-Token") != secret_token:
916
+ logger.warning("Webhook received with invalid secret token.")
917
+ return Response(status_code=403) # Forbidden
918
+
919
+ # Process the update
920
+ update_data = await request.json()
921
+ update = Update.de_json(data=update_data, bot=ptb_app.bot)
922
+ logger.debug(f"Processing update_id: {update.update_id}")
923
  await ptb_app.process_update(update)
924
+ return Response(status_code=200) # OK
925
 
926
+ except json.JSONDecodeError:
927
+ logger.error("Webhook received invalid JSON.")
928
+ return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
929
  except Exception as e:
930
+ logger.error(f"Error processing webhook update: {e}", exc_info=True)
931
+ # Return 200 OK even on errors to prevent Telegram retries if the error is in processing logic
932
+ # Alternatively return 500 if it's a server-side issue. 200 is often safer for webhooks.
933
+ return Response(status_code=200) # OK despite error
934
 
935
  # --- Create Starlette ASGI Application ---
936
  app = Starlette(
937
+ debug=False, # Set debug based on environment if needed
938
  lifespan=lifespan,
939
  routes=[
940
+ Route("/", endpoint=health_check, methods=["GET"]),
941
+ Route("/webhook", endpoint=telegram_webhook, methods=["POST"]),
942
  ]
943
  )
944
+ logger.info("Starlette ASGI application created with native routes.")
945
 
946
+ # --- Development Server Execution Block (Optional) ---
947
+ # This block is generally NOT used when deploying with Gunicorn/Uvicorn
948
+ # Keep it for potential local testing without gunicorn
949
  if __name__ == '__main__':
950
+ import uvicorn
951
+ logger.warning("Running in development mode using Uvicorn directly (not for production)")
952
+ local_port = int(os.environ.get('PORT', 8080)) # Use PORT if set, else 8080
953
+ uvicorn.run(app, host='0.0.0.0', port=local_port, log_level="info")