Update main.py
main.py CHANGED
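This revision's headline change ("Increased Pool/Timeouts + Robust Callback Handling") configures python-telegram-bot's HTTPXRequest with more generous timeouts and a larger connection pool, and adds explicit TimedOut handling around the callback-query flow. The new setup_bot_config() is not fully visible in this view, so the following is only an illustrative sketch of the pool/timeout idea; the concrete values are assumptions, not the committed code. (The new file also imports httpx for httpx.Limits, whose exact use falls outside the visible hunks.)

# Illustrative sketch only - the values below are assumed, not taken from this commit.
from telegram.ext import Application
from telegram.request import HTTPXRequest

def build_application(token: str) -> Application:
    # A larger pool and longer timeouts than the PTB defaults, which is what the
    # commit title ("Increased Pool/Timeouts") points at.
    custom_request = HTTPXRequest(
        connection_pool_size=20,  # assumed value
        connect_timeout=10.0,     # assumed value
        read_timeout=30.0,        # assumed value
        write_timeout=30.0,       # assumed value
        pool_timeout=10.0,        # assumed value
    )
    return Application.builder().token(token).request(custom_request).build()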
Old-file lines removed by this commit, as far as this view preserves them (unchanged context lines appear in the new file below):

@@ -1,4 +1,4 @@
    1 - # main.py (Revised:
@@ -24,11 +24,11 @@ from telegram.ext import (
   27 - from telegram.error import NetworkError, RetryAfter # Import
@@ -43,6 +43,7 @@ logging.basicConfig(
@@ -51,6 +52,7 @@ logging.getLogger("urllib3").setLevel(logging.INFO)
@@ -60,7 +62,7 @@ ptb_app: Application | None = None
   63 - logger.debug(f"Attempting to read secret: {secret_name}")
@@ -79,7 +81,6 @@ logger.info("Secret loading attempt finished.")
   82 - # Ensure the generate_summary has the updated prompts from previous response
@@ -111,7 +112,8 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
@@ -253,268 +255,625 @@ async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: st
  256 - elif "No transcript found" in str(e): logger.warning("[Primary YT] No transcript in languages.")
  257 - elif "TranscriptsDisabled" in str(e) or "disabled" in str(e): logger.warning("[Primary YT] Transcripts disabled.")
  258 - transcript_text = None
  265 - else: logger.warning("[Fallback YT 1] Supadata failed or no content.")
  266 - else: logger.warning("[Fallback YT 1] Supadata key not available.")
  273 - else: logger.warning("[Fallback YT 2] Apify failed or no content.")
  274 - else: logger.warning("[Fallback YT 2] Apify token not available.")
  276 - if transcript_text is None: logger.error(f"All methods failed for video ID: {video_id}")
  282 - if not url: logger.error("[Web Scraper - Requests/BS4] no URL"); return None
  283 - logger.info(f"[Web Scraper - Requests/BS4]
  287 - response.raise_for_status()
  291 - logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML: {content_type}.
  298 - if not target_element:
  300 - text = "\n".join(lines)
  306 - except requests.exceptions.
  307 - except requests.exceptions.
  308 - except
  313 - if not url: logger.error("[Web Scraper - URLToText API] no URL"); return None
  314 - if not api_key: logger.error("[Web Scraper - URLToText API] API key missing."); return None
  315 - logger.info(f"[Web Scraper - URLToText API] Attempting fetch: {url}")
  320 - response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=
  321 - logger.debug(f"[Web Scraper - URLToText API]
  346 - if not api_key: logger.error("OpenRouter API key missing."); return "Error: AI
  347 - if not text: logger.warning("generate_summary called with empty text."); return "Error: No content to summarize."
  353 - "You are an AI
  365 - "You are an AI
  376 - logger.error(f"Invalid summary_type '{summary_type}'.")
  377 - return f"Error: Invalid summary type ('{summary_type}')."
  390 - logger.debug(f"Sending request to OpenRouter (Model: {model_name})
  437 - message_text = update.message.text.strip()
  441 - url = match.group(0)
  455 - logger.warning(f"User {user.id} pressed button, NO URL in context.")
  456 - try:
  464 - logger.error("OpenRouter key missing.")
  465 - try:
  481 - logger.info(f"Fetching
  489 - logger.warning(f"
  496 - logger.info("Content
  517 - # --- Bot Setup Function (Modified:
@@ -522,44 +881,58 @@ async def setup_bot_config() -> Application:
  525 - # --- Configure HTTPX client settings
  533 - logger.info(f"Creating PTB HTTPXRequest with
  538 - # connection_pool_size default is 10, which is usually fine.
  542 - write_timeout=write_timeout,
  555 - # --- Register Handlers
  562 - logger.info("Telegram handlers configured.")
@@ -567,111 +940,198 @@ async def setup_bot_config() -> Application:
  570 - logger.info("ASGI Lifespan: Startup
  571 - loop = asyncio.get_running_loop()
  575 - logger.info("PTB
  576 - await ptb_app.initialize()
  577 - logger.info("PTB
  584 - webhook_path = "/webhook"
  589 - await
  595 - logger.error(f"Failed to set webhook: {e}", exc_info=True)
  596 - else:
  598 - logger.info("ASGI Lifespan: Startup complete. Application ready.")
  599 - yield # Application runs here
  602 - logger.critical(f"CRITICAL ERROR during ASGI startup: {startup_err}", exc_info=True)
  620 - # --- Flask App Setup (for
  622 - logger.info("Core Flask app instance created (for routing
  624 - # --- Define Flask Routes
  628 - logger.debug("Health check '/' accessed.")
  629 - bot_status = "
  630 - if ptb_app
  636 - """Webhook endpoint
  638 - logger.error("Webhook triggered, but PTB Application instance (ptb_app) is None.")
  665 - logger.info("Starlette application created with lifespan and Flask app mounted at '/'.")
  670 - logger.warning("
  671 - logger.warning("
  672 - logger.warning("
  676 - logger.info(f"Flask
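For orientation, the removed lines above come from a deployment layout in which a small Flask app (a health-check route plus the /webhook route) is mounted into a Starlette application whose lifespan hook initialises and starts the PTB Application and registers the Telegram webhook. A minimal sketch of that wiring follows; apart from the names visible in the diff (Starlette, Mount, WSGIMiddleware, flask_core_app, the lifespan hook and the "/webhook" path), everything in it is assumed.

# Minimal sketch of the Starlette/Flask/PTB layout suggested by the removed lines above.
# Only the pieces named in the diff are taken from it; the rest is illustrative.
from contextlib import asynccontextmanager
from flask import Flask
from starlette.applications import Starlette
from starlette.middleware.wsgi import WSGIMiddleware
from starlette.routing import Mount

flask_core_app = Flask(__name__)

@asynccontextmanager
async def lifespan(app: Starlette):
    # Startup (assumed): build the PTB Application, await initialize()/start(),
    # then register the webhook at "<SPACE_HOST>/webhook".
    yield
    # Shutdown (assumed): stop and shut down the PTB Application.

app = Starlette(
    lifespan=lifespan,
    routes=[Mount("/", app=WSGIMiddleware(flask_core_app))],
)

The new file's version of this section continues below.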
1 |
+
# main.py (Revised: Increased Pool/Timeouts + Robust Callback Handling)
|
2 |
import os
|
3 |
import re
|
4 |
import logging
|
|
|
24 |
CallbackQueryHandler,
|
25 |
)
|
26 |
from telegram.constants import ParseMode
|
27 |
+
from telegram.error import NetworkError, RetryAfter, TimedOut # Import TimedOut
|
28 |
from telegram.request import HTTPXRequest # Import the request class
|
29 |
|
30 |
# --- Other Libraries ---
|
31 |
+
import httpx # <<<--- ADDED IMPORT for httpx.Limits
|
32 |
from youtube_transcript_api import YouTubeTranscriptApi
|
33 |
import requests
|
34 |
from bs4 import BeautifulSoup
|
|
|
43 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
44 |
level=logging.DEBUG
|
45 |
)
|
46 |
+
# Reduce log spam from libraries
|
47 |
logging.getLogger("httpx").setLevel(logging.WARNING)
|
48 |
if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
|
49 |
logging.getLogger("telegram.ext").setLevel(logging.INFO)
|
|
|
52 |
logging.getLogger('gunicorn.error').setLevel(logging.INFO)
|
53 |
logging.getLogger('uvicorn').setLevel(logging.INFO)
|
54 |
logging.getLogger('starlette').setLevel(logging.INFO)
|
55 |
+
# Keep our app logger at DEBUG
|
56 |
logger = logging.getLogger(__name__)
|
57 |
logger.info("Logging configured.")
|
58 |
|
|
|
62 |
# --- Environment Variable Loading ---
|
63 |
logger.info("Attempting to load secrets...")
|
64 |
def get_secret(secret_name):
|
65 |
+
# logger.debug(f"Attempting to read secret: {secret_name}") # Optional: Less verbose startup
|
66 |
value = os.environ.get(secret_name)
|
67 |
if value: logger.info(f"Secret '{secret_name}': Found (Value length: {len(value)})")
|
68 |
else: logger.warning(f"Secret '{secret_name}': Not Found")
|
|
|
81 |
# get_transcript_via_supadata, get_transcript_via_apify,
|
82 |
# get_youtube_transcript, get_website_content_via_requests,
|
83 |
# get_website_content_via_urltotext_api, generate_summary)
|
|
|
84 |
|
85 |
# Helper Functions
|
86 |
def is_youtube_url(url):
|
|
|
112 |
params = {"videoId": video_id, "format": "text"}
|
113 |
headers = {"X-API-Key": api_key}
|
114 |
try:
|
115 |
+
# Consider removing verify=False if possible, or manage certificates properly
|
116 |
+
logger.warning("[Supadata] Making request with verify=False (Attempting to bypass SSL verification - Potential Security Risk)")
|
117 |
response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30, verify=False)
|
118 |
logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
|
119 |
if response.status_code == 200:
|
|
|
255 |
except Exception as e:
|
256 |
logger.warning(f"[Primary YT] Error via library: {type(e).__name__} - {e}")
|
257 |
if "YouTube is blocking requests" in str(e) or "HTTP Error 429" in str(e): logger.warning("[Primary YT] IP likely blocked.")
|
258 |
+
elif "No transcript found" in str(e): logger.warning("[Primary YT] No transcript in specified languages.")
|
259 |
+
elif "TranscriptsDisabled" in str(e) or "disabled" in str(e): logger.warning("[Primary YT] Transcripts disabled for this video.")
|
260 |
+
transcript_text = None # Ensure it's None on error
|
261 |
|
262 |
if transcript_text is None: # Fallback 1: Supadata
|
263 |
logger.info("[Fallback YT 1] Trying Supadata API...")
|
264 |
if supadata_key:
|
265 |
transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
|
266 |
if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata. Length: {len(transcript_text)}"); return transcript_text
|
267 |
+
else: logger.warning("[Fallback YT 1] Supadata failed or no content found.")
|
268 |
+
else: logger.warning("[Fallback YT 1] Supadata key not available, skipping.")
|
269 |
|
270 |
if transcript_text is None: # Fallback 2: Apify
|
271 |
logger.info("[Fallback YT 2] Trying Apify API...")
|
272 |
if apify_token:
|
273 |
transcript_text = await get_transcript_via_apify(video_url, apify_token)
|
274 |
if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify. Length: {len(transcript_text)}"); return transcript_text
|
275 |
+
else: logger.warning("[Fallback YT 2] Apify failed or no content found.")
|
276 |
+
else: logger.warning("[Fallback YT 2] Apify token not available, skipping.")
|
277 |
|
278 |
+
if transcript_text is None: logger.error(f"All methods failed to fetch transcript for video ID: {video_id}")
|
279 |
return transcript_text
|
280 |
|
281 |
# Website Content via Requests/BS4
|
282 |
async def get_website_content_via_requests(url):
|
283 |
"""Attempts to scrape website content using requests/BeautifulSoup."""
|
284 |
+
if not url: logger.error("[Web Scraper - Requests/BS4] get_website_content_via_requests called with no URL"); return None
|
285 |
+
logger.info(f"[Web Scraper - Requests/BS4] Attempting fetch: {url}")
|
286 |
try:
|
287 |
+
# Standard headers, avoid overly aggressive scraping patterns
|
288 |
+
headers = {
|
289 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36', # Updated UA
|
290 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
|
291 |
+
'Accept-Language': 'en-US,en;q=0.9',
|
292 |
+
'Connection': 'keep-alive',
|
293 |
+
'DNT': '1', # Do Not Track header
|
294 |
+
'Upgrade-Insecure-Requests': '1'
|
295 |
+
}
|
296 |
response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
|
297 |
+
response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
|
298 |
logger.debug(f"[Web Scraper - Requests/BS4] Status {response.status_code} for {url}")
|
299 |
+
|
300 |
content_type = response.headers.get('content-type', '').lower()
|
301 |
if 'html' not in content_type:
|
302 |
+
logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received: {content_type}. Attempting plain text extraction.")
|
303 |
+
# Allow plain text only if explicitly text/plain
|
304 |
+
if 'text/plain' in content_type and response.text:
|
305 |
+
logger.info(f"[Web Scraper - Requests/BS4] Extracted plain text content. Length: {len(response.text.strip())}")
|
306 |
+
return response.text.strip()
|
307 |
+
logger.warning(f"[Web Scraper - Requests/BS4] Content type '{content_type}' not suitable for parsing. Aborting.")
|
308 |
return None
|
309 |
+
|
310 |
soup = BeautifulSoup(response.text, 'html.parser')
|
311 |
+
|
312 |
+
# Remove common non-content tags more aggressively
|
313 |
+
tags_to_remove = ["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio", "picture", "source"]
|
314 |
+
# Also remove elements often used for ads or menus by class/id
|
315 |
+
selectors_to_remove = ['.ad', '#ad', '.ads', '#ads', '.advertisement', '#advertisement', '.banner', '#banner', '.menu', '#menu', '.navigation', '#navigation', '.sidebar', '#sidebar', '.social', '#social', '.share', '#share', '.related', '#related', '.comments', '#comments', '.cookie-consent', '#cookie-consent']
|
316 |
+
|
317 |
+
for tag in soup(tags_to_remove): tag.decompose()
|
318 |
+
for selector in selectors_to_remove:
|
319 |
+
for element in soup.select(selector): element.decompose()
|
320 |
+
|
321 |
+
# Try to find semantic main content areas first
|
322 |
+
main_content = soup.find('main') or \
|
323 |
+
soup.find('article') or \
|
324 |
+
soup.find(id='content') or \
|
325 |
+
soup.find(class_='content') or \
|
326 |
+
soup.find(id='main-content') or \
|
327 |
+
soup.find(class_='main-content') or \
|
328 |
+
soup.find(role='main')
|
329 |
+
|
330 |
target_element = main_content if main_content else soup.body
|
331 |
+
if not target_element:
|
332 |
+
logger.warning(f"[Web Scraper - Requests/BS4] Could not find a suitable target element (main, article, body) for {url}");
|
333 |
+
return None
|
334 |
+
|
335 |
+
# Extract text, attempting to preserve paragraphs better
|
336 |
lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
|
337 |
+
text = "\n\n".join(lines) # Join lines with double newline for paragraph separation
|
338 |
+
|
339 |
+
MIN_TEXT_LENGTH = 100 # Increased minimum length
|
340 |
+
if not text or len(text) < MIN_TEXT_LENGTH:
|
341 |
+
logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is too short (<{MIN_TEXT_LENGTH} chars) after cleaning for {url}. Length: {len(text)}. Content might be JS-rendered or blocked.")
|
342 |
+
# Optional: Log the short text for debugging: logger.debug(f"Short text: {text[:500]}")
|
343 |
+
return None # Treat very short text as failure
|
344 |
+
|
345 |
+
logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped and cleaned content from {url}. Final Length: {len(text)}")
|
346 |
return text
|
347 |
+
|
348 |
+
except requests.exceptions.Timeout: logger.error(f"[Web Scraper - Requests/BS4] Timeout error fetching {url}"); return None
|
349 |
+
except requests.exceptions.TooManyRedirects: logger.error(f"[Web Scraper - Requests/BS4] Too many redirects error for {url}"); return None
|
350 |
+
except requests.exceptions.HTTPError as e: logger.error(f"[Web Scraper - Requests/BS4] HTTP error {e.response.status_code} for {url}"); return None
|
351 |
+
except requests.exceptions.RequestException as e: logger.error(f"[Web Scraper - Requests/BS4] General request error for {url}: {e}"); return None
|
352 |
+
except Exception as e: logger.error(f"[Web Scraper - Requests/BS4] Error during parsing or processing {url}: {e}", exc_info=True); return None
|
353 |
|
354 |
# Website Content via URLToText API
|
355 |
async def get_website_content_via_urltotext_api(url: str, api_key: str):
|
356 |
"""Fetches website content using the URLToText API."""
|
357 |
+
if not url: logger.error("[Web Scraper - URLToText API] get_website_content_via_urltotext_api called with no URL"); return None
|
358 |
+
if not api_key: logger.error("[Web Scraper - URLToText API] API key is missing."); return None
|
359 |
+
logger.info(f"[Web Scraper - URLToText API] Attempting fetch via API: {url}")
|
360 |
api_endpoint = "https://urltotext.com/api/v1/urltotext/"
|
361 |
+
# Ensure payload includes options beneficial for scraping modern sites
|
362 |
+
payload = json.dumps({
|
363 |
+
"url": url,
|
364 |
+
"output_format": "text",
|
365 |
+
"extract_main_content": True, # Try to get just the core article/content
|
366 |
+
"render_javascript": True, # Crucial for JS-heavy sites
|
367 |
+
"residential_proxy": False, # Set to True if facing blocks, requires appropriate plan
|
368 |
+
"timeout_render": 20000, # Increase JS render timeout (in ms)
|
369 |
+
})
|
370 |
headers = {"Authorization": f"Token {api_key}", "Content-Type": "application/json"}
|
371 |
try:
|
372 |
+
response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=60) # Increased overall timeout
|
373 |
+
logger.debug(f"[Web Scraper - URLToText API] Received status code {response.status_code} for {url}")
|
374 |
if response.status_code == 200:
|
375 |
try:
|
376 |
data = response.json()
|
377 |
+
content_data = data.get("data", {})
|
378 |
+
content = content_data.get("content")
|
379 |
+
credits = data.get("credits_used", "N/A")
|
380 |
+
warning = content_data.get("warning")
|
381 |
+
error_msg = content_data.get("error") # Check for specific error in response data
|
382 |
+
|
383 |
+
if warning: logger.warning(f"[Web Scraper - URLToText API] API Warning for {url}: {warning}")
|
384 |
+
if error_msg: logger.error(f"[Web Scraper - URLToText API] API Error reported for {url}: {error_msg}"); return None # Treat API error as failure
|
385 |
+
|
386 |
+
if content and isinstance(content, str):
|
387 |
+
logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API. Length: {len(content.strip())}. Credits Used: {credits}");
|
388 |
+
return content.strip()
|
389 |
+
else:
|
390 |
+
logger.warning(f"[Web Scraper - URLToText API] API returned status 200 but content is empty or invalid for {url}. Response: {data}");
|
391 |
+
return None
|
392 |
+
except json.JSONDecodeError: logger.error(f"[Web Scraper - URLToText API] Failed to decode JSON response from API. Status: {response.status_code}. Response Text: {response.text[:500]}..."); return None
|
393 |
+
except Exception as e: logger.error(f"[Web Scraper - URLToText API] Error processing successful API response: {e}", exc_info=True); return None
|
394 |
+
elif response.status_code == 400: logger.error(f"[Web Scraper - URLToText API] Bad Request (400) to API. Check payload/URL. Response: {response.text[:200]}...")
|
395 |
+
elif response.status_code == 401: logger.error(f"[Web Scraper - URLToText API] Unauthorized (401). Check API Key. Response: {response.text[:200]}...")
|
396 |
+
elif response.status_code == 402: logger.error(f"[Web Scraper - URLToText API] Payment Required (402). Check API credits/plan. Response: {response.text[:200]}...")
|
397 |
+
elif response.status_code == 422: logger.warning(f"[Web Scraper - URLToText API] Unprocessable URL / Fetch Error (422) reported by API for {url}. Response: {response.text[:200]}...") # Might mean the site blocked the API
|
398 |
+
elif response.status_code == 429: logger.warning(f"[Web Scraper - URLToText API] Rate Limit Hit (429). Response: {response.text[:200]}...")
|
399 |
+
elif response.status_code >= 500: logger.error(f"[Web Scraper - URLToText API] API Server Error ({response.status_code}). Response: {response.text[:200]}...")
|
400 |
+
else: logger.error(f"[Web Scraper - URLToText API] Unexpected status code {response.status_code} from API. Response: {response.text[:200]}...")
|
401 |
+
return None # Return None for all non-200 responses after logging
|
402 |
+
except requests.exceptions.Timeout: logger.error(f"[Web Scraper - URLToText API] Timeout connecting to API for {url}"); return None
|
403 |
+
except requests.exceptions.RequestException as e: logger.error(f"[Web Scraper - URLToText API] Request error connecting to API: {e}"); return None
|
404 |
+
except Exception as e: logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call: {e}", exc_info=True); return None
|
405 |
|
406 |
# DeepSeek Summary Function (with updated prompts)
|
407 |
async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
|
408 |
"""Generates summary using DeepSeek via OpenRouter API."""
|
409 |
logger.info(f"Generating '{summary_type}' summary. Input length: {len(text)}")
|
410 |
+
if not api_key: logger.error("OpenRouter API key missing."); return "Error: AI service configuration key is missing."
|
411 |
+
if not text or not text.strip(): logger.warning("generate_summary called with empty or whitespace-only text."); return "Error: No content was provided to summarize."
|
412 |
+
|
413 |
+
openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
|
414 |
+
# Consider using a non-free model if rate limits are hit or quality needed
|
415 |
+
model_name = "deepseek/deepseek-chat:free"
|
416 |
+
# model_name = "openai/gpt-3.5-turbo" # Example alternative
|
417 |
|
418 |
# --- UPDATED PROMPTS ---
|
419 |
if summary_type == "paragraph":
|
420 |
+
system_message = (
|
421 |
+
"You are an expert summarization AI. Your goal is to provide a concise, easy-to-understand summary of the provided text. "
|
422 |
+
"Follow these instructions precisely:\n"
|
423 |
+
"1. **Language and Spelling:** Use simple British English. Ensure all spellings conform to British English (e.g., 'summarise', 'centre', 'realise').\n"
|
424 |
+
"2. **Clarity:** Write clearly so someone unfamiliar with the topic can understand.\n"
|
425 |
+
"3. **Format:** Output a single paragraph only.\n"
|
426 |
+
"4. **Conciseness:** The summary must be **no more than 85 words** long.\n"
|
427 |
+
"5. **Completeness:** Cover the main points from the entire text, not just the start.\n"
|
428 |
+
"6. **Punctuation:** Do NOT use em dashes (– or —). Use semicolons (;) if needed for complex sentence structure, but prefer simpler sentences.\n"
|
429 |
+
"7. **Tone:** Maintain a neutral and informative tone.\n"
|
430 |
+
"8. **Focus:** Extract factual information and key topics. Do not add opinions or information not present in the text."
|
431 |
)
|
432 |
+
user_prompt_instruction = "Summarize the following text into a single paragraph adhering strictly to the rules outlined in the system message:"
|
433 |
+
|
434 |
elif summary_type == "points":
|
435 |
+
system_message = (
|
436 |
+
"You are an expert summarization AI. Your goal is to extract the key points from the provided text and present them as a bulleted list. "
|
437 |
+
"Follow these instructions precisely:\n"
|
438 |
+
"1. **Language and Spelling:** Use simple British English. Ensure all spellings conform to British English (e.g., 'summarise', 'centre', 'realise').\n"
|
439 |
+
"2. **Clarity:** Write clearly so someone unfamiliar with the topic can understand.\n"
|
440 |
+
"3. **Format:** Output as a bulleted list. Start each point with a standard bullet character ('*' or '-'). Each point should be distinct and on a new line.\n"
|
441 |
+
"4. **Content:** Each bullet point should represent a single key finding, main topic, or significant piece of information from the text.\n"
|
442 |
+
"5. **Conciseness:** Keep each bullet point brief and to the point.\n"
|
443 |
+
"6. **Completeness:** Cover the main points from the entire text, not just the start.\n"
|
444 |
+
"7. **Punctuation:** Do NOT use em dashes (– or —) within bullet points.\n"
|
445 |
+
"8. **Tone:** Maintain a neutral and informative tone.\n"
|
446 |
+
"9. **Focus:** Extract factual information and key topics. Do not add opinions or information not present in the text."
|
447 |
)
|
448 |
+
user_prompt_instruction = "Summarize the following text into a bulleted list adhering strictly to the rules outlined in the system message:"
|
449 |
else:
|
450 |
+
logger.error(f"Invalid summary_type '{summary_type}' requested.")
|
451 |
+
return f"Error: Invalid summary type ('{summary_type}') requested. Please choose 'paragraph' or 'points'."
|
452 |
# --- END UPDATED PROMPTS ---
|
453 |
|
454 |
+
# Practical limit for API context window / cost control
|
455 |
+
# Deepseek context might be larger, but set a reasonable app limit
|
456 |
+
MAX_INPUT_TOKENS_ESTIMATE = 28000 # Rough estimate for deepseek-chat's context limit (aim lower than max)
|
457 |
+
# Simple character length heuristic (adjust based on typical content)
|
458 |
+
AVG_CHARS_PER_TOKEN = 4
|
459 |
+
MAX_INPUT_LENGTH = MAX_INPUT_TOKENS_ESTIMATE * AVG_CHARS_PER_TOKEN
|
460 |
+
|
461 |
+
if len(text) > MAX_INPUT_LENGTH:
|
462 |
+
logger.warning(f"Input text length ({len(text)} chars) exceeds estimated limit ({MAX_INPUT_LENGTH}). Truncating.")
|
463 |
+
truncation_marker = "\n\n[... Text truncated due to length ...]"
|
464 |
+
text = text[:MAX_INPUT_LENGTH - len(truncation_marker)] + truncation_marker
|
465 |
+
|
466 |
+
# Construct the messages payload for the API
|
467 |
+
messages = [
|
468 |
+
{"role": "system", "content": system_message},
|
469 |
+
{"role": "user", "content": f"{user_prompt_instruction}\n\n--- TEXT TO SUMMARIZE ---\n\n{text}\n\n--- END OF TEXT ---"}
|
470 |
+
]
|
471 |
|
472 |
+
# Referer and Title for OpenRouter identification
|
473 |
+
space_host = os.environ.get("SPACE_HOST", "huggingface.co/spaces/YOUR_SPACE_NAME") # Replace default if needed
|
474 |
+
referer_url = f"https://{space_host}" if space_host and not space_host.startswith("http") else space_host or "https://huggingface.co"
|
475 |
+
headers = {
|
476 |
+
"Authorization": f"Bearer {api_key}",
|
477 |
+
"Content-Type": "application/json",
|
478 |
+
"HTTP-Referer": referer_url,
|
479 |
+
"X-Title": "Telegram URL Summarizer Bot" # Or your bot's name
|
480 |
+
}
|
481 |
+
payload = json.dumps({"model": model_name, "messages": messages})
|
482 |
|
483 |
try:
|
484 |
+
logger.debug(f"Sending request to OpenRouter (Model: {model_name}). Prompt length approx: {len(text)} chars.")
|
485 |
+
# Increased timeout for potentially long AI generation
|
486 |
+
response = await asyncio.to_thread(requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=120)
|
487 |
logger.debug(f"Received status {response.status_code} from OpenRouter.")
|
488 |
+
|
489 |
if response.status_code == 200:
|
490 |
try:
|
491 |
data = response.json()
|
492 |
+
# Check for response structure variations
|
493 |
+
choice = data.get("choices", [{}])[0]
|
494 |
+
message = choice.get("message", {})
|
495 |
+
summary = message.get("content")
|
496 |
+
finish_reason = choice.get("finish_reason")
|
497 |
+
|
498 |
+
if summary and isinstance(summary, str) and summary.strip():
|
499 |
+
summary = summary.strip()
|
500 |
+
logger.info(f"Successfully generated summary. Finish Reason: {finish_reason}. Length: {len(summary)}")
|
501 |
+
# Optional: Add post-processing checks (e.g., length for paragraph)
|
502 |
+
if summary_type == "paragraph" and len(summary.split()) > 95: # Allow slight overrun from 85 words
|
503 |
+
logger.warning(f"Generated paragraph summary slightly longer than target word count ({len(summary.split())} words).")
|
504 |
+
return summary
|
505 |
+
else:
|
506 |
+
logger.warning(f"OpenRouter returned status 200 but summary content is missing or empty. Response data: {data}")
|
507 |
+
return "Sorry, the AI model returned an empty summary. The content might have been unsuitable."
|
508 |
+
|
509 |
+
except (json.JSONDecodeError, IndexError, KeyError, AttributeError) as e:
|
510 |
+
logger.error(f"Failed to parse successful (200) response from OpenRouter. Error: {e}. Response Text: {response.text[:500]}...", exc_info=True)
|
511 |
+
return "Sorry, there was an issue parsing the response from the AI service."
|
512 |
+
except Exception as e:
|
513 |
+
logger.error(f"Unexpected error processing OpenRouter success response: {e}", exc_info=True)
|
514 |
+
return "Sorry, an unexpected error occurred while processing the AI response."
|
515 |
+
|
516 |
+
# Handle specific HTTP error codes from OpenRouter
|
517 |
+
elif response.status_code == 401: logger.error("OpenRouter API key is invalid (Unauthorized - 401)."); return "Error: AI service authentication failed. Please check the configuration."
|
518 |
+
elif response.status_code == 402: logger.error("OpenRouter Payment Required (402). Check credits/limits."); return "Sorry, there's an issue with the AI service account limits or payment."
|
519 |
+
elif response.status_code == 429: logger.warning("OpenRouter Rate Limit Hit (429)."); return "Sorry, the AI model is currently busy due to high demand. Please try again in a moment."
|
520 |
+
elif response.status_code == 400: logger.error(f"OpenRouter Bad Request (400). Likely prompt issue. Response: {response.text[:500]}..."); return "Sorry, the request to the AI service was invalid (possibly due to the content or prompt)."
|
521 |
+
elif response.status_code >= 500: logger.error(f"OpenRouter Server Error ({response.status_code}). Response: {response.text[:500]}..."); return "Sorry, the AI service is experiencing internal issues. Please try again later."
|
522 |
else:
|
523 |
+
# Handle other unexpected errors
|
524 |
+
logger.error(f"Unexpected HTTP status {response.status_code} from OpenRouter. Response: {response.text[:500]}...")
|
525 |
+
try: # Try to extract an error message from the response body
|
526 |
+
error_data = response.json()
|
527 |
+
error_msg = error_data.get("error", {}).get("message", response.text[:100])
|
528 |
+
return f"Sorry, the AI service returned an error ({response.status_code}): {error_msg}"
|
529 |
+
except json.JSONDecodeError:
|
530 |
+
return f"Sorry, the AI service returned an unexpected error (Status: {response.status_code})."
|
531 |
+
|
532 |
+
except requests.exceptions.Timeout: logger.error("Timeout connecting to OpenRouter API."); return "Sorry, the request to the AI model timed out. Please try again."
|
533 |
+
except requests.exceptions.RequestException as e: logger.error(f"Request error connecting to OpenRouter API: {e}"); return "Sorry, there was a network error connecting to the AI model service."
|
534 |
+
except Exception as e: logger.error(f"Unexpected error occurred within generate_summary function: {e}", exc_info=True); return "Sorry, an unexpected internal error occurred while generating the summary."
|
535 |
|
536 |
|
537 |
# --- Telegram Bot Handlers ---
|
538 |
|
539 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
540 |
+
"""Handles the /start command."""
|
541 |
+
user = update.effective_user
|
542 |
+
if not user: return # Should not happen with a command
|
543 |
+
logger.info(f"User {user.id} ({user.username or 'NoUsername'}) initiated /start.")
|
544 |
+
# Use mention_html for linking username if available, otherwise just first name
|
545 |
mention = user.mention_html() if user.username else user.first_name
|
546 |
+
start_message = (
|
547 |
+
f"👋 Hello {mention}!\n\n"
|
548 |
+
"I can summarise YouTube videos or web articles for you.\n\n"
|
549 |
+
"Just send me a link (URL) and I'll ask you whether you want the summary as a paragraph or bullet points.\n\n"
|
550 |
+
"Type /help for more details."
|
551 |
+
)
|
552 |
+
await update.message.reply_html(start_message)
|
553 |
|
554 |
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
555 |
+
"""Handles the /help command."""
|
556 |
+
user = update.effective_user
|
557 |
+
logger.info(f"User {user.id if user else '?'} requested /help.")
|
558 |
+
help_text = (
|
559 |
+
"**How to Use Me:**\n"
|
560 |
+
"1. Send me a direct link (URL) to a YouTube video or a web article.\n"
|
561 |
+
"2. I will ask you to choose the summary format: `Paragraph` or `Points`.\n"
|
562 |
+
"3. Click the button for your preferred format.\n"
|
563 |
+
"4. I'll fetch the content, summarise it using AI, and send it back to you!\n\n"
|
564 |
+
"**Important Notes:**\n"
|
565 |
+
"- **YouTube:** Getting transcripts can sometimes fail if they are disabled, unavailable for the video's language, or if YouTube temporarily blocks requests.\n"
|
566 |
+
"- **Websites:** I do my best to extract the main article content, but complex websites (especially those heavily reliant on JavaScript or with strong anti-scraping measures) might not work perfectly. I have a fallback service to help with tricky sites.\n"
|
567 |
+
"- **AI Summaries:** The AI tries its best to be accurate and follow the requested format, but errors or unexpected outputs are possible.\n"
|
568 |
+
"- **Length:** Very long articles or videos might be truncated before summarization to fit within processing limits.\n\n"
|
569 |
+
"Just send a link to get started!"
|
570 |
+
)
|
571 |
+
# Use MarkdownV2 for better formatting control if needed, but MARKDOWN is simpler
|
572 |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
|
573 |
|
574 |
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
575 |
+
"""Handles messages containing potential URLs."""
|
576 |
if not update.message or not update.message.text: return
|
577 |
+
message_text = update.message.text.strip()
|
578 |
+
user = update.effective_user
|
579 |
+
if not user: return # Should not happen with a message
|
580 |
+
|
581 |
+
# More robust URL regex (handles various protocols, domains, paths, queries)
|
582 |
+
# Still simple, not aiming for perfect RFC 3986 validation
|
583 |
+
url_pattern = r'https?://(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}(?:/[^\s]*)?'
|
584 |
+
match = re.search(url_pattern, message_text)
|
585 |
+
|
586 |
if match:
|
587 |
+
url = match.group(0)
|
588 |
+
logger.info(f"User {user.id} sent potential URL: {url}")
|
589 |
+
|
590 |
+
# Store URL in user_data, associated with the user ID
|
591 |
+
context.user_data['url_to_summarize'] = url
|
592 |
+
logger.debug(f"Stored URL '{url}' in user_data for user {user.id}")
|
593 |
+
|
594 |
+
keyboard = [
|
595 |
+
[
|
596 |
+
InlineKeyboardButton("📜 Paragraph Summary", callback_data="paragraph"),
|
597 |
+
InlineKeyboardButton("🔹 Bullet Points", callback_data="points")
|
598 |
+
]
|
599 |
+
]
|
600 |
reply_markup = InlineKeyboardMarkup(keyboard)
|
601 |
+
|
602 |
+
# Send message asking for summary type
|
603 |
+
await update.message.reply_text(
|
604 |
+
f"✅ Link received:\n`{url}`\n\nChoose your desired summary format:",
|
605 |
+
reply_markup=reply_markup,
|
606 |
+
parse_mode=ParseMode.MARKDOWN,
|
607 |
+
link_preview_options={'is_disabled': True} # Disable link preview for this message
|
608 |
+
)
|
609 |
+
else:
|
610 |
+
# If it doesn't look like a URL, maybe provide guidance?
|
611 |
+
# logger.debug(f"Ignoring non-URL message from {user.id}: {message_text[:100]}")
|
612 |
+
# Optional: Reply if it's not a command and not a URL
|
613 |
+
if not message_text.startswith('/'):
|
614 |
+
await update.message.reply_text("Please send me a valid URL (starting with http:// or https://) to summarize.")
|
615 |
+
|
616 |
|
617 |
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
618 |
+
"""Handles button presses for choosing summary type."""
|
619 |
+
query = update.callback_query
|
620 |
+
if not query or not query.from_user:
|
621 |
+
logger.warning("Callback query or user missing in update.")
|
622 |
+
return # Can't proceed without query/user
|
623 |
+
user = query.from_user
|
624 |
+
|
625 |
+
# --- Answer Callback Query Immediately ---
|
626 |
+
try:
|
627 |
+
await query.answer() # Acknowledge the button press
|
628 |
+
logger.debug(f"Answered callback query {query.id} for user {user.id}")
|
629 |
+
except TimedOut:
|
630 |
+
# Log timeout but proceed; the button loading indicator might just hang for the user
|
631 |
+
logger.warning(f"Timeout answering callback query {query.id} for user {user.id}. Processing continues.")
|
632 |
+
except Exception as e:
|
633 |
+
# Log other errors but proceed cautiously. The button might remain "loading".
|
634 |
+
logger.error(f"Error answering callback query {query.id} for user {user.id}: {e!r}", exc_info=True)
|
635 |
+
|
636 |
+
summary_type = query.data # 'paragraph' or 'points'
|
637 |
+
# Retrieve URL stored earlier for this user
|
638 |
+
url = context.user_data.get('url_to_summarize')
|
639 |
+
logger.info(f"User {user.id} chose summary type '{summary_type}'. Checking for stored URL.")
|
640 |
+
|
641 |
if not url:
|
642 |
+
logger.warning(f"User {user.id} pressed button '{summary_type}', but NO URL found in user_data context.")
|
643 |
+
try:
|
644 |
+
# Inform user context was lost (e.g., bot restarted, long delay)
|
645 |
+
await query.edit_message_text(text="⚠️ Oops! I lost the context for that link. Please send the link again.")
|
646 |
+
except TimedOut:
|
647 |
+
logger.error(f"Timeout trying to edit message to inform user {user.id} about lost context.")
|
648 |
+
except Exception as edit_err:
|
649 |
+
# Log error if editing fails (message might already be gone, or other Telegram issue)
|
650 |
+
logger.error(f"Failed to edit message for lost context for user {user.id}: {edit_err}")
|
651 |
+
return # Stop processing if URL is missing
|
652 |
+
|
653 |
+
# --- URL Found - Proceed with Processing ---
|
654 |
+
logger.info(f"Processing URL '{url}' for user {user.id} with type '{summary_type}'.")
|
655 |
+
# Clear the URL from context now that we're processing it
|
656 |
+
context.user_data.pop('url_to_summarize', None)
|
657 |
+
logger.debug(f"Cleared URL from user_data for user {user.id}")
|
658 |
+
|
659 |
+
# Fetch current API keys (allows for potential runtime changes, though unlikely here)
|
660 |
+
current_openrouter_key = os.environ.get('OPENROUTER_API_KEY')
|
661 |
+
current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
|
662 |
+
current_supadata_key = os.environ.get('SUPADATA_API_KEY')
|
663 |
+
current_apify_token = os.environ.get('APIFY_API_TOKEN')
|
664 |
+
# Simple check log
|
665 |
+
keys_present = f"OR={'Y' if current_openrouter_key else 'N'}, UTT={'Y' if current_urltotext_key else 'N'}, SD={'Y' if current_supadata_key else 'N'}, AP={'Y' if current_apify_token else 'N'}"
|
666 |
+
logger.debug(f"API Key check for user {user.id} request: {keys_present}")
|
667 |
+
|
668 |
+
# Critical dependency check: AI key
|
669 |
if not current_openrouter_key:
|
670 |
+
logger.error(f"CRITICAL: OpenRouter API key is missing. Cannot generate summary for user {user.id}.")
|
671 |
+
try:
|
672 |
+
await query.edit_message_text(text="❌ Configuration Error: The AI summarization service is not configured correctly. Please contact the administrator.")
|
673 |
+
except TimedOut:
|
674 |
+
logger.error(f"Timeout editing message to inform user {user.id} about missing AI key.")
|
675 |
+
except Exception as edit_err:
|
676 |
+
logger.error(f"Failed to edit message for missing AI key for user {user.id}: {edit_err}")
|
677 |
return
|
678 |
+
|
679 |
+
# --- Inform User Processing Has Started ---
|
680 |
+
processing_message_text = f"⏳ Working on your '{summary_type}' summary for the link...\n_(This might take up to a minute depending on the content)_"
|
681 |
+
message_to_edit = query.message # The message with the buttons
|
682 |
+
status_message_sent = None # Will hold msg ID if we send a new status message
|
683 |
+
|
684 |
+
try:
|
685 |
+
if message_to_edit:
|
686 |
+
await query.edit_message_text(text=processing_message_text)
|
687 |
+
logger.debug(f"Edited original message {message_to_edit.message_id} to show 'Working...' status for query {query.id}")
|
688 |
+
else:
|
689 |
+
# This case should be rare if query.message exists, but handle defensively
|
690 |
+
logger.warning(f"Original message (query.message) not found for query {query.id}. Cannot edit, will send new status message.")
|
691 |
+
raise ValueError("Original message object missing") # Force fallback to sending new message
|
692 |
+
except (TimedOut, Exception) as e:
|
693 |
+
# If editing fails (e.g., message too old, deleted, rate limit), try sending a new message
|
694 |
+
logger.warning(f"Could not edit original message {message_to_edit.message_id if message_to_edit else 'N/A'} for query {query.id}: {e!r}. Attempting to send a new status message.")
|
695 |
+
message_to_edit = None # Ensure we don't try to delete this later if editing failed
|
696 |
+
try:
|
697 |
+
status_message_sent = await context.bot.send_message(chat_id=user.id, text=processing_message_text)
|
698 |
+
logger.debug(f"Sent new status message {status_message_sent.message_id} to user {user.id}.")
|
699 |
+
except TimedOut:
|
700 |
+
logger.error(f"Timeout sending NEW 'Working...' status message to user {user.id}. Processing continues without feedback.")
|
701 |
+
# User won't know bot is working - proceed anyway, hope for the best.
|
702 |
+
except Exception as send_err:
|
703 |
+
logger.error(f"Failed sending NEW 'Working...' status message to user {user.id}: {send_err}. Processing continues without feedback.")
|
704 |
+
# As above.
|
705 |
+
|
706 |
+
# --- Main Content Fetching and Summarization ---
|
707 |
+
content = None
|
708 |
+
user_feedback_message = None # Holds error/status messages for the user
|
709 |
+
success = False # Tracks if we successfully sent a summary
|
710 |
+
|
711 |
try:
|
712 |
+
# Send 'typing' action to indicate activity
|
713 |
+
try:
|
714 |
+
logger.debug(f"Sending 'typing' chat action to chat {user.id}")
|
715 |
+
await context.bot.send_chat_action(chat_id=user.id, action='typing')
|
716 |
+
except TimedOut: logger.warning(f"Timeout sending 'typing' action for user {user.id}.")
|
717 |
+
except Exception as ca_err: logger.warning(f"Failed sending 'typing' action for user {user.id}: {ca_err}")
|
718 |
+
|
719 |
+
# --- Determine Content Type and Fetch ---
|
720 |
+
is_yt = is_youtube_url(url)
|
721 |
+
logger.debug(f"URL ({url}) is YouTube: {is_yt} (User: {user.id})")
|
722 |
+
|
723 |
if is_yt:
|
724 |
video_id = extract_youtube_id(url)
|
725 |
if video_id:
|
726 |
+
logger.info(f"Fetching YouTube transcript for video ID: {video_id} (User: {user.id})")
|
727 |
+
content = await get_youtube_transcript(video_id, url, current_supadata_key, current_apify_token)
|
728 |
+
if not content:
|
729 |
+
logger.warning(f"Failed to get YouTube transcript for {video_id} (User: {user.id}).")
|
730 |
+
user_feedback_message = "⚠️ Sorry, I couldn't retrieve the transcript for that YouTube video. It might be unavailable, private, have captions disabled, or an error occurred."
|
731 |
+
else:
|
732 |
+
logger.info(f"Successfully fetched YouTube transcript for {video_id}. Length: {len(content)} (User: {user.id})")
|
733 |
+
else:
|
734 |
+
logger.warning(f"Failed to extract YouTube video ID from URL: {url} (User: {user.id})")
|
735 |
+
user_feedback_message = "⚠️ Sorry, I couldn't identify a valid YouTube video ID in the link you provided."
|
736 |
else:
|
737 |
+
# --- Website Scraping ---
|
738 |
+
logger.info(f"Attempting website scrape (Requests/BS4) for URL: {url} (User: {user.id})")
|
739 |
+
content = await get_website_content_via_requests(url)
|
740 |
+
if content:
|
741 |
+
logger.info(f"Website scrape successful (Requests/BS4). Length: {len(content)} (User: {user.id})")
|
742 |
+
# Content found, no need for feedback message yet
|
743 |
else:
|
744 |
+
logger.warning(f"Primary website scrape failed for {url} (User: {user.id}). Trying fallback API.")
|
745 |
if current_urltotext_key:
|
746 |
+
# Send typing again if first scrape failed and we try another method
|
747 |
+
try: await context.bot.send_chat_action(chat_id=user.id, action='typing'); logger.debug("Sent typing before fallback scrape.")
|
748 |
+
except Exception: pass # Ignore failures when sending the chat action
|
749 |
+
|
750 |
+
logger.info(f"Attempting website scrape via URLToText API for: {url} (User: {user.id})")
|
751 |
+
content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
|
752 |
+
if content:
|
753 |
+
logger.info(f"Website scrape successful via URLToText API. Length: {len(content)} (User: {user.id})")
|
754 |
+
else:
|
755 |
+
logger.warning(f"Fallback website scrape (URLToText API) also failed for {url} (User: {user.id}).")
|
756 |
+
user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website using available methods. It might be protected or structured in a way I can't parse."
|
757 |
+
else:
|
758 |
+
# Fallback key missing
|
759 |
+
logger.warning(f"Primary scrape failed and URLToText API key not configured. Cannot fallback for {url} (User: {user.id}).")
|
760 |
+
user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website, and the fallback service isn't configured."
|
761 |
+
|
762 |
+
# --- Generate Summary if Content Was Fetched ---
|
763 |
if content:
|
764 |
+
logger.info(f"Content fetched (Length: {len(content)}). Generating '{summary_type}' summary for user {user.id}.")
|
765 |
+
# Send typing before potentially long AI call
|
766 |
+
try: await context.bot.send_chat_action(chat_id=user.id, action='typing'); logger.debug("Sent typing before AI summary generation.")
|
767 |
+
except Exception: pass # Ignore failures when sending the chat action
|
768 |
+
|
769 |
summary = await generate_summary(content, summary_type, current_openrouter_key)
|
770 |
+
|
771 |
+
# Check if summary generation returned an error message
|
772 |
+
if summary.startswith("Error:") or summary.startswith("Sorry,"):
|
773 |
+
logger.warning(f"AI summary generation failed for user {user.id}. Reason: {summary}")
|
774 |
+
user_feedback_message = f"⚠️ {summary}" # Use the error message from generate_summary
|
775 |
+
else:
|
776 |
+
# --- Summary Success - Send to User ---
|
777 |
+
logger.info(f"Summary generated successfully for user {user.id}. Length: {len(summary)}. Sending result.")
                try:
                    await context.bot.send_message(
                        chat_id=user.id,
                        text=summary,
                        parse_mode=ParseMode.MARKDOWN,  # Assuming the AI generates markdown for points
                        link_preview_options={'is_disabled': True}
                    )
                    success = True
                    user_feedback_message = None  # Clear any previous fetching error message
                    logger.info(f"Successfully sent summary to user {user.id}.")
                except TimedOut:
                    logger.error(f"Timeout sending final summary message to user {user.id}.")
                    user_feedback_message = "⚠️ Sorry, there was a timeout while trying to send you the final summary."
                    success = False  # Mark as failed if sending timed out
                except Exception as send_final_err:
                    logger.error(f"Failed sending final summary to user {user.id}: {send_final_err}", exc_info=True)
                    user_feedback_message = "⚠️ Sorry, an unexpected error occurred while sending the final summary."
                    success = False  # Mark as failed

        elif not user_feedback_message:
            # If content is None but no specific error message was set above, set a generic one.
            logger.warning(f"Content retrieval resulted in None, but no specific user feedback message was set. URL: {url} (User: {user.id})")
            user_feedback_message = "⚠️ Sorry, I couldn't retrieve any usable content from the link provided."

        # --- Send Final Feedback Message if Processing Failed ---
        if user_feedback_message and not success:
            logger.warning(f"Processing failed or summary sending failed for user {user.id}. Sending feedback: {user_feedback_message}")
            try:
                await context.bot.send_message(chat_id=user.id, text=user_feedback_message)
            except TimedOut:
                logger.error(f"Timeout sending final FAILURE feedback message to user {user.id}.")
            except Exception as send_feedback_err:
                logger.error(f"Failed sending final FAILURE feedback message to user {user.id}: {send_feedback_err}")

    except Exception as e:
        # Catch-all for unexpected errors during the main processing block
        logger.error(f"Unexpected critical error during callback processing for user {user.id}, URL {url}: {e}", exc_info=True)
        try:
            # Send a generic error message to the user
            await context.bot.send_message(chat_id=user.id, text="❌ Oops! An unexpected internal error occurred while processing your request. The issue has been logged.")
        except TimedOut:
            logger.error(f"Timeout sending CRITICAL internal error feedback message to user {user.id}.")
        except Exception as final_err:
            # If even sending the error message fails, log it.
            logger.error(f"Failed sending CRITICAL internal error feedback message to user {user.id}: {final_err}")
        # Ensure success is False if we hit this block
        success = False

    finally:
        # --- Clean up Status Message(s) ---
        logger.debug(f"Cleaning up status message(s) for user {user.id}, query {query.id}. Success={success}")
        try:
            if status_message_sent:
                # If we sent a separate "Working..." message, delete it regardless of success/failure,
                # as the final result or error message has been (or attempted to be) sent.
                await context.bot.delete_message(chat_id=user.id, message_id=status_message_sent.message_id)
                logger.debug(f"Deleted separate status message {status_message_sent.message_id} for user {user.id}.")
            elif message_to_edit:
                # If we edited the original message with the buttons...
                if success:
                    # If processing succeeded, delete the "Working..." message.
                    await query.delete_message()
                    logger.debug(f"Processing succeeded. Deleted original (edited) message {message_to_edit.message_id} for query {query.id}.")
                else:
                    # If processing failed, *don't* delete the message.
                    # It either still shows "Working..." (if sending the final error failed)
                    # or it might show an error message if edit_message_text was used for that.
                    # We could edit it one last time to show a generic failure if no specific
                    # feedback was sent, but that adds complexity; leave it as is for simplicity.
                    logger.debug(f"Processing failed. Leaving edited message {message_to_edit.message_id} in place for query {query.id}.")
                    # Optional: try one last edit to show failure if needed, but it might be overkill
                    # if not user_feedback_message: # Only if no other error was sent
                    #     try: await query.edit_message_text("❌ Processing failed.")
                    #     except Exception: pass # Ignore errors here

            # If message_to_edit was None (original edit failed) and status_message_sent was None (sending new status failed), there's nothing to delete here.

        except TimedOut:
            logger.warning(f"Timeout attempting to delete status/button message for user {user.id}, query {query.id}.")
        except Exception as del_e:
            # Log deletion errors as warnings; it's not critical if cleanup fails.
            # Common cause: the message was already deleted or the delete came too late.
            logger.warning(f"Could not delete status/button message for user {user.id}, query {query.id}: {del_e!r}")

        # Log the completion of the callback handling
        logger.info(f"Finished handling callback query {query.id} for user {user.id}. Overall Success: {success}")


async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Log Errors caused by Updates."""
    logger.error(f"Exception while handling an update: {context.error}", exc_info=context.error)
    # Add specific error type handling if needed (e.g., NetworkError, TimedOut)
    if isinstance(context.error, TimedOut):
        logger.warning("A timeout error occurred in PTB communication.")
    elif isinstance(context.error, NetworkError):
        logger.warning(f"A network error occurred: {context.error}")
    # Consider notifying admin or user for specific critical errors if appropriate
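    # For example (sketch only, commented out): forward unexpected errors to an admin chat.
    # ADMIN_CHAT_ID is a hypothetical value (e.g. loaded from an env var) not defined in this file.
    # if ADMIN_CHAT_ID and not isinstance(context.error, (NetworkError, TimedOut)):
    #     try:
    #         await context.bot.send_message(chat_id=ADMIN_CHAT_ID, text=f"Bot error: {context.error!r}")
    #     except Exception:
    #         logger.debug("Could not notify admin about the error.")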

# --- Bot Setup Function (Modified: Increased Pool/Timeouts) ---
async def setup_bot_config() -> Application:
    """Configures the PTB Application with custom HTTPX settings."""
    logger.info("Configuring Telegram Application...")
    if not TELEGRAM_TOKEN:
        logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable not found.")
        raise ValueError("TELEGRAM_TOKEN environment variable not set.")

    # --- Configure HTTPX client settings ---
    connect_timeout = 10.0  # Slightly higher connect timeout
    # --- INCREASED TIMEOUTS AND POOL SIZE ---
    read_timeout = 30.0   # Increased timeout for reading a response
    write_timeout = 30.0  # Increased timeout for sending a request
    pool_timeout = 30.0   # Increased timeout for getting a connection from the pool
    connection_pool_size = 50  # Significantly increased pool size

    logger.info(f"Creating PTB HTTPXRequest with settings: "
                f"connect_timeout={connect_timeout}, read_timeout={read_timeout}, "
                f"write_timeout={write_timeout}, pool_timeout={pool_timeout}, "
                f"pool_size={connection_pool_size}")

    # Note: PTB's HTTPXRequest does not accept an httpx.Limits object directly; it builds its own
    # limits internally from connection_pool_size, so constructing one here is unnecessary.
    # custom_limits = httpx.Limits(
    #     max_connections=connection_pool_size,
    #     max_keepalive_connections=connection_pool_size  # Keepalive same as max
    #     # keepalive_expiry=60.0  # Optional: keep idle connections open longer (seconds)
    # )

    # Create a custom request object with these settings
    custom_request = HTTPXRequest(
        connection_pool_size=connection_pool_size,
        connect_timeout=connect_timeout,
        read_timeout=read_timeout,
        write_timeout=write_timeout,
        pool_timeout=pool_timeout,
        http_version="1.1"  # HTTP/1.1 is usually fine; HTTP/2 might be slightly faster if supported end-to-end
    )

    # Use Application.builder() and pass the custom request object
    application_builder = Application.builder().token(TELEGRAM_TOKEN)
    application_builder.request(custom_request)
    # Also apply to get_updates if you were using polling (webhook doesn't use this heavily)
    # application_builder.get_updates_request(custom_request)
    # Apply connection pool settings globally if needed (less common now with direct request object)
    # application_builder.pool_timeout(pool_timeout)  # This might be redundant if set in HTTPXRequest

    # Build the application instance
    application = application_builder.build()

    # --- Register Handlers ---
    application.add_handler(CommandHandler("start", start))
    application.add_handler(CommandHandler("help", help_command))
    # Handles non-command text messages that might contain a URL
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
    # Handles the button clicks ('paragraph' or 'points')
    application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
    # Global error handler
    application.add_error_handler(error_handler)

    logger.info("Telegram application handlers configured.")
    return application

# --- ASGI Lifespan Context Manager ---

async def lifespan(app: Starlette):
    """Handles PTB startup and shutdown during the ASGI lifespan."""
    global ptb_app
    logger.info("ASGI Lifespan: Startup sequence initiated...")
    # loop = asyncio.get_running_loop() # Not usually needed directly

    try:
        # --- Setup and Initialize PTB Application ---
        ptb_app = await setup_bot_config()
        logger.info("PTB Application object configured. Initializing...")
        await ptb_app.initialize()  # Initialize application components (e.g., bot instance)
        logger.info("PTB Application initialized. Starting background tasks (e.g., job queue)...")
        # Start PTB's internal tasks but not polling (we use a webhook)
        await ptb_app.start()
        # Ensure polling is stopped if it was accidentally started (Updater.stop() is a coroutine)
        if ptb_app.updater and ptb_app.updater.running:
            await ptb_app.updater.stop()
        bot_instance = ptb_app.bot
        bot_info = await bot_instance.get_me()
        logger.info(f"PTB Application started successfully. Bot ID: {bot_info.id}, Username: @{bot_info.username}")

        # --- Set Webhook ---
        # Ensure SPACE_HOST is correctly set in the Hugging Face Space secrets
        WEBHOOK_URL_BASE = os.environ.get("SPACE_HOST")
        if WEBHOOK_URL_BASE:
            # Ensure it's a proper HTTPS URL
            if not WEBHOOK_URL_BASE.startswith("https://"): WEBHOOK_URL_BASE = f"https://{WEBHOOK_URL_BASE}"
            webhook_path = "/webhook"  # Must match the route defined later
            full_webhook_url = f"{WEBHOOK_URL_BASE.rstrip('/')}{webhook_path}"

            logger.info(f"Attempting to set Telegram webhook to: {full_webhook_url}")
            # A short delay can sometimes help prevent race conditions on startup
            await asyncio.sleep(2.0)
            try:
                # Set the webhook; specifying allowed updates can reduce load
                await bot_instance.set_webhook(
                    url=full_webhook_url,
                    allowed_updates=Update.ALL_TYPES,  # Or specify like [Update.MESSAGE, Update.CALLBACK_QUERY]
                    # secret_token="YOUR_SECRET_TOKEN"  # Recommended for security if possible
                    # drop_pending_updates=True  # Optional: Ignore updates sent while the bot was down
                )
                # Verify webhook setup
                webhook_info = await bot_instance.get_webhook_info()
                if webhook_info.url == full_webhook_url:
                    logger.info(f"Telegram webhook set successfully! Current info: {webhook_info}")
                else:
                    logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got: {webhook_info.url}. Info: {webhook_info}")
            except RetryAfter as e:
                # This can happen if multiple workers try to set the webhook simultaneously
                logger.warning(f"Webhook setting throttled by Telegram (RetryAfter: {e.retry_after}s). Another instance likely succeeded, or try again later.")
                # Optionally check the info again after the delay
                await asyncio.sleep(e.retry_after or 2)
                webhook_info = await bot_instance.get_webhook_info()
                logger.info(f"Webhook info after RetryAfter delay: {webhook_info}")
            except Exception as e:
                logger.error(f"Failed to set Telegram webhook to {full_webhook_url}: {e}", exc_info=True)
        else:
            logger.warning("SPACE_HOST environment variable not found. Cannot set webhook automatically. Bot will not receive updates via webhook.")

        logger.info("ASGI Lifespan: Startup complete. Application is ready to yield.")
        yield  # --- Application runs here ---

    except Exception as startup_err:
        logger.critical(f"CRITICAL ERROR during ASGI application startup: {startup_err}", exc_info=True)
        # Re-raise the exception to potentially stop the ASGI server from starting improperly
        raise
    finally:
        # --- Shutdown Sequence ---
        logger.info("ASGI Lifespan: Shutdown sequence initiated...")
        if ptb_app:
            bot_username = ptb_app.bot.username if ptb_app.bot else "N/A"
            logger.info(f"PTB App instance found for @{bot_username}. Checking if running...")
            # Check internal state if available ('running' is public in recent PTB versions;
            # '_running' is the internal fallback)
            is_running = getattr(ptb_app, '_running', False) or getattr(ptb_app, 'running', False)
            if is_running:
                try:
                    logger.info("Stopping PTB Application's background tasks...")
                    await ptb_app.stop()  # Stop internal tasks like the JobQueue
                    logger.info("Shutting down PTB Application connections and resources...")
                    await ptb_app.shutdown()  # Clean up resources (e.g., close the HTTPX client)
                    logger.info("PTB Application shut down gracefully.")
                except Exception as shutdown_err:
                    logger.error(f"Error during PTB Application shutdown: {shutdown_err}", exc_info=True)
            else:
                logger.warning("PTB Application instance exists but was not marked as running at shutdown.")
                # Attempt shutdown anyway just in case resources need cleaning
                try: await ptb_app.shutdown()
                except Exception: logger.error("Error during shutdown of non-running PTB app.", exc_info=True)
        else:
            logger.warning("No PTB Application instance (ptb_app) found during ASGI shutdown.")
        logger.info("ASGI Lifespan: Shutdown complete.")


# --- Flask App Setup (for Webhook Route) ---
# We use Flask just for its familiarity in defining the route,
# but it runs within Starlette's ASGI context via WSGIMiddleware.
flask_core_app = Flask(__name__)
logger.info("Core Flask app instance created (used by Starlette for routing).")

# --- Define Flask Routes ---
@flask_core_app.route('/')
def index():
    """Basic health check endpoint."""
    logger.debug("Health check endpoint '/' accessed.")
    bot_status = "Unknown / Not Initialized"
    if ptb_app and ptb_app.bot:
        # Check internal state again (might have changed)
        is_running = getattr(ptb_app, '_running', False) or getattr(ptb_app, 'running', False)
        bot_status = f"Running (@{ptb_app.bot.username})" if is_running else f"Initialized/Stopped (@{ptb_app.bot.username})"
    return f"Telegram Bot Summarizer - Status: {bot_status} - Listening via Starlette/Uvicorn."

@flask_core_app.route('/webhook', methods=['POST'])
async def webhook() -> Response:
    """Webhook endpoint called by Telegram."""
    global ptb_app  # Ensure we're using the global instance initialized by lifespan

    if not ptb_app:
        logger.error("Webhook triggered, but PTB Application instance (ptb_app) is None. Lifespan likely failed.")
        # Return 503 Service Unavailable
        return Response('Bot service is not configured or failed during startup.', status=503)

    # Check internal state (safer than assuming ptb_app implies running)
    is_running = getattr(ptb_app, '_running', False) or getattr(ptb_app, 'running', False)
    if not is_running:
        logger.error("Webhook triggered, but PTB Application is not currently running.")
        # Return 503 Service Unavailable
        return Response('Bot service is initialized but not actively running.', status=503)
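
    # If a secret_token were set in set_webhook() during startup, the request could be verified
    # here before processing (sketch only: WEBHOOK_SECRET is a hypothetical value not defined in
    # this file; the header name is the one Telegram sends alongside webhook secret tokens).
    # if request.headers.get("X-Telegram-Bot-Api-Secret-Token") != WEBHOOK_SECRET:
    #     logger.warning("Rejected webhook request with a missing or invalid secret token.")
    #     return Response('Forbidden', status=403)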

    # Proceed with processing the update
    logger.debug("Webhook endpoint received POST request from Telegram.")
    try:
        # Use Flask's request object to get the JSON payload. Flask's get_json() is synchronous
        # (not awaitable); silent=True returns None for a missing or invalid JSON body.
        update_data = request.get_json(silent=True)
        if not update_data:
            logger.warning("Received empty or non-JSON data on webhook.")
            return Response('Bad Request: Expected JSON payload.', status=400)

        # Deserialize the JSON into a Telegram Update object
        update = Update.de_json(update_data, ptb_app.bot)
        logger.debug(f"Processing update_id: {update.update_id} via webhook route.")

        # Process the update using PTB's internal mechanisms; this dispatches it to the
        # correct handler (CommandHandler, MessageHandler, CallbackQueryHandler, etc.)
        await ptb_app.process_update(update)

        logger.debug(f"Finished processing update_id: {update.update_id}")
        # Return 200 OK to Telegram to acknowledge receipt
        return Response('ok', status=200)

    except json.JSONDecodeError:
        logger.error("Failed to decode JSON from Telegram webhook request.", exc_info=True)
        return Response('Bad Request: Invalid JSON format.', status=400)
    except Exception as e:
        # Catch potential errors during Update.de_json or ptb_app.process_update
        logger.error(f"Error processing update in webhook handler: {e}", exc_info=True)
        # Return 500 Internal Server Error; Telegram will likely retry sending the update later
        return Response('Internal Server Error processing update.', status=500)

# --- Create Starlette ASGI Application ---
# This is the main application object that Uvicorn/Gunicorn will run.
app = Starlette(
    debug=False,  # Set debug based on environment if needed, but generally False in prod
    lifespan=lifespan,  # Hook into the lifespan context manager for startup/shutdown
    routes=[
        # Mount the Flask app under the root path. Starlette handles requests
        # and forwards relevant ones ('/') and ('/webhook') to the Flask app.
        Mount("/", app=WSGIMiddleware(flask_core_app))
    ]
)
logger.info("Starlette ASGI application created, configured with lifespan and Flask app mounted at '/'.")


# --- Development Server Execution Block ---
# This block is ONLY for running the Flask app directly for basic testing
# WITHOUT the proper ASGI lifespan management (PTB won't start correctly here).
# DO NOT use this for deployment. Use `gunicorn main:app` or `uvicorn main:app`.
if __name__ == '__main__':
    logger.warning("=" * 50)
    logger.warning(" RUNNING SCRIPT DIRECTLY (using __main__) ".center(50, "="))
    logger.warning("=" * 50)
    logger.warning("This mode starts the Flask development server.")
    logger.warning("!!! IT DOES **NOT** RUN THE ASGI LIFESPAN !!!")
    logger.warning("!!! The Telegram Bot (PTB Application) WILL NOT INITIALIZE OR RUN !!!")
    logger.warning("This is suitable ONLY for verifying Flask routes locally.")
    logger.warning("For proper testing/deployment, use: uvicorn main:app --reload --port 8080")
    logger.warning("or via Gunicorn: gunicorn -c gunicorn.conf.py main:app")
    logger.warning("=" * 50)

    if not TELEGRAM_TOKEN:
        logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable missing. Aborting direct Flask start.")
    else:
        # Get port from environment or default to 8080 for local dev
        local_port = int(os.environ.get('PORT', 8080))
        logger.info(f"Starting Flask development server on http://0.0.0.0:{local_port}")
        # Run the Flask app directly (no Starlette, no lifespan, no PTB)
        # use_reloader=False is important if debugging PTB setup elsewhere
        flask_core_app.run(host='0.0.0.0', port=local_port, debug=True, use_reloader=False)