fmab777 committed on
Commit d29aaf9 · verified · 1 Parent(s): a59041f

Update main.py

Files changed (1)
  1. main.py +51 -79
main.py CHANGED
@@ -1,4 +1,4 @@
- # main.py (Updated for Specific April 2025 Models: Llama 4 Scout & DeepSeek V3 Free)
+ # main.py (Full Code - Specific April 2025 Models: Llama 4 Scout & DeepSeek V3 Free)
  import os
  import re
  import logging
@@ -99,16 +99,16 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
  RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
  WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
 
- # <<< CHANGE: Using EXACT Model Identifiers from User Docs >>>
+ # --- Model Configurations (Specific April 2025) ---
  # Model Priority:
  # 1. Groq Llama 4 Scout
  # 2. Gemini 2.5 Pro Exp
  # 3. Gemini 2.0 Flash
  # 4. OpenRouter DeepSeek V3 Free
- GROQ_LLAMA4_MODEL = os.environ.get("GROQ_LLAMA4_MODEL", "meta-llama/llama-4-scout-17b-16e-instruct") # <<< Specific Llama 4 model
+ GROQ_LLAMA4_MODEL = os.environ.get("GROQ_LLAMA4_MODEL", "meta-llama/llama-4-scout-17b-16e-instruct") # Specific Llama 4 model
  GEMINI_PRO_EXP_MODEL = os.environ.get("GEMINI_PRO_EXP_MODEL", "gemini-2.5-pro-exp-03-25")
  GEMINI_FLASH_MODEL = os.environ.get("GEMINI_FLASH_MODEL", "gemini-2.0-flash-001")
- OPENROUTER_DEEPSEEK_MODEL = os.environ.get("OPENROUTER_DEEPSEEK_MODEL", "deepseek/deepseek-chat-v3-0324:free") # <<< Specific DeepSeek model
+ OPENROUTER_DEEPSEEK_MODEL = os.environ.get("OPENROUTER_DEEPSEEK_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Specific DeepSeek model
 
  APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts") # YT Default
  APIFY_CRAWLER_ACTOR_ID = "apify/website-content-crawler" # Scrape Fallback 4
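These constants feed a strict try-in-order cascade: each provider is attempted once, its error message is recorded, and the next is tried only on failure (the actual logic lives in generate_summary further down the diff). A minimal sketch of the pattern, where only the wrapper name is illustrative:

# Sketch of the four-model cascade; the file's own version is generate_summary.
async def summarize_with_fallbacks(text: str, summary_type: str) -> str:
    providers = [
        ("Llama4Scout", lambda: _call_groq(text, summary_type)),                          # 1. Groq
        ("GeminiProExp", lambda: _call_gemini(text, summary_type, GEMINI_PRO_EXP_MODEL)), # 2. Gemini 2.5 Pro Exp
        ("GeminiFlash", lambda: _call_gemini(text, summary_type, GEMINI_FLASH_MODEL)),    # 3. Gemini 2.0 Flash
        ("DeepSeekV3", lambda: _call_openrouter(text, summary_type)),                     # 4. OpenRouter
    ]
    errors = {}
    for name, attempt in providers:
        summary, error = await attempt()  # each _call_* returns (summary, error_message)
        if summary:
            return summary
        errors[name] = error
    return "Sorry, all AI services failed: " + "; ".join(f"{k}: {v}" for k, v in errors.items())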
@@ -184,14 +184,9 @@ def extract_youtube_id(url):
  if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id
  else: logger.warning(f"Could not extract YT ID from {url}"); return None
 
+
  # --- Content Fetching Functions ---
- # (These functions: get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript,
- # fetch_url_content_for_scrape, get_website_content, get_website_content_via_api,
- # get_website_content_via_scrapers_proxy, get_website_content_via_ai_web_scraper,
- # _run_apify_actor_for_web_content, get_website_content_via_apify_crawler,
- # get_website_content_via_apify_text_scraper remain UNCHANGED. They are omitted here for brevity
- # but MUST be included in the final main.py file)
- # --- START OMITTED CONTENT FETCHING FUNCTIONS ---
+
  async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
  if not video_id: logger.error("[Supadata] No video_id provided"); return None
  if not api_key: logger.error("[Supadata] API key missing."); return None
@@ -204,10 +199,10 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
  logger.debug(f"[Supadata] Status code {response.status_code} for {video_id}")
  if response.status_code == 200:
  try:
- data = response.json() if response.text else None # Check if text exists before json decode
+ data = response.json() if response.text else None
  content = None
  if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
- if not content and response.text: content = response.text # Fallback to raw text if json parse fails or content key missing
+ if not content and response.text: content = response.text
  if content and isinstance(content, str): logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}"); return content.strip()
  else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
  except json.JSONDecodeError: logger.warning(f"[Supadata] Received 200 but failed JSON decode for {video_id}. Using raw text if available. Response: {response.text[:200]}"); return response.text.strip() if response.text else None
@@ -294,6 +289,7 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
  return transcript_text
 
  async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
+ """Directly fetches URL content using httpx."""
  headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
  try:
  async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
@@ -313,6 +309,7 @@ async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[
  return None
 
  async def get_website_content(url: str) -> Optional[str]:
+ """Primary method: Fetches HTML directly and parses with BeautifulSoup."""
  if not url: logger.error("[Web Scrape Primary] No URL provided"); return None
  logger.info(f"[Web Scrape Primary] Attempting direct fetch and parse for: {url}")
  html_content = await fetch_url_content_for_scrape(url)
@@ -336,6 +333,7 @@ async def get_website_content(url: str) -> Optional[str]:
  except Exception as e: logger.error(f"[Web Scrape Primary] Unexpected error during parsing process for {url}: {e}", exc_info=True); return None
 
  async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
+ """Fallback 1: Fetches website content using urltotext.com API."""
  if not url: logger.error("[Web Scrape Fallback 1] No URL"); return None
  if not api_key: logger.error("[Web Scrape Fallback 1] urltotext.com API key missing."); return None
  logger.info(f"[Web Scrape Fallback 1] Attempting fetch for: {url} using urltotext.com API")
@@ -363,6 +361,7 @@ async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
  except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
 
  async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
+ """Fallback 2: Fetches website content using Scraper's Proxy Parser via RapidAPI."""
  if not url: logger.error("[Web Scrape Fallback 2] No URL provided"); return None
  if not api_key: logger.error("[Web Scrape Fallback 2] RapidAPI key missing."); return None
  logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy Parser API")
@@ -396,6 +395,7 @@ async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Opti
  except Exception as e: logger.error(f"[Web Scrape Fallback 2] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
 
  async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]:
+ """Fallback 3: Fetches website content using AI Web Scraper API via RapidAPI."""
  if not url: logger.error("[Web Scrape Fallback 3] No URL provided"); return None
  if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
  logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using AI Web Scraper API")
@@ -432,6 +432,7 @@ async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Opti
  except Exception as e: logger.error(f"[Web Scrape Fallback 3] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
 
  async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
+ """Generic function to run an Apify actor and get text content."""
  if not url: logger.error(f"[{actor_name}] No URL provided"); return None
  if not api_token: logger.error(f"[{actor_name}] API token missing."); return None
  logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
@@ -474,11 +475,12 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
  except Exception as e: logger.error(f"[{actor_name}] Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
 
  async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
+ """Fallback 4: Fetches website content using Apify Website Content Crawler."""
  return await _run_apify_actor_for_web_content( url=url, api_token=api_token, actor_id=APIFY_CRAWLER_ACTOR_ID, actor_name="Apify Crawler" )
 
  async def get_website_content_via_apify_text_scraper(url: str, api_token: str) -> Optional[str]:
+ """Fallback 5: Fetches website content using Apify Text Scraper Free."""
  return await _run_apify_actor_for_web_content( url=url, api_token=api_token, actor_id=APIFY_TEXT_SCRAPER_ACTOR_ID, actor_name="Apify Text Scraper" )
- # --- END OMITTED CONTENT FETCHING FUNCTIONS ---
 
 
  # --- Summarization Functions (Using Specific April 2025 Models) ---
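The docstrings added above spell out a six-step scrape chain: the primary direct fetch plus BeautifulSoup parse, then Fallbacks 1-5 in order, with the first helper to return non-empty text winning. A rough sketch of driving such a chain (the real orchestration is in process_summary_task; the wrapper name and URLTOTEXT_API_KEY are assumed for illustration):

# Illustrative chain driver: first scraper to return content wins.
async def fetch_with_fallbacks(url: str) -> Optional[str]:
    attempts = [
        lambda: get_website_content(url),                                         # Primary: direct fetch + parse
        lambda: get_website_content_via_api(url, URLTOTEXT_API_KEY),              # Fallback 1: urltotext.com
        lambda: get_website_content_via_scrapers_proxy(url, RAPIDAPI_KEY),        # Fallback 2: Scraper's Proxy
        lambda: get_website_content_via_ai_web_scraper(url, RAPIDAPI_KEY),        # Fallback 3: AI Web Scraper
        lambda: get_website_content_via_apify_crawler(url, APIFY_API_TOKEN),      # Fallback 4: Apify crawler
        lambda: get_website_content_via_apify_text_scraper(url, APIFY_API_TOKEN), # Fallback 5: Apify text scraper
    ]
    for attempt in attempts:
        content = await attempt()
        if content:
            return content
    return None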
@@ -516,7 +518,6 @@ PROMPT_POINTS = (
  "Here is the text to summarise:"
  )
 
- # <<< Uses the specific GROQ_LLAMA4_MODEL constant >>>
  async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
  """Internal function to call Groq API (Primary - Llama 4 Scout). Returns (summary, error_message)."""
  global GROQ_API_KEY, GROQ_LLAMA4_MODEL, _groq_enabled
@@ -526,9 +527,7 @@ async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optio
  logger.info(f"[Groq Primary] Generating {summary_type} summary using {GROQ_LLAMA4_MODEL}. Input length: {len(text)}")
 
  prompt = PROMPT_PARAGRAPH if summary_type == "paragraph" else PROMPT_POINTS
-
- # Input Length Check for Llama 4 Scout (16k context? Be conservative)
- MAX_INPUT_LENGTH_GROQ = 40000 # ~13k tokens
+ MAX_INPUT_LENGTH_GROQ = 40000 # ~13k tokens for 16k context
  if len(text) > MAX_INPUT_LENGTH_GROQ:
  logger.warning(f"[Groq Primary] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_GROQ}). Truncating.");
  text = text[:MAX_INPUT_LENGTH_GROQ] + "... (Content truncated)"
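The 40,000-character cap is a character-level proxy for the token budget: at roughly 3 characters per English token, 40,000 characters is about 13k tokens, leaving headroom for the prompt and the completion inside a 16k window. The heuristic in isolation:

# Character-budget truncation (assumes ~3 chars per token for English prose).
def truncate_for_context(text: str, max_chars: int = 40000) -> str:
    # 40000 chars / ~3 chars-per-token ≈ 13k tokens, comfortably under a 16k context
    if len(text) <= max_chars:
        return text
    return text[:max_chars] + "... (Content truncated)"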
@@ -537,18 +536,14 @@ async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optio
  try:
  groq_client = Groq( api_key=GROQ_API_KEY, timeout=httpx.Timeout(120.0, connect=10.0) )
  logger.info(f"[Groq Primary] Sending request to Groq ({GROQ_LLAMA4_MODEL})...")
-
  chat_completion = await groq_client.chat.completions.create(
  messages=[ { "role": "user", "content": full_prompt } ],
- model=GROQ_LLAMA4_MODEL, # <<< Use specific Llama 4 model name
- temperature=0.7, # <<< Groq default is 1, adjust if needed
- max_tokens=2048, # <<< Groq default is 1024, adjust if needed for longer summaries
- top_p=1,
- stream=False,
- stop=None,
+ model=GROQ_LLAMA4_MODEL,
+ temperature=0.7, # Adjust from Groq default of 1 if needed
+ max_tokens=2048, # Adjust from Groq default of 1024 if needed
+ top_p=1, stream=False, stop=None,
  )
  logger.info("[Groq Primary] Received response from Groq.")
-
  if chat_completion.choices and chat_completion.choices[0].message and chat_completion.choices[0].message.content:
  summary = chat_completion.choices[0].message.content
  finish_reason = chat_completion.choices[0].finish_reason
@@ -558,7 +553,6 @@ async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optio
  logger.warning(f"[Groq Primary] Groq response structure unexpected or content empty. Response: {chat_completion.model_dump_json(indent=2)}")
  finish_reason = chat_completion.choices[0].finish_reason if chat_completion.choices else 'N/A'
  return None, f"Sorry, the primary AI model ({GROQ_LLAMA4_MODEL}) provided an empty or invalid response (Finish Reason: {finish_reason})."
-
  except GroqError as ge:
  logger.error(f"[Groq Primary] Groq API error: {ge.status_code} - {ge.message}", exc_info=False)
  error_msg = f"Sorry, the primary AI service ({GROQ_LLAMA4_MODEL}) failed. API Error: {ge.status_code}."
@@ -575,7 +569,6 @@ async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optio
  logger.error(f"[Groq Primary] Unexpected error during Groq API call: {e}", exc_info=True);
  return None, f"Sorry, an unexpected error occurred while using the primary AI service ({GROQ_LLAMA4_MODEL})."
 
-
  async def _call_gemini(text: str, summary_type: str, model_name: str) -> Tuple[Optional[str], Optional[str]]:
  """Internal function to call Gemini API. Returns (summary, error_message)."""
  global _gemini_api_enabled
@@ -639,7 +632,6 @@ async def _call_gemini(text: str, summary_type: str, model_name: str) -> Tuple[O
  error_msg = f"Sorry, an unexpected error occurred while using the AI service ({model_name})."
  return None, error_msg
 
- # <<< Uses the specific OPENROUTER_DEEPSEEK_MODEL constant >>>
  async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
  """Internal function to call OpenRouter API (Final Fallback - DeepSeek V3 Free). Returns (summary, error_message)."""
  global OPENROUTER_API_KEY, OPENROUTER_DEEPSEEK_MODEL, _openrouter_fallback_enabled
@@ -655,17 +647,13 @@ async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str],
  text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
  full_prompt = f"{prompt}\n\n{text}"
 
- # Use the direct httpx call as before, ensuring the correct model name is in the payload
  headers = {
  "Authorization": f"Bearer {OPENROUTER_API_KEY}",
  "Content-Type": "application/json",
- "HTTP-Referer": os.environ.get("YOUR_SITE_URL", "https://github.com/your-repo"), # Optional header
- "X-Title": os.environ.get("YOUR_SITE_NAME", "TelegramSummariserBot") # Optional header
- }
- payload = {
- "model": OPENROUTER_DEEPSEEK_MODEL, # <<< Use specific DeepSeek model name
- "messages": [{"role": "user", "content": full_prompt}],
+ "HTTP-Referer": os.environ.get("YOUR_SITE_URL", "https://github.com/your-repo"),
+ "X-Title": os.environ.get("YOUR_SITE_NAME", "TelegramSummariserBot")
  }
+ payload = { "model": OPENROUTER_DEEPSEEK_MODEL, "messages": [{"role": "user", "content": full_prompt}], }
  openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
  api_timeouts = httpx.Timeout(connect=10.0, read=60.0, write=10.0, pool=60.0)
  response = None
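OpenRouter's /api/v1/chat/completions endpoint follows the OpenAI chat-completions schema, so on a 200 response the summary sits at choices[0].message.content. A minimal sketch of the extraction the 200-status branch below performs (error handling elided):

# OpenAI-style response shape returned by OpenRouter (sketch, no error handling).
data = response.json()
if data.get("choices"):
    summary = data["choices"][0]["message"]["content"].strip()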
@@ -675,7 +663,6 @@ async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str],
  logger.info(f"[OpenRouter Fallback] Sending request to OpenRouter ({OPENROUTER_DEEPSEEK_MODEL}) with read timeout {api_timeouts.read}s...")
  response = await client.post(openrouter_api_endpoint, headers=headers, json=payload)
  logger.info(f"[OpenRouter Fallback] Received response from OpenRouter. Status code: {response.status_code}")
-
  if response.status_code == 200:
  try:
  data = response.json()
@@ -701,12 +688,10 @@ async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str],
  except Exception: pass
  logger.error(f"[OpenRouter Fallback] Unexpected status {response.status_code}. Error: '{error_info}' Resp:{response.text[:500]}");
  return None, f"Sorry, fallback AI service ({OPENROUTER_DEEPSEEK_MODEL}) returned unexpected status ({response.status_code})."
-
  except httpx.TimeoutException as e: logger.error(f"[OpenRouter Fallback] Timeout error ({type(e)}) connecting/reading from OpenRouter API: {e}"); return None, f"Sorry, the fallback AI service ({OPENROUTER_DEEPSEEK_MODEL}) timed out."
  except httpx.RequestError as e: logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}"); return None, "Sorry, there was an error connecting to the fallback AI model service."
  except Exception as e: logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter call: {e}", exc_info=True); return None, "Sorry, an unexpected error occurred while using the fallback AI service."
 
-
  async def generate_summary(text: str, summary_type: str) -> str:
  """
  Generates summary using the specific model hierarchy (April 2025):
@@ -722,7 +707,7 @@ async def generate_summary(text: str, summary_type: str) -> str:
  logger.info("[Summary Generation] Starting process with specific April 2025 model hierarchy.")
  summary: Optional[str] = None
  errors: Dict[str, Optional[str]] = {
- "Llama4Scout": None, # <<< Use more descriptive keys
+ "Llama4Scout": None,
  "GeminiProExp": None,
  "GeminiFlash": None,
  "DeepSeekV3": None,
@@ -788,9 +773,6 @@ async def generate_summary(text: str, summary_type: str) -> str:
 
 
  # --- Main Processing Logic ---
- # (process_summary_task remains UNCHANGED in its core logic, it correctly calls the updated generate_summary.
- # Omitted here for brevity, but MUST be included in the final file.)
- # --- START OMITTED process_summary_task ---
  async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
  """Handles the entire process: fetching content (with ALL fallbacks) and summarizing."""
  task_id = f"{user_id}-{message_id_to_edit or 'new'}"; logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
@@ -903,13 +885,9 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
  try: await background_request._client.aclose(); logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.")
  except Exception as e: logger.warning(f"[Task {task_id}] Error closing background bot's client: {e}")
  logger.info(f"[Task {task_id}] Task completed. Success: {success}")
- # --- END OMITTED process_summary_task ---
 
 
  # --- Telegram Handlers ---
- # (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler
- # remain UNCHANGED. Omitted here for brevity, but include in final file.)
- # --- START OMITTED TELEGRAM HANDLERS ---
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
  user = update.effective_user; mention = user.mention_html()
  if not user or not update.message: return
@@ -958,7 +936,6 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
  else:
  logger.debug(f"Ignoring non-URL message from {user.id}: {url[:100]}")
 
-
  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
  query = update.callback_query
  if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
@@ -1016,14 +993,9 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
  async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
  """Log Errors caused by Updates."""
  logger.error("Exception while handling an update:", exc_info=context.error)
- # --- END OMITTED TELEGRAM HANDLERS ---
 
 
  # --- Application Setup & Web Framework ---
- # (setup_bot_config, lifespan, telegram_webhook, app definition
- # remain UNCHANGED. health_check is modified below.
- # Omitted here for brevity, include in final file.)
- # --- START OMITTED APP SETUP/WEB FRAMEWORK (excluding health_check) ---
  async def setup_bot_config() -> Application:
  logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
  if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
@@ -1093,32 +1065,6 @@ async def lifespan(app: Starlette):
  else: logger.info("PTB application was not fully initialized or failed during startup. No shutdown actions needed.")
  logger.info("ASGI Lifespan: Shutdown complete.")
 
- async def telegram_webhook(request: Request) -> Response:
- """Handles incoming updates from Telegram."""
- global WEBHOOK_SECRET
- if not ptb_app: logger.error("Webhook received but PTB application not initialized."); return PlainTextResponse('Bot not initialized', status_code=503)
- if not ptb_app.running: logger.warning("Webhook received but PTB application not running."); return PlainTextResponse('Bot not running, cannot process update', status_code=503)
- if WEBHOOK_SECRET:
- token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
- if token_header != WEBHOOK_SECRET: logger.warning(f"Webhook received with invalid secret token. Header: '{token_header}'"); return Response(content="Invalid secret token", status_code=403)
- try:
- update_data = await request.json(); update = Update.de_json(data=update_data, bot=ptb_app.bot)
- logger.debug(f"Processing update_id: {update.update_id} via webhook"); await ptb_app.process_update(update)
- return Response(status_code=200)
- except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
- except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200)
-
- # --- Starlette App Definition ---
- # Note: health_check is defined below
- app = Starlette(
- debug=False,
- lifespan=lifespan,
- routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ]
- )
- logger.info("Starlette ASGI application created with health check and webhook routes.")
- # --- END OMITTED APP SETUP/WEB FRAMEWORK ---
-
- # <<< CHANGE: Updated health check response for specific models >>>
  async def health_check(request: Request) -> PlainTextResponse:
  """Simple health check endpoint."""
  global GROQ_LLAMA4_MODEL, GEMINI_PRO_EXP_MODEL, GEMINI_FLASH_MODEL, OPENROUTER_DEEPSEEK_MODEL, APIFY_ACTOR_ID
@@ -1158,6 +1104,32 @@ async def health_check(request: Request) -> PlainTextResponse:
  f"Web Scrape 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled'}"
  )
 
+ async def telegram_webhook(request: Request) -> Response:
+ """Handles incoming updates from Telegram."""
+ global WEBHOOK_SECRET
+ if not ptb_app: logger.error("Webhook received but PTB application not initialized."); return PlainTextResponse('Bot not initialized', status_code=503)
+ if not ptb_app.running: logger.warning("Webhook received but PTB application not running."); return PlainTextResponse('Bot not running, cannot process update', status_code=503)
+ if WEBHOOK_SECRET:
+ token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
+ if token_header != WEBHOOK_SECRET: logger.warning(f"Webhook received with invalid secret token. Header: '{token_header}'"); return Response(content="Invalid secret token", status_code=403)
+ try:
+ update_data = await request.json(); update = Update.de_json(data=update_data, bot=ptb_app.bot)
+ logger.debug(f"Processing update_id: {update.update_id} via webhook"); await ptb_app.process_update(update)
+ return Response(status_code=200)
+ except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
+ except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200)
+
+ # --- Starlette App Definition ---
+ app = Starlette(
+ debug=False, # Keep False for production/Hugging Face
+ lifespan=lifespan,
+ routes=[
+ Route("/", endpoint=health_check, methods=["GET"]),
+ Route("/webhook", endpoint=telegram_webhook, methods=["POST"]),
+ ]
+ )
+ logger.info("Starlette ASGI application created with health check and webhook routes.")
+
  # --- Development Server (if run directly) ---
  if __name__ == '__main__':
  import uvicorn
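The X-Telegram-Bot-Api-Secret-Token check in telegram_webhook only works if the same secret was supplied when the webhook was registered; Telegram then echoes it back on every update. A sketch of the registration side, assuming python-telegram-bot v20+ (the URL is illustrative, not from this repo):

# Registration side of the secret-token handshake (illustrative URL).
await application.bot.set_webhook(
    url="https://your-space.hf.space/webhook",  # assumed deployment URL
    secret_token=WEBHOOK_SECRET,                # must match the value checked in telegram_webhook
)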
 