fmab777 committed on
Commit
899ace9
·
verified ·
1 Parent(s): 16d7cee

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +107 -24
main.py CHANGED
@@ -111,6 +111,7 @@ GEMINI_FLASH_MODEL = os.environ.get("GEMINI_FLASH_MODEL", "gemini-2.0-flash-001"
111
  OPENROUTER_DEEPSEEK_MODEL = os.environ.get("OPENROUTER_DEEPSEEK_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Specific DeepSeek model
112
 
113
  APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "1s7eXiaukVuOr4Ueg") # YT Default
 
114
  APIFY_CRAWLER_ACTOR_ID = "aYG0l9s7dbB7j3gbS" # Scrape Fallback 4
115
  APIFY_TEXT_SCRAPER_ACTOR_ID = "2gbQiRSpJIIag2FdR" # Scrape Fallback 5
116
 
@@ -147,9 +148,12 @@ logger.info(f"Summarizer 1 (Groq): {GROQ_LLAMA4_MODEL if _groq_enabled else 'DIS
147
  logger.info(f"Summarizer 2 (Gemini Pro Exp): {GEMINI_PRO_EXP_MODEL if _gemini_api_enabled else 'DISABLED'}")
148
  logger.info(f"Summarizer 3 (Gemini Flash): {GEMINI_FLASH_MODEL if _gemini_api_enabled else 'DISABLED'}")
149
  logger.info(f"Summarizer 4 (OpenRouter): {OPENROUTER_DEEPSEEK_MODEL if _openrouter_fallback_enabled else 'DISABLED'}")
150
- logger.info(f"Using Apify Actor (YT Default): {APIFY_ACTOR_ID}")
 
 
151
  logger.info(f"Using Apify Actor (Web Scrape Fallback 4): {APIFY_CRAWLER_ACTOR_ID}")
152
  logger.info(f"Using Apify Actor (Web Scrape Fallback 5): {APIFY_TEXT_SCRAPER_ACTOR_ID}")
 
153
 
154
  # Flags for scraper key existence
155
  _apify_token_exists = bool(APIFY_API_TOKEN)
@@ -275,35 +279,107 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
275
  logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True)
276
  return None
277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
279
- global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
 
280
  if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
 
281
  logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
282
- transcript_text = None
 
 
283
  logger.info("[Primary YT] Attempting youtube-transcript-api...")
284
  try:
285
- transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
286
- if transcript_list: transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
287
- if transcript_text: logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})"); return transcript_text
288
- else: logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}"); transcript_text = None
289
- except NoTranscriptFound: logger.warning(f"[Primary YT] No transcript found via lib for {video_id}.")
290
- except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
291
- except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  if transcript_text is None:
293
- logger.info("[Fallback YT 1] Trying Supadata API...")
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  if SUPADATA_API_KEY:
295
  transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
296
- if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
297
- else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
298
- else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
 
 
 
 
 
 
299
  if transcript_text is None:
300
- logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...")
301
  if _apify_token_exists:
 
302
  transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
303
- if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text
304
- else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.")
305
- else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
306
- if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
 
 
 
 
 
 
 
 
 
 
 
307
  return transcript_text
308
 
309
  async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
@@ -1274,10 +1350,12 @@ async def lifespan(app: Starlette):
1274
 
1275
  async def health_check(request: Request) -> PlainTextResponse:
1276
  """Simple health check endpoint."""
1277
- global GROQ_LLAMA4_MODEL, GEMINI_PRO_EXP_MODEL, GEMINI_FLASH_MODEL, OPENROUTER_DEEPSEEK_MODEL, APIFY_ACTOR_ID
 
1278
  global _groq_enabled, _gemini_api_enabled, _openrouter_fallback_enabled
1279
  global _apify_token_exists, _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY
1280
 
 
1281
  bot_status = "Not Initialized"; bot_username = "N/A"
1282
  if ptb_app:
1283
  try:
@@ -1292,7 +1370,8 @@ async def health_check(request: Request) -> PlainTextResponse:
1292
  bot_status = f"Error checking status: {type(e).__name__}"; logger.warning(f"Health check: Error getting bot info: {e}")
1293
  else: bot_status = "Not Initialized"; bot_username = "N/A"
1294
 
1295
- # <<< Update response string with specific model names >>>
 
1296
  return PlainTextResponse(
1297
  f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
1298
  f"---\n"
@@ -1303,12 +1382,16 @@ async def health_check(request: Request) -> PlainTextResponse:
1303
  f"4. OpenRouter API: {OPENROUTER_DEEPSEEK_MODEL if _openrouter_fallback_enabled else 'DISABLED'}\n"
1304
  f"---\n"
1305
  f"Content Fetching Status:\n"
1306
- f"YT Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
1307
- f"YT Fallback 2 (Apify Actor): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
 
 
 
 
1308
  f"Web Scrape 1 (Direct+BS4): Enabled\n"
1309
  f"Web Scrape 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
1310
  f"Web Scrape 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"
1311
- f"Web Scrape 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled'}"
1312
  )
1313
 
1314
  async def telegram_webhook(request: Request) -> Response:
 
111
  OPENROUTER_DEEPSEEK_MODEL = os.environ.get("OPENROUTER_DEEPSEEK_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Specific DeepSeek model
112
 
113
  APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "1s7eXiaukVuOr4Ueg") # YT Default
114
+ APIFY_STRUCTURED_YT_ACTOR_ID = "gpjTCWkGZS1lHc9pR" # YT Fallback 1 (New Structured Extractor)
115
  APIFY_CRAWLER_ACTOR_ID = "aYG0l9s7dbB7j3gbS" # Scrape Fallback 4
116
  APIFY_TEXT_SCRAPER_ACTOR_ID = "2gbQiRSpJIIag2FdR" # Scrape Fallback 5
117
 
 
148
  logger.info(f"Summarizer 2 (Gemini Pro Exp): {GEMINI_PRO_EXP_MODEL if _gemini_api_enabled else 'DISABLED'}")
149
  logger.info(f"Summarizer 3 (Gemini Flash): {GEMINI_FLASH_MODEL if _gemini_api_enabled else 'DISABLED'}")
150
  logger.info(f"Summarizer 4 (OpenRouter): {OPENROUTER_DEEPSEEK_MODEL if _openrouter_fallback_enabled else 'DISABLED'}")
151
+ # --- Updated Logging ---
152
+ logger.info(f"Using Apify Actor (YT Fallback 1 - Structured): {APIFY_STRUCTURED_YT_ACTOR_ID}")
153
+ logger.info(f"Using Apify Actor (YT Fallback 3 - Default): {APIFY_ACTOR_ID}") # Now fallback 3
154
  logger.info(f"Using Apify Actor (Web Scrape Fallback 4): {APIFY_CRAWLER_ACTOR_ID}")
155
  logger.info(f"Using Apify Actor (Web Scrape Fallback 5): {APIFY_TEXT_SCRAPER_ACTOR_ID}")
156
+ # --- End Updated Logging ---
157
 
158
  # Flags for scraper key existence
159
  _apify_token_exists = bool(APIFY_API_TOKEN)
 
279
  logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True)
280
  return None
281
 
282
async def get_transcript_via_apify_structured_extractor(video_url: str, api_token: str) -> Optional[str]:
    """Fallback YT 1: fetch a YouTube transcript via the Structured Extractor Apify actor.

    Thin wrapper that delegates to the shared ``_run_apify_actor_for_web_content``
    helper, pointing it at ``APIFY_STRUCTURED_YT_ACTOR_ID``.

    Args:
        video_url: Full URL of the YouTube video to extract a transcript for.
        api_token: Apify API token used to run the actor.

    Returns:
        Whatever the shared helper returns for this actor, or ``None`` when
        either argument is missing.
    """
    # Guard clauses: nothing can be fetched without both a URL and a token.
    if not video_url:
        logger.error("[Apify Structured YT] No video_url provided")
        return None
    if not api_token:
        logger.error("[Apify Structured YT] API token missing.")
        return None

    actor_id = APIFY_STRUCTURED_YT_ACTOR_ID
    logger.info(f"[YT Fallback 1] Attempting fetch for URL: {video_url} (Actor: {actor_id})")

    # NOTE(review): this relies on the generic helper building an input payload
    # this actor accepts and on its output parsing recognising the actor's
    # caption/text fields -- confirm against the actor's documentation.
    extracted = await _run_apify_actor_for_web_content(
        url=video_url,
        api_token=api_token,
        actor_id=actor_id,
        actor_name="Apify Structured YT",  # Label used by the helper's logging
    )
    return extracted
301
+
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
    """Fetch a YouTube transcript using the primary library and ordered fallbacks.

    Methods are tried in this order, and the first one that yields non-empty
    text wins:

      1. ``youtube-transcript-api`` (English variants).
      2. Apify Structured Transcript Extractor (requires the Apify token).
      3. Supadata API (requires ``SUPADATA_API_KEY``).
      4. Apify default YT actor (requires the Apify token).

    Every stage is attempted independently: an empty result (``None`` *or*
    ``""``) from one stage never prevents the following stages from running.

    Args:
        video_id: YouTube video ID; used by the library and by Supadata.
        video_url: Full video URL; used by the Apify actors.

    Returns:
        The non-empty transcript text from the first successful method, or
        ``None`` when ``video_id`` is missing or every method fails.
    """
    if not video_id:
        logger.error("get_youtube_transcript: No video_id")
        return None

    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")

    # --- Primary Method: youtube-transcript-api ---
    logger.info("[Primary YT] Attempting youtube-transcript-api...")
    try:
        # The library call is blocking, so run it in a worker thread.
        # Prefer English variants first.
        transcript_list = await asyncio.to_thread(
            YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US']
        )
        # Strip BEFORE testing for success so a whitespace-only transcript is
        # treated as a failure and the fallbacks still get a chance.
        transcript_text = " ".join(
            item['text'] for item in (transcript_list or []) if 'text' in item
        ).strip()
        if transcript_text:
            logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})")
            return transcript_text
        logger.warning(f"[Primary YT] Transcript list returned but text content was empty for {video_id}")
    except NoTranscriptFound:
        logger.warning(f"[Primary YT] No transcript found via lib for {video_id}.")
    except TranscriptsDisabled:
        logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
    except Exception as e:
        # Best-effort boundary: any library/network failure falls through to the fallbacks.
        logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}")

    # --- Fallback 1: Apify Structured Transcript Extractor ---
    logger.info("[Fallback YT 1] Trying Apify Structured Transcript Extractor...")
    if _apify_token_exists:
        transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
        if transcript_text:
            logger.info(f"[Fallback YT 1] Success via Apify Structured Extractor for {video_url}")
            return transcript_text
        logger.warning(f"[Fallback YT 1] Apify Structured Extractor failed or no content for {video_url}.")
    else:
        logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping Apify Structured Extractor.")

    # --- Fallback 2: Supadata API ---
    logger.info("[Fallback YT 2] Trying Supadata API...")
    if SUPADATA_API_KEY:
        transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
        if transcript_text:
            logger.info(f"[Fallback YT 2] Success via Supadata for {video_id}")
            return transcript_text
        logger.warning(f"[Fallback YT 2] Supadata failed or no content for {video_id}.")
    else:
        logger.warning("[Fallback YT 2] Supadata API key unavailable. Skipping.")

    # --- Fallback 3: Apify Default YT Actor ---
    logger.info("[Fallback YT 3] Trying Apify REST API (Default YT Actor)...")
    if _apify_token_exists:
        transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
        if transcript_text:
            logger.info(f"[Fallback YT 3] Success via Apify Default YT Actor for {video_url}")
            return transcript_text
        logger.warning(f"[Fallback YT 3] Apify Default YT Actor failed or no content for {video_url}.")
    else:
        logger.warning("[Fallback YT 3] Apify API token unavailable. Skipping Default YT Actor.")

    # --- Final Outcome ---
    # Every success path returned above, so reaching this point means all methods failed.
    logger.error(f"All methods failed for YT transcript: {video_id}")
    return None
384
 
385
  async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
 
1350
 
1351
  async def health_check(request: Request) -> PlainTextResponse:
1352
  """Simple health check endpoint."""
1353
+ global GROQ_LLAMA4_MODEL, GEMINI_PRO_EXP_MODEL, GEMINI_FLASH_MODEL, OPENROUTER_DEEPSEEK_MODEL
1354
+ global APIFY_ACTOR_ID, APIFY_STRUCTURED_YT_ACTOR_ID # Add new ID here
1355
  global _groq_enabled, _gemini_api_enabled, _openrouter_fallback_enabled
1356
  global _apify_token_exists, _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY
1357
 
1358
+ # ... (rest of the bot status checking logic remains the same) ...
1359
  bot_status = "Not Initialized"; bot_username = "N/A"
1360
  if ptb_app:
1361
  try:
 
1370
  bot_status = f"Error checking status: {type(e).__name__}"; logger.warning(f"Health check: Error getting bot info: {e}")
1371
  else: bot_status = "Not Initialized"; bot_username = "N/A"
1372
 
1373
+
1374
+ # <<< Update response string with specific model names AND YT Fallback order >>>
1375
  return PlainTextResponse(
1376
  f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
1377
  f"---\n"
 
1382
  f"4. OpenRouter API: {OPENROUTER_DEEPSEEK_MODEL if _openrouter_fallback_enabled else 'DISABLED'}\n"
1383
  f"---\n"
1384
  f"Content Fetching Status:\n"
1385
+ # --- Updated YT Fallback List ---
1386
+ f"YT Primary (Lib): Enabled\n"
1387
+ f"YT Fallback 1 (Apify Structured): {APIFY_STRUCTURED_YT_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
1388
+ f"YT Fallback 2 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
1389
+ f"YT Fallback 3 (Apify Default): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
1390
+ # --- End Updated List ---
1391
  f"Web Scrape 1 (Direct+BS4): Enabled\n"
1392
  f"Web Scrape 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
1393
  f"Web Scrape 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"
1394
+ f"Web Scrape 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled'}" # Web scrape order already updated previously
1395
  )
1396
 
1397
  async def telegram_webhook(request: Request) -> Response: