fmab777 committed on
Commit
4afcd87
·
verified ·
1 Parent(s): 09c445f

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +63 -119
main.py CHANGED
@@ -205,9 +205,11 @@ def extract_youtube_id(url):
205
  # --- Content Fetching Functions ---
206
 
207
  async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
 
208
  if not video_id: logger.error("[Supadata] No video_id provided"); return None
209
  if not api_key: logger.error("[Supadata] API key missing."); return None
210
- logger.info(f"[YT Fallback 1] Attempting fetch for video ID: {video_id} via Supadata")
 
211
  api_endpoint = "https://api.supadata.ai/v1/youtube/transcript"
212
  params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key}
213
  try:
@@ -220,7 +222,10 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
220
  content = None
221
  if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
222
  if not content and response.text: content = response.text
223
- if content and isinstance(content, str): logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}"); return content.strip()
 
 
 
224
  else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
225
  except json.JSONDecodeError: logger.warning(f"[Supadata] Received 200 but failed JSON decode for {video_id}. Using raw text if available. Response: {response.text[:200]}"); return response.text.strip() if response.text else None
226
  except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None
@@ -235,69 +240,27 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
235
  except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
236
 
237
  async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
238
- """Fallback YT 2: Fetches YouTube transcript using default Apify Actor."""
239
  global APIFY_ACTOR_ID
240
- if not video_url: logger.error("[Apify YT] No video_url provided"); return None
241
- if not api_token: logger.error("[Apify YT] API token missing."); return None
242
- logger.info(f"[YT Fallback 2] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
243
- sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
244
- params = {"token": api_token}
245
- payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5, "channelHandleBoolean": False, "channelNameBoolean": False, "datePublishedBoolean": False, "relativeDateTextBoolean": False, }
246
- headers = {"Content-Type": "application/json"}
247
- try: # OUTER TRY Block
248
- async with httpx.AsyncClient(timeout=120.0) as client:
249
- logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}")
250
- response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
251
- logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}")
252
-
253
- if response.status_code in [200, 201]:
254
- if response.status_code == 201:
255
- logger.info(f"[Apify YT] Received status 201 (Created) from run-sync endpoint, processing results anyway.")
256
- try: # INNER TRY for JSON processing
257
- results = response.json(); content = None
258
- # ... (existing logic for processing successful response) ...
259
- if isinstance(results, list) and len(results) > 0:
260
- # ... (extract content) ...
261
- if content and isinstance(content, str):
262
- logger.info(f"[Apify YT] Success via REST (Status {response.status_code}) for {video_url}. Length: {len(content)}")
263
- return content.strip()
264
- else:
265
- logger.warning(f"[Apify YT] Dataset item parsed (Status {response.status_code}) but transcript content empty/invalid format for {video_url}. Item keys: {list(results[0].keys() if results and isinstance(results[0],dict) else [])}")
266
- return None
267
- else:
268
- logger.warning(f"[Apify YT] Actor call successful (Status {response.status_code}) but dataset was empty for {video_url}. Response: {results}")
269
- return None
270
- except json.JSONDecodeError: # INNER EXCEPT
271
- logger.error(f"[Apify YT] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}");
272
- return None
273
- except Exception as e: # INNER EXCEPT
274
- logger.error(f"[Apify YT] Error processing success response (Status {response.status_code}) for {video_url}: {e}", exc_info=True);
275
- return None
276
- # ELIF chain for specific HTTP error codes from the API call
277
- elif response.status_code == 400: logger.error(f"[Apify YT] Bad Request (400) for {video_url}. Check payload. Resp:{response.text[:200]}"); return None
278
- elif response.status_code == 401: logger.error("[Apify YT] Auth error (401). Check token."); return None
279
- elif response.status_code == 404: logger.error(f"[Apify YT] Endpoint/Actor Not Found (404). Actor: {APIFY_ACTOR_ID} Resp:{response.text[:200]}"); return None
280
- else: logger.error(f"[Apify YT] Unexpected status {response.status_code} for {video_url}. Resp:{response.text[:200]}"); return None
281
- # --- ADDED EXCEPTIONS FOR OUTER TRY ---
282
- except httpx.TimeoutException as e:
283
- logger.error(f"[Apify YT] Timeout during API interaction for {video_url}: {e}")
284
- return None
285
- except httpx.HTTPStatusError as e: # In case raise_for_status is added later or for specific HTTP errors not caught above
286
- logger.error(f"[Apify YT] HTTP Status Error during API interaction for {video_url}: {e}")
287
- return None
288
- except httpx.RequestError as e: # Catches connection errors, DNS errors etc.
289
- logger.error(f"[Apify YT] Request error during API interaction for {video_url}: {e}")
290
- return None
291
- except Exception as e: # Catch-all for any other unexpected error during the httpx call setup/execution
292
- logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True)
293
- return None
294
 
295
  async def get_transcript_via_apify_structured_extractor(video_url: str, api_token: str) -> Optional[str]:
296
- """Fallback YT 1: Fetches YouTube transcript using the Structured Extractor Apify Actor."""
297
  global APIFY_STRUCTURED_YT_ACTOR_ID # Use the new ID
298
  if not video_url: logger.error("[Apify Structured YT] No video_url provided"); return None
299
  if not api_token: logger.error("[Apify Structured YT] API token missing."); return None
300
- logger.info(f"[YT Fallback 1] Attempting fetch for URL: {video_url} (Actor: {APIFY_STRUCTURED_YT_ACTOR_ID})")
 
301
 
302
  # Use the generic helper function.
303
  # We assume the standard input format used by the helper for non-specific actors
@@ -309,90 +272,71 @@ async def get_transcript_via_apify_structured_extractor(video_url: str, api_toke
309
  url=video_url,
310
  api_token=api_token,
311
  actor_id=APIFY_STRUCTURED_YT_ACTOR_ID,
312
- actor_name="Apify Structured YT" # Specific name for logging
 
313
  )
314
 
315
  async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
316
- """Fetches YouTube transcript using a primary library and multiple fallback methods."""
317
- global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists # Keep existing globals
 
 
 
 
 
318
  if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
319
 
320
- logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
321
  transcript_text: Optional[str] = None
322
 
323
- # --- Primary Method: youtube-transcript-api ---
324
- logger.info("[Primary YT] Attempting youtube-transcript-api...")
325
- try:
326
- # Prefer English variants first
327
- transcript_list = await asyncio.to_thread(
328
- YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US']
329
- )
330
- if transcript_list:
331
- transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
332
- if transcript_text:
333
- logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})")
334
- return transcript_text.strip() # Return immediately on success
335
- else:
336
- logger.warning(f"[Primary YT] Transcript list returned but text content was empty for {video_id}")
337
- transcript_text = None # Ensure it's None to trigger fallbacks
338
- except NoTranscriptFound:
339
- logger.warning(f"[Primary YT] No transcript found via lib for {video_id}.")
340
- transcript_text = None
341
- except TranscriptsDisabled:
342
- logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
343
- transcript_text = None
344
- except Exception as e:
345
- # Log more specific errors if possible, e.g., timeouts, network issues
346
- logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}")
347
- transcript_text = None
348
 
349
- # --- Fallback 1: Apify Structured Transcript Extractor (NEW) ---
350
  if transcript_text is None:
351
- logger.info("[Fallback YT 1] Trying Apify Structured Transcript Extractor...")
352
  if _apify_token_exists:
353
- transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
354
  if transcript_text:
355
- logger.info(f"[Fallback YT 1] Success via Apify Structured Extractor for {video_url}")
356
  return transcript_text # Return on success
357
  else:
358
- logger.warning(f"[Fallback YT 1] Apify Structured Extractor failed or no content for {video_url}.")
359
  else:
360
- logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping Apify Structured Extractor.")
361
 
362
- # --- Fallback 2: Supadata API ---
363
  if transcript_text is None:
364
- logger.info("[Fallback YT 2] Trying Supadata API...") # <<<< UPDATED NUMBER
365
- if SUPADATA_API_KEY:
366
- transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
367
  if transcript_text:
368
- logger.info(f"[Fallback YT 2] Success via Supadata for {video_id}") # <<<< UPDATED NUMBER
369
  return transcript_text # Return on success
370
  else:
371
- logger.warning(f"[Fallback YT 2] Supadata failed or no content for {video_id}.") # <<<< UPDATED NUMBER
372
  else:
373
- logger.warning("[Fallback YT 2] Supadata API key unavailable. Skipping.") # <<<< UPDATED NUMBER
374
 
375
- # --- Fallback 3: Apify Default YT Actor ---
376
  if transcript_text is None:
377
- logger.info("[Fallback YT 3] Trying Apify REST API (Default YT Actor)...") # <<<< UPDATED NUMBER
378
- if _apify_token_exists:
379
- # This function already uses the generic helper _run_apify_actor_for_web_content
380
- transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
381
  if transcript_text:
382
- logger.info(f"[Fallback YT 3] Success via Apify Default YT Actor for {video_url}") # <<<< UPDATED NUMBER
383
  return transcript_text # Return on success
384
  else:
385
- logger.warning(f"[Fallback YT 3] Apify Default YT Actor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
386
  else:
387
- logger.warning("[Fallback YT 3] Apify API token unavailable. Skipping Default YT Actor.") # <<<< UPDATED NUMBER
388
 
389
  # --- Final Outcome ---
390
  if transcript_text is None:
391
- logger.error(f"All methods failed for YT transcript: {video_id}")
392
  return None # Explicitly return None if all failed
393
 
394
- # This line should theoretically not be reached if logic above is correct,
395
- # but return transcript_text just in case.
396
  return transcript_text
397
 
398
  async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
@@ -1505,7 +1449,7 @@ async def health_check(request: Request) -> PlainTextResponse:
1505
  else: bot_status = "Not Initialized"; bot_username = "N/A"
1506
 
1507
 
1508
- # <<< Update response string with the new model order >>>
1509
  return PlainTextResponse(
1510
  f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
1511
  f"---\n"
@@ -1517,12 +1461,12 @@ async def health_check(request: Request) -> PlainTextResponse:
1517
  f"5. Groq API: {GROQ_LLAMA4_MODEL if _groq_enabled else 'DISABLED'} (Last Fallback)\n"
1518
  f"---\n"
1519
  f"Content Fetching Status:\n"
1520
- # --- YT Fallback List (Order already correct) ---
1521
- f"YT Primary (Lib): Enabled\n"
1522
- f"YT Fallback 1 (Apify Structured): {APIFY_STRUCTURED_YT_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
1523
- f"YT Fallback 2 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
1524
- f"YT Fallback 3 (Apify Default): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
1525
- # --- Web Scrape Fallback List (Order already correct) ---
1526
  f"Web Scrape 1 (Direct+BS4): Enabled\n"
1527
  f"Web Scrape 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
1528
  f"Web Scrape 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"
 
205
  # --- Content Fetching Functions ---
206
 
207
  async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
208
+ """Fallback YT 3: Fetches YouTube transcript using Supadata API.""" # <<< UPDATED DOCSTRING
209
  if not video_id: logger.error("[Supadata] No video_id provided"); return None
210
  if not api_key: logger.error("[Supadata] API key missing."); return None
211
+ # <<< UPDATED LOG MESSAGE NUMBER >>>
212
+ logger.info(f"[YT Fallback 3] Attempting fetch for video ID: {video_id} via Supadata")
213
  api_endpoint = "https://api.supadata.ai/v1/youtube/transcript"
214
  params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key}
215
  try:
 
222
  content = None
223
  if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
224
  if not content and response.text: content = response.text
225
+ if content and isinstance(content, str):
226
+ # <<< UPDATED LOG MESSAGE NUMBER >>>
227
+ logger.info(f"[Supadata] Success (Fallback 3) for {video_id}. Length: {len(content)}");
228
+ return content.strip()
229
  else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
230
  except json.JSONDecodeError: logger.warning(f"[Supadata] Received 200 but failed JSON decode for {video_id}. Using raw text if available. Response: {response.text[:200]}"); return response.text.strip() if response.text else None
231
  except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None
 
240
  except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
241
 
242
  async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
243
+ """Fallback YT 1: Fetches YouTube transcript using default Apify Actor via generic function.""" # <<< UPDATED DOCSTRING & NUMBER
244
  global APIFY_ACTOR_ID
245
+ # The specific run_input logic is now handled within _run_apify_actor_for_web_content
246
+ # when it detects the actor_id matches APIFY_ACTOR_ID
247
+ # <<< UPDATED LOG MESSAGE >>>
248
+ logger.debug(f"[get_transcript_via_apify - Fallback 1] Calling generic runner for URL: {video_url}")
249
+ return await _run_apify_actor_for_web_content(
250
+ url=video_url, # Pass video_url as the 'url' parameter
251
+ api_token=api_token,
252
+ actor_id=APIFY_ACTOR_ID,
253
+ # <<< UPDATED ACTOR NAME IN LOGS >>>
254
+ actor_name="Apify YT Default (Fallback 1)"
255
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
  async def get_transcript_via_apify_structured_extractor(video_url: str, api_token: str) -> Optional[str]:
258
+ """Fallback YT 2: Fetches YouTube transcript using the Structured Extractor Apify Actor.""" # <<< UPDATED DOCSTRING & NUMBER
259
  global APIFY_STRUCTURED_YT_ACTOR_ID # Use the new ID
260
  if not video_url: logger.error("[Apify Structured YT] No video_url provided"); return None
261
  if not api_token: logger.error("[Apify Structured YT] API token missing."); return None
262
+ # <<< UPDATED LOG MESSAGE NUMBER >>>
263
+ logger.info(f"[YT Fallback 2] Attempting fetch for URL: {video_url} (Actor: {APIFY_STRUCTURED_YT_ACTOR_ID})")
264
 
265
  # Use the generic helper function.
266
  # We assume the standard input format used by the helper for non-specific actors
 
272
  url=video_url,
273
  api_token=api_token,
274
  actor_id=APIFY_STRUCTURED_YT_ACTOR_ID,
275
+ # <<< UPDATED ACTOR NAME IN LOGS >>>
276
+ actor_name="Apify Structured YT (Fallback 2)" # Specific name for logging
277
  )
278
 
279
  async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
280
+ """
281
+ Fetches YouTube transcript using multiple fallback methods in the specified order:
282
+ 1. Apify Default Actor (1s7eXiaukVuOr4Ueg)
283
+ 2. Apify Structured Actor (gpjTCWkGZS1lHc9pR)
284
+ 3. Supadata API
285
+ """
286
+ global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
287
  if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
288
 
289
+ logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url}) - NEW Fallback Order")
290
  transcript_text: Optional[str] = None
291
 
292
+ # --- Primary Method: REMOVED (youtube-transcript-api) ---
293
+ # logger.info("[Primary YT] Attempting youtube-transcript-api...") # Removed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
+ # --- Fallback 1: Apify Default YT Actor (1s7eXiaukVuOr4Ueg) ---
296
  if transcript_text is None:
297
+ logger.info("[Fallback YT 1] Trying Apify REST API (Default YT Actor)...") # <<<< NEW Fallback 1
298
  if _apify_token_exists:
299
+ transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
300
  if transcript_text:
301
+ logger.info(f"[Fallback YT 1] Success via Apify Default YT Actor for {video_url}") # <<<< UPDATED NUMBER
302
  return transcript_text # Return on success
303
  else:
304
+ logger.warning(f"[Fallback YT 1] Apify Default YT Actor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
305
  else:
306
+ logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping Default YT Actor.") # <<<< UPDATED NUMBER
307
 
308
+ # --- Fallback 2: Apify Structured Transcript Extractor (gpjTCWkGZS1lHc9pR) ---
309
  if transcript_text is None:
310
+ logger.info("[Fallback YT 2] Trying Apify Structured Transcript Extractor...") # <<<< NEW Fallback 2
311
+ if _apify_token_exists:
312
+ transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
313
  if transcript_text:
314
+ logger.info(f"[Fallback YT 2] Success via Apify Structured Extractor for {video_url}") # <<<< UPDATED NUMBER
315
  return transcript_text # Return on success
316
  else:
317
+ logger.warning(f"[Fallback YT 2] Apify Structured Extractor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
318
  else:
319
+ logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping Apify Structured Extractor.") # <<<< UPDATED NUMBER
320
 
321
+ # --- Fallback 3: Supadata API ---
322
  if transcript_text is None:
323
+ logger.info("[Fallback YT 3] Trying Supadata API...") # <<<< NEW Fallback 3
324
+ if SUPADATA_API_KEY:
325
+ transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
 
326
  if transcript_text:
327
+ logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}") # <<<< UPDATED NUMBER
328
  return transcript_text # Return on success
329
  else:
330
+ logger.warning(f"[Fallback YT 3] Supadata failed or no content for {video_id}.") # <<<< UPDATED NUMBER
331
  else:
332
+ logger.warning("[Fallback YT 3] Supadata API key unavailable. Skipping.") # <<<< UPDATED NUMBER
333
 
334
  # --- Final Outcome ---
335
  if transcript_text is None:
336
+ logger.error(f"All fallback methods failed for YT transcript: {video_id}")
337
  return None # Explicitly return None if all failed
338
 
339
+ # Reached only when a fallback returned a falsy non-None value (e.g. an empty string after strip()); that value is passed through to the caller as-is.
 
340
  return transcript_text
341
 
342
  async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
 
1449
  else: bot_status = "Not Initialized"; bot_username = "N/A"
1450
 
1451
 
1452
+ # <<< Update response string with the NEW YT fallback order >>>
1453
  return PlainTextResponse(
1454
  f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
1455
  f"---\n"
 
1461
  f"5. Groq API: {GROQ_LLAMA4_MODEL if _groq_enabled else 'DISABLED'} (Last Fallback)\n"
1462
  f"---\n"
1463
  f"Content Fetching Status:\n"
1464
+ # --- YT Fallback List (NEW ORDER) ---
1465
+ f"YT Primary (Lib): REMOVED\n"
1466
+ f"YT Fallback 1 (Apify Default): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
1467
+ f"YT Fallback 2 (Apify Structured): {APIFY_STRUCTURED_YT_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
1468
+ f"YT Fallback 3 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
1469
+ # --- Web Scrape Fallback List (Order Unchanged) ---
1470
  f"Web Scrape 1 (Direct+BS4): Enabled\n"
1471
  f"Web Scrape 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
1472
  f"Web Scrape 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"