Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -205,9 +205,11 @@ def extract_youtube_id(url):
|
|
205 |
# --- Content Fetching Functions ---
|
206 |
|
207 |
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
|
|
|
208 |
if not video_id: logger.error("[Supadata] No video_id provided"); return None
|
209 |
if not api_key: logger.error("[Supadata] API key missing."); return None
|
210 |
-
|
|
|
211 |
api_endpoint = "https://api.supadata.ai/v1/youtube/transcript"
|
212 |
params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key}
|
213 |
try:
|
@@ -220,7 +222,10 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
|
|
220 |
content = None
|
221 |
if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
|
222 |
if not content and response.text: content = response.text
|
223 |
-
if content and isinstance(content, str):
|
|
|
|
|
|
|
224 |
else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
|
225 |
except json.JSONDecodeError: logger.warning(f"[Supadata] Received 200 but failed JSON decode for {video_id}. Using raw text if available. Response: {response.text[:200]}"); return response.text.strip() if response.text else None
|
226 |
except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None
|
@@ -235,69 +240,27 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
|
|
235 |
except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
|
236 |
|
237 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
|
238 |
-
"""Fallback YT
|
239 |
global APIFY_ACTOR_ID
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}")
|
252 |
-
|
253 |
-
if response.status_code in [200, 201]:
|
254 |
-
if response.status_code == 201:
|
255 |
-
logger.info(f"[Apify YT] Received status 201 (Created) from run-sync endpoint, processing results anyway.")
|
256 |
-
try: # INNER TRY for JSON processing
|
257 |
-
results = response.json(); content = None
|
258 |
-
# ... (existing logic for processing successful response) ...
|
259 |
-
if isinstance(results, list) and len(results) > 0:
|
260 |
-
# ... (extract content) ...
|
261 |
-
if content and isinstance(content, str):
|
262 |
-
logger.info(f"[Apify YT] Success via REST (Status {response.status_code}) for {video_url}. Length: {len(content)}")
|
263 |
-
return content.strip()
|
264 |
-
else:
|
265 |
-
logger.warning(f"[Apify YT] Dataset item parsed (Status {response.status_code}) but transcript content empty/invalid format for {video_url}. Item keys: {list(results[0].keys() if results and isinstance(results[0],dict) else [])}")
|
266 |
-
return None
|
267 |
-
else:
|
268 |
-
logger.warning(f"[Apify YT] Actor call successful (Status {response.status_code}) but dataset was empty for {video_url}. Response: {results}")
|
269 |
-
return None
|
270 |
-
except json.JSONDecodeError: # INNER EXCEPT
|
271 |
-
logger.error(f"[Apify YT] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}");
|
272 |
-
return None
|
273 |
-
except Exception as e: # INNER EXCEPT
|
274 |
-
logger.error(f"[Apify YT] Error processing success response (Status {response.status_code}) for {video_url}: {e}", exc_info=True);
|
275 |
-
return None
|
276 |
-
# ELIF chain for specific HTTP error codes from the API call
|
277 |
-
elif response.status_code == 400: logger.error(f"[Apify YT] Bad Request (400) for {video_url}. Check payload. Resp:{response.text[:200]}"); return None
|
278 |
-
elif response.status_code == 401: logger.error("[Apify YT] Auth error (401). Check token."); return None
|
279 |
-
elif response.status_code == 404: logger.error(f"[Apify YT] Endpoint/Actor Not Found (404). Actor: {APIFY_ACTOR_ID} Resp:{response.text[:200]}"); return None
|
280 |
-
else: logger.error(f"[Apify YT] Unexpected status {response.status_code} for {video_url}. Resp:{response.text[:200]}"); return None
|
281 |
-
# --- ADDED EXCEPTIONS FOR OUTER TRY ---
|
282 |
-
except httpx.TimeoutException as e:
|
283 |
-
logger.error(f"[Apify YT] Timeout during API interaction for {video_url}: {e}")
|
284 |
-
return None
|
285 |
-
except httpx.HTTPStatusError as e: # In case raise_for_status is added later or for specific HTTP errors not caught above
|
286 |
-
logger.error(f"[Apify YT] HTTP Status Error during API interaction for {video_url}: {e}")
|
287 |
-
return None
|
288 |
-
except httpx.RequestError as e: # Catches connection errors, DNS errors etc.
|
289 |
-
logger.error(f"[Apify YT] Request error during API interaction for {video_url}: {e}")
|
290 |
-
return None
|
291 |
-
except Exception as e: # Catch-all for any other unexpected error during the httpx call setup/execution
|
292 |
-
logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True)
|
293 |
-
return None
|
294 |
|
295 |
async def get_transcript_via_apify_structured_extractor(video_url: str, api_token: str) -> Optional[str]:
|
296 |
-
"""Fallback YT
|
297 |
global APIFY_STRUCTURED_YT_ACTOR_ID # Use the new ID
|
298 |
if not video_url: logger.error("[Apify Structured YT] No video_url provided"); return None
|
299 |
if not api_token: logger.error("[Apify Structured YT] API token missing."); return None
|
300 |
-
|
|
|
301 |
|
302 |
# Use the generic helper function.
|
303 |
# We assume the standard input format used by the helper for non-specific actors
|
@@ -309,90 +272,71 @@ async def get_transcript_via_apify_structured_extractor(video_url: str, api_toke
|
|
309 |
url=video_url,
|
310 |
api_token=api_token,
|
311 |
actor_id=APIFY_STRUCTURED_YT_ACTOR_ID,
|
312 |
-
|
|
|
313 |
)
|
314 |
|
315 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
|
316 |
-
"""
|
317 |
-
|
|
|
|
|
|
|
|
|
|
|
318 |
if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
|
319 |
|
320 |
-
logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
|
321 |
transcript_text: Optional[str] = None
|
322 |
|
323 |
-
# --- Primary Method: youtube-transcript-api ---
|
324 |
-
logger.info("[Primary YT] Attempting youtube-transcript-api...")
|
325 |
-
try:
|
326 |
-
# Prefer English variants first
|
327 |
-
transcript_list = await asyncio.to_thread(
|
328 |
-
YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US']
|
329 |
-
)
|
330 |
-
if transcript_list:
|
331 |
-
transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
|
332 |
-
if transcript_text:
|
333 |
-
logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})")
|
334 |
-
return transcript_text.strip() # Return immediately on success
|
335 |
-
else:
|
336 |
-
logger.warning(f"[Primary YT] Transcript list returned but text content was empty for {video_id}")
|
337 |
-
transcript_text = None # Ensure it's None to trigger fallbacks
|
338 |
-
except NoTranscriptFound:
|
339 |
-
logger.warning(f"[Primary YT] No transcript found via lib for {video_id}.")
|
340 |
-
transcript_text = None
|
341 |
-
except TranscriptsDisabled:
|
342 |
-
logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
|
343 |
-
transcript_text = None
|
344 |
-
except Exception as e:
|
345 |
-
# Log more specific errors if possible, e.g., timeouts, network issues
|
346 |
-
logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}")
|
347 |
-
transcript_text = None
|
348 |
|
349 |
-
# --- Fallback 1: Apify
|
350 |
if transcript_text is None:
|
351 |
-
logger.info("[Fallback YT 1] Trying Apify
|
352 |
if _apify_token_exists:
|
353 |
-
transcript_text = await
|
354 |
if transcript_text:
|
355 |
-
logger.info(f"[Fallback YT 1] Success via Apify
|
356 |
return transcript_text # Return on success
|
357 |
else:
|
358 |
-
logger.warning(f"[Fallback YT 1] Apify
|
359 |
else:
|
360 |
-
logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping
|
361 |
|
362 |
-
# --- Fallback 2:
|
363 |
if transcript_text is None:
|
364 |
-
logger.info("[Fallback YT 2] Trying
|
365 |
-
if
|
366 |
-
transcript_text = await
|
367 |
if transcript_text:
|
368 |
-
logger.info(f"[Fallback YT 2] Success via
|
369 |
return transcript_text # Return on success
|
370 |
else:
|
371 |
-
logger.warning(f"[Fallback YT 2]
|
372 |
else:
|
373 |
-
logger.warning("[Fallback YT 2]
|
374 |
|
375 |
-
# --- Fallback 3:
|
376 |
if transcript_text is None:
|
377 |
-
logger.info("[Fallback YT 3] Trying
|
378 |
-
if
|
379 |
-
|
380 |
-
transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
|
381 |
if transcript_text:
|
382 |
-
logger.info(f"[Fallback YT 3] Success via
|
383 |
return transcript_text # Return on success
|
384 |
else:
|
385 |
-
logger.warning(f"[Fallback YT 3]
|
386 |
else:
|
387 |
-
logger.warning("[Fallback YT 3]
|
388 |
|
389 |
# --- Final Outcome ---
|
390 |
if transcript_text is None:
|
391 |
-
logger.error(f"All methods failed for YT transcript: {video_id}")
|
392 |
return None # Explicitly return None if all failed
|
393 |
|
394 |
-
# This line should
|
395 |
-
# but return transcript_text just in case.
|
396 |
return transcript_text
|
397 |
|
398 |
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
|
@@ -1505,7 +1449,7 @@ async def health_check(request: Request) -> PlainTextResponse:
|
|
1505 |
else: bot_status = "Not Initialized"; bot_username = "N/A"
|
1506 |
|
1507 |
|
1508 |
-
# <<< Update response string with the
|
1509 |
return PlainTextResponse(
|
1510 |
f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
|
1511 |
f"---\n"
|
@@ -1517,12 +1461,12 @@ async def health_check(request: Request) -> PlainTextResponse:
|
|
1517 |
f"5. Groq API: {GROQ_LLAMA4_MODEL if _groq_enabled else 'DISABLED'} (Last Fallback)\n"
|
1518 |
f"---\n"
|
1519 |
f"Content Fetching Status:\n"
|
1520 |
-
# --- YT Fallback List (
|
1521 |
-
f"YT Primary (Lib):
|
1522 |
-
f"YT Fallback 1 (Apify
|
1523 |
-
f"YT Fallback 2 (
|
1524 |
-
f"YT Fallback 3 (
|
1525 |
-
# --- Web Scrape Fallback List (Order
|
1526 |
f"Web Scrape 1 (Direct+BS4): Enabled\n"
|
1527 |
f"Web Scrape 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
|
1528 |
f"Web Scrape 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"
|
|
|
205 |
# --- Content Fetching Functions ---
|
206 |
|
207 |
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
|
208 |
+
"""Fallback YT 3: Fetches YouTube transcript using Supadata API.""" # <<< UPDATED DOCSTRING
|
209 |
if not video_id: logger.error("[Supadata] No video_id provided"); return None
|
210 |
if not api_key: logger.error("[Supadata] API key missing."); return None
|
211 |
+
# <<< UPDATED LOG MESSAGE NUMBER >>>
|
212 |
+
logger.info(f"[YT Fallback 3] Attempting fetch for video ID: {video_id} via Supadata")
|
213 |
api_endpoint = "https://api.supadata.ai/v1/youtube/transcript"
|
214 |
params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key}
|
215 |
try:
|
|
|
222 |
content = None
|
223 |
if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
|
224 |
if not content and response.text: content = response.text
|
225 |
+
if content and isinstance(content, str):
|
226 |
+
# <<< UPDATED LOG MESSAGE NUMBER >>>
|
227 |
+
logger.info(f"[Supadata] Success (Fallback 3) for {video_id}. Length: {len(content)}");
|
228 |
+
return content.strip()
|
229 |
else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
|
230 |
except json.JSONDecodeError: logger.warning(f"[Supadata] Received 200 but failed JSON decode for {video_id}. Using raw text if available. Response: {response.text[:200]}"); return response.text.strip() if response.text else None
|
231 |
except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None
|
|
|
240 |
except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
|
241 |
|
242 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
    """Fallback YT 1: fetch a YouTube transcript via the default Apify actor.

    This is a thin wrapper: it forwards the request to the shared
    ``_run_apify_actor_for_web_content`` runner, selecting the actor through
    the module-level ``APIFY_ACTOR_ID``.

    Args:
        video_url: URL of the YouTube video; handed to the runner as ``url``.
        api_token: Apify API token; handed to the runner unchanged.

    Returns:
        Whatever the generic runner returns (annotated here as
        ``Optional[str]``).
    """
    logger.debug(f"[get_transcript_via_apify - Fallback 1] Calling generic runner for URL: {video_url}")

    # NOTE(review): per the original author's comment, the runner builds the
    # actor-specific run input itself when actor_id equals APIFY_ACTOR_ID.
    # That logic is not visible from this block -- confirm in the runner.
    runner_kwargs = {
        "url": video_url,
        "api_token": api_token,
        "actor_id": APIFY_ACTOR_ID,
        # Label used by the runner (presumably for its own log lines).
        "actor_name": "Apify YT Default (Fallback 1)",
    }
    return await _run_apify_actor_for_web_content(**runner_kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
256 |
|
257 |
async def get_transcript_via_apify_structured_extractor(video_url: str, api_token: str) -> Optional[str]:
|
258 |
+
"""Fallback YT 2: Fetches YouTube transcript using the Structured Extractor Apify Actor.""" # <<< UPDATED DOCSTRING & NUMBER
|
259 |
global APIFY_STRUCTURED_YT_ACTOR_ID # Use the new ID
|
260 |
if not video_url: logger.error("[Apify Structured YT] No video_url provided"); return None
|
261 |
if not api_token: logger.error("[Apify Structured YT] API token missing."); return None
|
262 |
+
# <<< UPDATED LOG MESSAGE NUMBER >>>
|
263 |
+
logger.info(f"[YT Fallback 2] Attempting fetch for URL: {video_url} (Actor: {APIFY_STRUCTURED_YT_ACTOR_ID})")
|
264 |
|
265 |
# Use the generic helper function.
|
266 |
# We assume the standard input format used by the helper for non-specific actors
|
|
|
272 |
url=video_url,
|
273 |
api_token=api_token,
|
274 |
actor_id=APIFY_STRUCTURED_YT_ACTOR_ID,
|
275 |
+
# <<< UPDATED ACTOR NAME IN LOGS >>>
|
276 |
+
actor_name="Apify Structured YT (Fallback 2)" # Specific name for logging
|
277 |
)
|
278 |
|
279 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
    """Fetch a YouTube transcript, trying each provider in a fixed order.

    Providers are attempted one after another and the first one that yields
    non-empty text wins:

    1. Apify default actor   (``get_transcript_via_apify``)
    2. Apify structured actor (``get_transcript_via_apify_structured_extractor``)
    3. Supadata API           (``get_transcript_via_supadata``)

    Both Apify attempts are skipped when ``_apify_token_exists`` is falsy, and
    the Supadata attempt is skipped when ``SUPADATA_API_KEY`` is falsy.

    Any falsy provider result (``None`` *or* an empty string) counts as a
    failure and moves on to the next provider. Previously an empty string
    from one provider suppressed every later fallback, because the follow-up
    checks tested ``is None`` only.

    Args:
        video_id: YouTube video ID; required, passed to Supadata and used in
            log messages.
        video_url: Full video URL; passed to the Apify-based providers.

    Returns:
        The transcript text from the first provider that succeeds, or
        ``None`` if ``video_id`` is missing or every provider fails or is
        unavailable.
    """
    if not video_id:
        logger.error("get_youtube_transcript: No video_id")
        return None

    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url}) - NEW Fallback Order")

    # --- Fallback 1: Apify Default YT Actor ---
    logger.info("[Fallback YT 1] Trying Apify REST API (Default YT Actor)...")
    if _apify_token_exists:
        transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
        if transcript_text:
            logger.info(f"[Fallback YT 1] Success via Apify Default YT Actor for {video_url}")
            return transcript_text
        logger.warning(f"[Fallback YT 1] Apify Default YT Actor failed or no content for {video_url}.")
    else:
        logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping Default YT Actor.")

    # --- Fallback 2: Apify Structured Transcript Extractor ---
    logger.info("[Fallback YT 2] Trying Apify Structured Transcript Extractor...")
    if _apify_token_exists:
        transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
        if transcript_text:
            logger.info(f"[Fallback YT 2] Success via Apify Structured Extractor for {video_url}")
            return transcript_text
        logger.warning(f"[Fallback YT 2] Apify Structured Extractor failed or no content for {video_url}.")
    else:
        logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping Apify Structured Extractor.")

    # --- Fallback 3: Supadata API ---
    logger.info("[Fallback YT 3] Trying Supadata API...")
    if SUPADATA_API_KEY:
        transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
        if transcript_text:
            logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}")
            return transcript_text
        logger.warning(f"[Fallback YT 3] Supadata failed or no content for {video_id}.")
    else:
        logger.warning("[Fallback YT 3] Supadata API key unavailable. Skipping.")

    # --- Final Outcome ---
    # Every successful provider returned early above, so reaching this point
    # means all of them failed or were skipped.
    logger.error(f"All fallback methods failed for YT transcript: {video_id}")
    return None
|
341 |
|
342 |
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
|
|
|
1449 |
else: bot_status = "Not Initialized"; bot_username = "N/A"
|
1450 |
|
1451 |
|
1452 |
+
# <<< Update response string with the NEW YT fallback order >>>
|
1453 |
return PlainTextResponse(
|
1454 |
f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
|
1455 |
f"---\n"
|
|
|
1461 |
f"5. Groq API: {GROQ_LLAMA4_MODEL if _groq_enabled else 'DISABLED'} (Last Fallback)\n"
|
1462 |
f"---\n"
|
1463 |
f"Content Fetching Status:\n"
|
1464 |
+
# --- YT Fallback List (NEW ORDER) ---
|
1465 |
+
f"YT Primary (Lib): REMOVED\n"
|
1466 |
+
f"YT Fallback 1 (Apify Default): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
|
1467 |
+
f"YT Fallback 2 (Apify Structured): {APIFY_STRUCTURED_YT_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
|
1468 |
+
f"YT Fallback 3 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
|
1469 |
+
# --- Web Scrape Fallback List (Order Unchanged) ---
|
1470 |
f"Web Scrape 1 (Direct+BS4): Enabled\n"
|
1471 |
f"Web Scrape 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
|
1472 |
f"Web Scrape 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"
|