Update main.py
main.py
CHANGED
@@ -111,6 +111,7 @@ GEMINI_FLASH_MODEL = os.environ.get("GEMINI_FLASH_MODEL", "gemini-2.0-flash-001"
 OPENROUTER_DEEPSEEK_MODEL = os.environ.get("OPENROUTER_DEEPSEEK_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Specific DeepSeek model
 
 APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "1s7eXiaukVuOr4Ueg") # YT Default
+APIFY_STRUCTURED_YT_ACTOR_ID = "gpjTCWkGZS1lHc9pR" # YT Fallback 1 (New Structured Extractor)
 APIFY_CRAWLER_ACTOR_ID = "aYG0l9s7dbB7j3gbS" # Scrape Fallback 4
 APIFY_TEXT_SCRAPER_ACTOR_ID = "2gbQiRSpJIIag2FdR" # Scrape Fallback 5
 
@@ -147,9 +148,12 @@ logger.info(f"Summarizer 1 (Groq): {GROQ_LLAMA4_MODEL if _groq_enabled else 'DIS
 logger.info(f"Summarizer 2 (Gemini Pro Exp): {GEMINI_PRO_EXP_MODEL if _gemini_api_enabled else 'DISABLED'}")
 logger.info(f"Summarizer 3 (Gemini Flash): {GEMINI_FLASH_MODEL if _gemini_api_enabled else 'DISABLED'}")
 logger.info(f"Summarizer 4 (OpenRouter): {OPENROUTER_DEEPSEEK_MODEL if _openrouter_fallback_enabled else 'DISABLED'}")
-
+# --- Updated Logging ---
+logger.info(f"Using Apify Actor (YT Fallback 1 - Structured): {APIFY_STRUCTURED_YT_ACTOR_ID}")
+logger.info(f"Using Apify Actor (YT Fallback 3 - Default): {APIFY_ACTOR_ID}") # Now fallback 3
 logger.info(f"Using Apify Actor (Web Scrape Fallback 4): {APIFY_CRAWLER_ACTOR_ID}")
 logger.info(f"Using Apify Actor (Web Scrape Fallback 5): {APIFY_TEXT_SCRAPER_ACTOR_ID}")
+# --- End Updated Logging ---
 
 # Flags for scraper key existence
 _apify_token_exists = bool(APIFY_API_TOKEN)
@@ -275,35 +279,107 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
         logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True)
         return None
 
+async def get_transcript_via_apify_structured_extractor(video_url: str, api_token: str) -> Optional[str]:
+    """Fallback YT 1: Fetches YouTube transcript using the Structured Extractor Apify Actor."""
+    global APIFY_STRUCTURED_YT_ACTOR_ID # Use the new ID
+    if not video_url: logger.error("[Apify Structured YT] No video_url provided"); return None
+    if not api_token: logger.error("[Apify Structured YT] API token missing."); return None
+    logger.info(f"[YT Fallback 1] Attempting fetch for URL: {video_url} (Actor: {APIFY_STRUCTURED_YT_ACTOR_ID})")
+
+    # Use the generic helper function.
+    # We assume the standard input format used by the helper for non-specific actors
+    # (like the text scraper: {"urls": [url]}) is sufficient, or that the helper's
+    # existing logic correctly identifies and parses the output from this actor.
+    # The helper already has logic to parse various text/content/captions fields,
+    # including list structures for captions as described in the actor's docs.
+    return await _run_apify_actor_for_web_content(
+        url=video_url,
+        api_token=api_token,
+        actor_id=APIFY_STRUCTURED_YT_ACTOR_ID,
+        actor_name="Apify Structured YT" # Specific name for logging
+    )
+
 async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
-
+    """Fetches YouTube transcript using a primary library and multiple fallback methods."""
+    global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists # Keep existing globals
     if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
+
     logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
-    transcript_text = None
+    transcript_text: Optional[str] = None
+
+    # --- Primary Method: youtube-transcript-api ---
     logger.info("[Primary YT] Attempting youtube-transcript-api...")
     try:
-
-
-
-
-
-
-
+        # Prefer English variants first
+        transcript_list = await asyncio.to_thread(
+            YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US']
+        )
+        if transcript_list:
+            transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
+            if transcript_text:
+                logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})")
+                return transcript_text.strip() # Return immediately on success
+            else:
+                logger.warning(f"[Primary YT] Transcript list returned but text content was empty for {video_id}")
+                transcript_text = None # Ensure it's None to trigger fallbacks
+    except NoTranscriptFound:
+        logger.warning(f"[Primary YT] No transcript found via lib for {video_id}.")
+        transcript_text = None
+    except TranscriptsDisabled:
+        logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
+        transcript_text = None
+    except Exception as e:
+        # Log more specific errors if possible, e.g., timeouts, network issues
+        logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}")
+        transcript_text = None
+
+    # --- Fallback 1: Apify Structured Transcript Extractor (NEW) ---
     if transcript_text is None:
-        logger.info("[Fallback YT 1] Trying
+        logger.info("[Fallback YT 1] Trying Apify Structured Transcript Extractor...")
+        if _apify_token_exists:
+            transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
+            if transcript_text:
+                logger.info(f"[Fallback YT 1] Success via Apify Structured Extractor for {video_url}")
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 1] Apify Structured Extractor failed or no content for {video_url}.")
+        else:
+            logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping Apify Structured Extractor.")
+
+    # --- Fallback 2: Supadata API ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 2] Trying Supadata API...") # <<<< UPDATED NUMBER
         if SUPADATA_API_KEY:
             transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
-            if transcript_text:
-
-
+            if transcript_text:
+                logger.info(f"[Fallback YT 2] Success via Supadata for {video_id}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 2] Supadata failed or no content for {video_id}.") # <<<< UPDATED NUMBER
+        else:
+            logger.warning("[Fallback YT 2] Supadata API key unavailable. Skipping.") # <<<< UPDATED NUMBER
+
+    # --- Fallback 3: Apify Default YT Actor ---
     if transcript_text is None:
-        logger.info("[Fallback YT
+        logger.info("[Fallback YT 3] Trying Apify REST API (Default YT Actor)...") # <<<< UPDATED NUMBER
         if _apify_token_exists:
+            # This function already uses the generic helper _run_apify_actor_for_web_content
             transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
-            if transcript_text:
-
-
-
+            if transcript_text:
+                logger.info(f"[Fallback YT 3] Success via Apify Default YT Actor for {video_url}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 3] Apify Default YT Actor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
+        else:
+            logger.warning("[Fallback YT 3] Apify API token unavailable. Skipping Default YT Actor.") # <<<< UPDATED NUMBER
+
+    # --- Final Outcome ---
+    if transcript_text is None:
+        logger.error(f"All methods failed for YT transcript: {video_id}")
+        return None # Explicitly return None if all failed
+
+    # This line should theoretically not be reached if logic above is correct,
+    # but return transcript_text just in case.
     return transcript_text
 
 async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
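Note on the comment block inside the new `get_transcript_via_apify_structured_extractor`: the generic helper `_run_apify_actor_for_web_content` is not shown in this diff, so the sketch below only illustrates what that helper is assumed to do for this actor, based on the diff's own comments (an input of the form `{"urls": [url]}` and parsing of text/content/captions fields) and Apify's public run-sync-get-dataset-items endpoint. The real helper in main.py may differ.

```python
# Hedged sketch only: the actual helper lives elsewhere in main.py and may use a
# different input payload or parsing order. The endpoint shown is Apify's public
# POST /v2/acts/{actorId}/run-sync-get-dataset-items API.
import httpx
from typing import Optional

async def _run_apify_actor_sketch(url: str, api_token: str, actor_id: str) -> Optional[str]:
    endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
    async with httpx.AsyncClient(timeout=120.0) as client:
        resp = await client.post(endpoint, params={"token": api_token}, json={"urls": [url]})
        resp.raise_for_status()
        items = resp.json()  # dataset items come back as a JSON list

    if not items:
        return None
    item = items[0]

    # Mirror the parsing described in the diff's comments: plain text/content fields
    # first, then a captions field that may be a list of segment dicts.
    for key in ("text", "content"):
        value = item.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip()
    captions = item.get("captions")
    if isinstance(captions, list):
        parts = [seg.get("text", "") if isinstance(seg, dict) else str(seg) for seg in captions]
        joined = " ".join(p for p in parts if p).strip()
        return joined or None
    return None
```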
@@ -1274,10 +1350,12 @@ async def lifespan(app: Starlette):
 
 async def health_check(request: Request) -> PlainTextResponse:
     """Simple health check endpoint."""
-    global GROQ_LLAMA4_MODEL, GEMINI_PRO_EXP_MODEL, GEMINI_FLASH_MODEL, OPENROUTER_DEEPSEEK_MODEL
+    global GROQ_LLAMA4_MODEL, GEMINI_PRO_EXP_MODEL, GEMINI_FLASH_MODEL, OPENROUTER_DEEPSEEK_MODEL
+    global APIFY_ACTOR_ID, APIFY_STRUCTURED_YT_ACTOR_ID # Add new ID here
     global _groq_enabled, _gemini_api_enabled, _openrouter_fallback_enabled
     global _apify_token_exists, _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY
 
+    # ... (rest of the bot status checking logic remains the same) ...
     bot_status = "Not Initialized"; bot_username = "N/A"
     if ptb_app:
         try:
@@ -1292,7 +1370,8 @@ async def health_check(request: Request) -> PlainTextResponse:
             bot_status = f"Error checking status: {type(e).__name__}"; logger.warning(f"Health check: Error getting bot info: {e}")
     else: bot_status = "Not Initialized"; bot_username = "N/A"
 
-
+
+    # <<< Update response string with specific model names AND YT Fallback order >>>
     return PlainTextResponse(
         f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
         f"---\n"
@@ -1303,12 +1382,16 @@
         f"4. OpenRouter API: {OPENROUTER_DEEPSEEK_MODEL if _openrouter_fallback_enabled else 'DISABLED'}\n"
         f"---\n"
         f"Content Fetching Status:\n"
-
-        f"YT
+        # --- Updated YT Fallback List ---
+        f"YT Primary (Lib): Enabled\n"
+        f"YT Fallback 1 (Apify Structured): {APIFY_STRUCTURED_YT_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
+        f"YT Fallback 2 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
+        f"YT Fallback 3 (Apify Default): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
+        # --- End Updated List ---
         f"Web Scrape 1 (Direct+BS4): Enabled\n"
         f"Web Scrape 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
        f"Web Scrape 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"
-        f"Web Scrape 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled'}"
+        f"Web Scrape 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled'}" # Web scrape order already updated previously
     )
 
 async def telegram_webhook(request: Request) -> Response:
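Once deployed, the updated health_check output (including the new YT fallback list) can be eyeballed with a plain GET. The base URL and the route path below are placeholders; the diff does not show how the health route is mounted.

```python
# Hypothetical check of the health endpoint; BASE_URL and the "/" path are
# placeholders, not taken from this diff.
import httpx

BASE_URL = "https://<your-space>.hf.space"  # placeholder

def print_health() -> None:
    resp = httpx.get(f"{BASE_URL}/", timeout=30.0)
    resp.raise_for_status()
    print(resp.text)  # should include the "YT Fallback 1 (Apify Structured): ..." lines

if __name__ == "__main__":
    print_health()
```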
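As a final sanity check on the new fallback order, a small local smoke test of `get_youtube_transcript` might look like the following. The module name `main`, the video ID, and the presence of any API keys in the environment are assumptions for illustration, not part of this commit.

```python
# Hypothetical smoke test; "main", the video ID, and any API keys in the
# environment are placeholders/assumptions rather than part of this diff.
import asyncio
import main  # the Space's main.py

async def smoke_test() -> None:
    video_id = "dQw4w9WgXcQ"  # placeholder video ID
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    text = await main.get_youtube_transcript(video_id, video_url)
    if text:
        print(f"Got transcript ({len(text)} chars) via the primary method or one of the 3 fallbacks")
    else:
        print("All transcript methods failed (check logs for [Primary YT] / [Fallback YT n] lines)")

if __name__ == "__main__":
    asyncio.run(smoke_test())
```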