fmab777 committed on
Commit
f25ac38
·
verified ·
1 Parent(s): f406730

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +117 -78
main.py CHANGED
@@ -9,6 +9,9 @@ import contextlib
9
  import traceback
10
  import urllib.parse
11
  from typing import Optional, Dict, Any, Tuple
 
 
 
12
 
13
  # --- Frameworks ---
14
  from starlette.applications import Starlette
@@ -101,6 +104,10 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
101
  RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
102
  WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
103
 
 
 
 
 
104
  # --- Model Configurations (Specific April 2025 - Updated Order) ---
105
  # New Model Priority:
106
  # 1. Gemini 2.5 Flash Preview (NEW - Using specific date variant)
@@ -239,6 +246,53 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
239
  return None
240
  except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  # --- YouTube fallback 1: Apify default transcript actor (unique definition – delete any duplicates) ---
243
  async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
244
  """
@@ -302,85 +356,68 @@ async def get_transcript_via_apify_structured_extractor(video_url: str, api_toke
302
 
303
  async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
304
  """
305
- Fetches YouTube transcript using multiple fallback methods in the specified order:
306
- 1. Apify Default Actor (1s7eXiaukVuOr4Ueg)
307
- 2. Apify Structured Actor (gpjTCWkGZS1lHc9pR)
308
- 3. Supadata API
 
309
  """
310
  global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
311
- if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
312
 
313
- logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url}) - NEW Fallback Order")
 
 
 
 
314
  transcript_text: Optional[str] = None
315
 
316
- # --- Primary Method: REMOVED (youtube-transcript-api) ---
317
- # logger.info("[Primary YT] Attempting youtube-transcript-api...") # Removed
318
-
319
- # --- Fallback 1: Apify Default YT Actor (1s7eXiaukVuOr4Ueg) ---
320
- if transcript_text is None:
321
- logger.info("[Fallback YT 1] Trying Apify REST API (Default YT Actor)...") # <<<< NEW Fallback 1
322
- if _apify_token_exists:
323
- transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
324
- if transcript_text:
325
- logger.info(f"[Fallback YT 1] Success via Apify Default YT Actor for {video_url}") # <<<< UPDATED NUMBER
326
- return transcript_text # Return on success
327
- else:
328
- logger.warning(f"[Fallback YT 1] Apify Default YT Actor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
329
  else:
330
- logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping Default YT Actor.") # <<<< UPDATED NUMBER
331
-
332
- # --- Fallback 2: Apify Structured Transcript Extractor (gpjTCWkGZS1lHc9pR) ---
333
- if transcript_text is None:
334
- logger.info("[Fallback YT 2] Trying Apify Structured Transcript Extractor...") # <<<< NEW Fallback 2
335
- if _apify_token_exists:
336
- transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
337
- if transcript_text:
338
- logger.info(f"[Fallback YT 2] Success via Apify Structured Extractor for {video_url}") # <<<< UPDATED NUMBER
339
- return transcript_text # Return on success
340
- else:
341
- logger.warning(f"[Fallback YT 2] Apify Structured Extractor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
342
  else:
343
- logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping Apify Structured Extractor.") # <<<< UPDATED NUMBER
344
-
345
- # --- Fallback 3: Supadata API ---
346
- if transcript_text is None:
347
- logger.info("[Fallback YT 3] Trying Supadata API...") # <<<< NEW Fallback 3
348
- if SUPADATA_API_KEY:
349
- transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
350
- if transcript_text:
351
- logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}") # <<<< UPDATED NUMBER
352
- return transcript_text # Return on success
353
- else:
354
- logger.warning(f"[Fallback YT 3] Supadata failed or no content for {video_id}.") # <<<< UPDATED NUMBER
355
  else:
356
- logger.warning("[Fallback YT 3] Supadata API key unavailable. Skipping.") # <<<< UPDATED NUMBER
357
-
358
- # --- Final Outcome ---
359
- if transcript_text is None:
360
- logger.error(f"All fallback methods failed for YT transcript: {video_id}")
361
- return None # Explicitly return None if all failed
362
-
363
- # This line should only be reached if a fallback succeeded but wasn't returned early (shouldn't happen).
364
- return transcript_text
 
 
 
365
 
366
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
    """
    Download the page at ``url`` with httpx and return its decoded text.

    Redirects are followed and browser-like request headers are sent.
    Returns None (after logging) when the request fails, the status check
    raises, the Content-Type header does not contain "html", or the body
    cannot be decoded.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
    }
    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=browser_headers) as http:
            logger.debug(f"[Web Scrape Direct] Sending GET request to {url}")
            reply = await http.get(url)
            logger.debug(f"[Web Scrape Direct] Received response {reply.status_code} from {url}")
            reply.raise_for_status()

            # Only HTML payloads are useful to the scraper.
            content_type = reply.headers.get('content-type', '').lower()
            if 'html' not in content_type:
                logger.warning(f"[Web Scrape Direct] Non-HTML content type received from {url}: {content_type}")
                return None

            # Accessing .text triggers decoding, which is guarded separately.
            try:
                page_text = reply.text
            except Exception as e:
                logger.error(f"[Web Scrape Direct] Error decoding response text for {url}: {e}")
                return None
            return page_text
    except httpx.HTTPStatusError as e:
        logger.error(f"[Web Scrape Direct] HTTP error {e.response.status_code} fetching {url}: {e}")
    except httpx.TimeoutException:
        logger.error(f"[Web Scrape Direct] Timeout error fetching {url}")
    except httpx.TooManyRedirects:
        logger.error(f"[Web Scrape Direct] Too many redirects fetching {url}")
    except httpx.RequestError as e:
        logger.error(f"[Web Scrape Direct] Request error fetching {url}: {e}")
    except Exception as e:
        logger.error(f"[Web Scrape Direct] Unexpected error fetching {url}: {e}", exc_info=True)
    # Every failure branch above falls through to here.
    return None
385
 
386
  async def get_website_content(url: str) -> Optional[str]:
@@ -562,14 +599,16 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
562
  }
563
  logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID}) with Residential Proxy")
564
  elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
565
- # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
566
- run_input = {
567
- "urls": url, # <<< STRING format needed here, not list
568
- # --- ADDED RESIDENTIAL PROXY CONFIG ---
569
- "proxyConfiguration": {
570
- "useApifyProxy": True,
571
- "apifyProxyGroups": ["RESIDENTIAL"]
572
- }
 
 
573
  # --- END ADDED PROXY CONFIG ---
574
  }
575
  logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID}) with Residential Proxy")
 
9
  import traceback
10
  import urllib.parse
11
  from typing import Optional, Dict, Any, Tuple
12
+ import tempfile, os, asyncio
13
+ from yt_dlp import YoutubeDL
14
+ from huggingface_hub import InferenceClient
15
 
16
  # --- Frameworks ---
17
  from starlette.applications import Starlette
 
104
  RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
105
  WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
106
 
107
+ HUGGINGFACE_HUB_TOKEN = os.environ.get("HUGGINGFACE_HUB_TOKEN")
108
+ # if you don’t set a token it still works on public models (with lower rate limits)
109
+ _inference_client = InferenceClient(token=HUGGINGFACE_HUB_TOKEN)
110
+
111
  # --- Model Configurations (Specific April 2025 - Updated Order) ---
112
  # New Model Priority:
113
  # 1. Gemini 2.5 Flash Preview (NEW - Using specific date variant)
 
246
  return None
247
  except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
248
 
249
+ # ——— new function ———
250
async def get_transcript_via_whisper_inference(video_url: str) -> Optional[str]:
    """
    Fallback YT 4: Download audio via yt-dlp and transcribe with HF's hosted Whisper.

    The download and the inference request are both blocking calls, so they
    run together in a worker thread to keep the event loop responsive.

    Args:
        video_url: URL of the video whose audio should be transcribed.

    Returns:
        The stripped transcript text, or None when the download or the
        transcription fails or produces no text. Errors are logged, not raised.
    """

    def _download_and_transcribe() -> Optional[str]:
        # A private temp directory (instead of a pre-created temp file) matters:
        # yt-dlp does not overwrite an existing media file by default, so a
        # pre-created empty file would be left untouched. The directory is
        # removed automatically, even when an exception is raised.
        with tempfile.TemporaryDirectory() as tmp_dir:
            ydl_opts = {
                "format": "bestaudio/best",
                # Let yt-dlp pick the real extension of the stream it downloads.
                "outtmpl": os.path.join(tmp_dir, "audio.%(ext)s"),
                "quiet": True,
                "no_warnings": True,
            }
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])

            candidates = [os.path.join(tmp_dir, name) for name in sorted(os.listdir(tmp_dir))]
            audio_files = [p for p in candidates if os.path.isfile(p) and os.path.getsize(p) > 0]
            if not audio_files:
                logger.warning("[Fallback YT 4] yt-dlp produced no audio file")
                return None

            # Read inside a context manager so the handle is always closed.
            with open(audio_files[0], "rb") as audio_file:
                audio_bytes = audio_file.read()

        result = _inference_client.automatic_speech_recognition(
            audio_bytes,
            model="openai/whisper-small",
        )
        # Depending on the huggingface_hub version the result is either a plain
        # string or an object/dict carrying the transcript under "text".
        if isinstance(result, str):
            return result
        text = getattr(result, "text", None)
        if text is None and isinstance(result, dict):
            text = result.get("text")
        return text

    try:
        transcript = await asyncio.to_thread(_download_and_transcribe)
    except Exception as e:
        logger.error(f"[Fallback YT 4] Whisper inference error: {e}", exc_info=True)
        return None

    if isinstance(transcript, str) and transcript.strip():
        logger.info(f"[Fallback YT 4] Whisper inference succeeded (len {len(transcript)})")
        return transcript.strip()

    logger.warning("[Fallback YT 4] Whisper inference returned empty transcript")
    return None
295
+
296
  # --- YouTube fallback 1: Apify default transcript actor (unique definition – delete any duplicates) ---
297
  async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
298
  """
 
356
 
357
  async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
358
  """
359
+ Fetches YouTube transcript using multiple fallback methods:
360
+ 1. Apify Default Actor (Fallback 1)
361
+ 2. Apify Structured Actor (Fallback 2)
362
+ 3. Supadata API (Fallback 3)
363
+ 4. Whisper via HF Inference (Fallback 4)
364
  """
365
  global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
 
366
 
367
+ if not video_id:
368
+ logger.error("get_youtube_transcript: No video_id provided")
369
+ return None
370
+
371
+ logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
372
  transcript_text: Optional[str] = None
373
 
374
+ # --- Fallback 1: Apify Default Actor ---
375
+ logger.info("[Fallback YT 1] Trying Apify Default Actor")
376
+ if _apify_token_exists:
377
+ transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
378
+ if transcript_text:
379
+ logger.info(f"[Fallback YT 1] Success via Apify Default Actor for {video_url}")
380
+ return transcript_text
 
 
 
 
 
 
381
  else:
382
+ logger.warning(f"[Fallback YT 1] Apify Default Actor failed or returned no content for {video_url}")
383
+ else:
384
+ logger.warning("[Fallback YT 1] APIFY_API_TOKEN unavailable. Skipping Apify Default Actor.")
385
+
386
+ # --- Fallback 2: Apify Structured Actor ---
387
+ logger.info("[Fallback YT 2] Trying Apify Structured Actor")
388
+ if _apify_token_exists:
389
+ transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
390
+ if transcript_text:
391
+ logger.info(f"[Fallback YT 2] Success via Apify Structured Actor for {video_url}")
392
+ return transcript_text
 
393
  else:
394
+ logger.warning(f"[Fallback YT 2] Apify Structured Actor failed or returned no content for {video_url}")
395
+ else:
396
+ logger.warning("[Fallback YT 2] APIFY_API_TOKEN unavailable. Skipping Apify Structured Actor.")
397
+
398
+ # --- Fallback 3: Supadata API ---
399
+ logger.info("[Fallback YT 3] Trying Supadata API")
400
+ if SUPADATA_API_KEY:
401
+ transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
402
+ if transcript_text:
403
+ logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}")
404
+ return transcript_text
 
405
  else:
406
+ logger.warning(f"[Fallback YT 3] Supadata failed or returned no content for {video_id}")
407
+ else:
408
+ logger.warning("[Fallback YT 3] SUPADATA_API_KEY unavailable. Skipping Supadata API.")
409
+
410
+ # --- Fallback 4: Whisper via HF Inference ---
411
+ logger.info("[Fallback YT 4] Trying audio transcription via Whisper Inference API")
412
+ transcript_text = await get_transcript_via_whisper_inference(video_url)
413
+ if transcript_text:
414
+ logger.info(f"[Fallback YT 4] Success via Whisper Inference for {video_id}")
415
+ return transcript_text
416
+ else:
417
+ logger.warning(f"[Fallback YT 4] Whisper Inference failed or returned empty for {video_id}")
418
 
419
+ # --- All methods failed ---
420
+ logger.error(f"All fallback methods failed for YT transcript: {video_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  return None
422
 
423
  async def get_website_content(url: str) -> Optional[str]:
 
599
  }
600
  logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID}) with Residential Proxy")
601
  elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
602
+ # Input specific to the Structured YT Actor wrap in a list even for a single URL
603
+ run_input = {
604
+ "urls": [ url ], # wrap your URL in a list
605
+ "proxyConfiguration": {
606
+ "useApifyProxy": True,
607
+ "apifyProxyGroups": ["RESIDENTIAL"],
608
+ },
609
+ "maxRetries": 5,
610
+ }
611
+ logger.debug(f"{log_prefix} Using list input format for Structured YT Actor ({actor_id})")
612
  # --- END ADDED PROXY CONFIG ---
613
  }
614
  logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID}) with Residential Proxy")