Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -9,6 +9,9 @@ import contextlib
|
|
9 |
import traceback
|
10 |
import urllib.parse
|
11 |
from typing import Optional, Dict, Any, Tuple
|
|
|
|
|
|
|
12 |
|
13 |
# --- Frameworks ---
|
14 |
from starlette.applications import Starlette
|
@@ -101,6 +104,10 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
|
|
101 |
RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
|
102 |
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
|
103 |
|
|
|
|
|
|
|
|
|
104 |
# --- Model Configurations (Specific April 2025 - Updated Order) ---
|
105 |
# New Model Priority:
|
106 |
# 1. Gemini 2.5 Flash Preview (NEW - Using specific date variant)
|
@@ -239,6 +246,53 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
|
|
239 |
return None
|
240 |
except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
|
241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
242 |
# --- YouTube fallback 1: Apify default transcript actor (unique definition – delete any duplicates) ---
|
243 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
|
244 |
"""
|
@@ -302,85 +356,68 @@ async def get_transcript_via_apify_structured_extractor(video_url: str, api_toke
|
|
302 |
|
303 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
|
304 |
"""
|
305 |
-
Fetches YouTube transcript using multiple fallback methods
|
306 |
-
|
307 |
-
|
308 |
-
|
|
|
309 |
"""
|
310 |
global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
|
311 |
-
if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
|
312 |
|
313 |
-
|
|
|
|
|
|
|
|
|
314 |
transcript_text: Optional[str] = None
|
315 |
|
316 |
-
# ---
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
|
324 |
-
if transcript_text:
|
325 |
-
logger.info(f"[Fallback YT 1] Success via Apify Default YT Actor for {video_url}") # <<<< UPDATED NUMBER
|
326 |
-
return transcript_text # Return on success
|
327 |
-
else:
|
328 |
-
logger.warning(f"[Fallback YT 1] Apify Default YT Actor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
|
329 |
else:
|
330 |
-
logger.warning("[Fallback YT
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
logger.warning(f"[Fallback YT 2] Apify Structured Extractor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
|
342 |
else:
|
343 |
-
logger.warning("[Fallback YT
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
logger.warning(f"[Fallback YT 3] Supadata failed or no content for {video_id}.") # <<<< UPDATED NUMBER
|
355 |
else:
|
356 |
-
logger.warning("[Fallback YT
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
|
|
|
|
|
|
365 |
|
366 |
-
|
367 |
-
"
|
368 |
-
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
|
369 |
-
try:
|
370 |
-
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
|
371 |
-
logger.debug(f"[Web Scrape Direct] Sending GET request to {url}")
|
372 |
-
response = await client.get(url)
|
373 |
-
logger.debug(f"[Web Scrape Direct] Received response {response.status_code} from {url}")
|
374 |
-
response.raise_for_status()
|
375 |
-
content_type = response.headers.get('content-type', '').lower()
|
376 |
-
if 'html' not in content_type: logger.warning(f"[Web Scrape Direct] Non-HTML content type received from {url}: {content_type}"); return None
|
377 |
-
try: return response.text
|
378 |
-
except Exception as e: logger.error(f"[Web Scrape Direct] Error decoding response text for {url}: {e}"); return None
|
379 |
-
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Direct] HTTP error {e.response.status_code} fetching {url}: {e}")
|
380 |
-
except httpx.TimeoutException: logger.error(f"[Web Scrape Direct] Timeout error fetching {url}")
|
381 |
-
except httpx.TooManyRedirects: logger.error(f"[Web Scrape Direct] Too many redirects fetching {url}")
|
382 |
-
except httpx.RequestError as e: logger.error(f"[Web Scrape Direct] Request error fetching {url}: {e}")
|
383 |
-
except Exception as e: logger.error(f"[Web Scrape Direct] Unexpected error fetching {url}: {e}", exc_info=True)
|
384 |
return None
|
385 |
|
386 |
async def get_website_content(url: str) -> Optional[str]:
|
@@ -562,14 +599,16 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
562 |
}
|
563 |
logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID}) with Residential Proxy")
|
564 |
elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
-
"
|
570 |
-
|
571 |
-
|
572 |
-
|
|
|
|
|
573 |
# --- END ADDED PROXY CONFIG ---
|
574 |
}
|
575 |
logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID}) with Residential Proxy")
|
|
|
9 |
import traceback
|
10 |
import urllib.parse
|
11 |
from typing import Optional, Dict, Any, Tuple
|
12 |
+
import tempfile, os, asyncio
|
13 |
+
from yt_dlp import YoutubeDL
|
14 |
+
from huggingface_hub import InferenceClient
|
15 |
|
16 |
# --- Frameworks ---
|
17 |
from starlette.applications import Starlette
|
|
|
104 |
RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
|
105 |
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
|
106 |
|
107 |
+
HUGGINGFACE_HUB_TOKEN = os.environ.get("HUGGINGFACE_HUB_TOKEN")
|
108 |
+
# if you don’t set a token it still works on public models (with lower rate limits)
|
109 |
+
_inference_client = InferenceClient(token=HUGGINGFACE_HUB_TOKEN)
|
110 |
+
|
111 |
# --- Model Configurations (Specific April 2025 - Updated Order) ---
|
112 |
# New Model Priority:
|
113 |
# 1. Gemini 2.5 Flash Preview (NEW - Using specific date variant)
|
|
|
246 |
return None
|
247 |
except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
|
248 |
|
249 |
+
# --- YouTube fallback 4: Whisper transcription via Hugging Face Inference API ---
|
250 |
+
async def get_transcript_via_whisper_inference(video_url: str) -> Optional[str]:
    """
    Fallback YT 4: Download audio via yt-dlp and transcribe with HF's hosted Whisper.

    Args:
        video_url: URL of the video whose audio track should be transcribed.

    Returns:
        The stripped transcript text, or None when the download or the
        transcription fails or produces no text. Errors are logged, never raised.
    """
    import shutil  # local import: only needed for temp-directory cleanup here

    def _download_and_transcribe() -> Optional[str]:
        # yt-dlp and the HF client are both blocking, so this whole helper is
        # executed in a worker thread (see asyncio.to_thread below).
        workdir = tempfile.mkdtemp(prefix="yt_whisper_")
        try:
            ydl_opts = {
                "format": "bestaudio/best",
                # Let yt-dlp create the output file itself inside a fresh
                # directory: a pre-created (empty) file at the output path can
                # be treated as "already downloaded" and the download skipped.
                # %(ext)s also keeps the real container extension.
                "outtmpl": os.path.join(workdir, "audio.%(ext)s"),
                "quiet": True,
                "no_warnings": True,
            }
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])

            # Pick the finished download (ignore partial/empty leftovers).
            candidates = [
                os.path.join(workdir, entry)
                for entry in sorted(os.listdir(workdir))
                if not entry.endswith(".part")
            ]
            audio_paths = [
                path for path in candidates
                if os.path.isfile(path) and os.path.getsize(path) > 0
            ]
            if not audio_paths:
                logger.warning("[Fallback YT 4] yt-dlp produced no audio file")
                return None

            # Read the bytes inside a context manager so the handle is closed.
            with open(audio_paths[0], "rb") as audio_file:
                audio_bytes = audio_file.read()

            # InferenceClient exposes speech-to-text as
            # automatic_speech_recognition(); it has no audio_to_text method.
            result = _inference_client.automatic_speech_recognition(
                audio_bytes,
                model="openai/whisper-small",
            )
        finally:
            # Best-effort cleanup; a failed delete must not mask the result.
            shutil.rmtree(workdir, ignore_errors=True)

        # Depending on the huggingface_hub version the result is either a
        # plain string or an object/dict carrying a "text" field.
        if isinstance(result, str):
            return result
        text = getattr(result, "text", None)
        if text is None and isinstance(result, dict):
            text = result.get("text")
        return text

    try:
        transcript = await asyncio.to_thread(_download_and_transcribe)
    except Exception as e:
        logger.error(f"[Fallback YT 4] Whisper inference error: {e}", exc_info=True)
        return None

    if isinstance(transcript, str) and transcript.strip():
        logger.info(f"[Fallback YT 4] Whisper inference succeeded (len {len(transcript)})")
        return transcript.strip()

    logger.warning("[Fallback YT 4] Whisper inference returned empty transcript")
    return None
|
295 |
+
|
296 |
# --- YouTube fallback 1: Apify default transcript actor (unique definition – delete any duplicates) ---
|
297 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
|
298 |
"""
|
|
|
356 |
|
357 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
    """
    Obtain a YouTube transcript by trying each provider in turn and
    returning the first non-empty result.

    Order of attempts:
      1. Apify Default Actor        (needs the Apify token)
      2. Apify Structured Actor     (needs the Apify token)
      3. Supadata API               (needs SUPADATA_API_KEY)
      4. Whisper via HF Inference   (always attempted)

    Args:
        video_id: YouTube video ID; used for Supadata and in log messages.
        video_url: Full video URL; used by the Apify actors and Whisper.

    Returns:
        The transcript text from the first provider that succeeds, or None
        when video_id is empty or every provider fails.
    """
    # Module-level credentials are only read here, never rebound.
    if not video_id:
        logger.error("get_youtube_transcript: No video_id provided")
        return None

    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")

    # --- Attempt 1: Apify Default Actor ---
    logger.info("[Fallback YT 1] Trying Apify Default Actor")
    if not _apify_token_exists:
        logger.warning("[Fallback YT 1] APIFY_API_TOKEN unavailable. Skipping Apify Default Actor.")
    elif (apify_text := await get_transcript_via_apify(video_url, APIFY_API_TOKEN)):
        logger.info(f"[Fallback YT 1] Success via Apify Default Actor for {video_url}")
        return apify_text
    else:
        logger.warning(f"[Fallback YT 1] Apify Default Actor failed or returned no content for {video_url}")

    # --- Attempt 2: Apify Structured Actor ---
    logger.info("[Fallback YT 2] Trying Apify Structured Actor")
    if not _apify_token_exists:
        logger.warning("[Fallback YT 2] APIFY_API_TOKEN unavailable. Skipping Apify Structured Actor.")
    elif (structured_text := await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)):
        logger.info(f"[Fallback YT 2] Success via Apify Structured Actor for {video_url}")
        return structured_text
    else:
        logger.warning(f"[Fallback YT 2] Apify Structured Actor failed or returned no content for {video_url}")

    # --- Attempt 3: Supadata API ---
    logger.info("[Fallback YT 3] Trying Supadata API")
    if not SUPADATA_API_KEY:
        logger.warning("[Fallback YT 3] SUPADATA_API_KEY unavailable. Skipping Supadata API.")
    elif (supadata_text := await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)):
        logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}")
        return supadata_text
    else:
        logger.warning(f"[Fallback YT 3] Supadata failed or returned no content for {video_id}")

    # --- Attempt 4: Whisper via HF Inference (no credential gate) ---
    logger.info("[Fallback YT 4] Trying audio transcription via Whisper Inference API")
    whisper_text = await get_transcript_via_whisper_inference(video_url)
    if whisper_text:
        logger.info(f"[Fallback YT 4] Success via Whisper Inference for {video_id}")
        return whisper_text
    logger.warning(f"[Fallback YT 4] Whisper Inference failed or returned empty for {video_id}")

    # --- Every provider came back empty ---
    logger.error(f"All fallback methods failed for YT transcript: {video_id}")
    return None
|
422 |
|
423 |
async def get_website_content(url: str) -> Optional[str]:
|
|
|
599 |
}
|
600 |
logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID}) with Residential Proxy")
|
601 |
elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
|
602 |
+
# Input specific to the Structured YT Actor – wrap in a list even for a single URL
|
603 |
+
run_input = {
|
604 |
+
"urls": [ url ], # ← wrap your URL in a list
|
605 |
+
"proxyConfiguration": {
|
606 |
+
"useApifyProxy": True,
|
607 |
+
"apifyProxyGroups": ["RESIDENTIAL"],
|
608 |
+
},
|
609 |
+
"maxRetries": 5,
|
610 |
+
}
|
611 |
+
logger.debug(f"{log_prefix} Using list input format for Structured YT Actor ({actor_id})")
|
612 |
# --- END ADDED PROXY CONFIG ---
|
613 |
}
|
614 |
logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID}) with Residential Proxy")
|