Update main.py
main.py
CHANGED
@@ -9,9 +9,6 @@ import contextlib
 import traceback
 import urllib.parse
 from typing import Optional, Dict, Any, Tuple
-import tempfile, os, asyncio
-from yt_dlp import YoutubeDL
-from huggingface_hub import InferenceClient
 
 # --- Frameworks ---
 from starlette.applications import Starlette
@@ -104,10 +101,6 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
 RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
 WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
 
-HUGGINGFACE_HUB_TOKEN = os.environ.get("HUGGINGFACE_HUB_TOKEN")
-# if you don’t set a token it still works on public models (with lower rate limits)
-_inference_client = InferenceClient(token=HUGGINGFACE_HUB_TOKEN)
-
 # --- Model Configurations (Specific April 2025 - Updated Order) ---
 # New Model Priority:
 # 1. Gemini 2.5 Flash Preview (NEW - Using specific date variant)
@@ -246,90 +239,19 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
         return None
     except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
 
-# ——— new function ———
-async def get_transcript_via_whisper_inference(video_url: str) -> Optional[str]:
-    """
-    Fallback YT 4: Download audio via yt-dlp and transcribe with HF's hosted Whisper.
-    """
-    # 1) download best audio to a temp file
-    tmp_f = None
-    try:
-        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
-        tmp_f = tmp.name
-        tmp.close()
-
-        ydl_opts = {
-            "format": "bestaudio/best",
-            "outtmpl": tmp_f,
-            "quiet": True,
-            "no_warnings": True,
-        }
-        with YoutubeDL(ydl_opts) as ydl:
-            ydl.download([video_url])
-
-        # 2) call the HF inference api in a thread (it’s blocking)
-        def _transcribe():
-            result = _inference_client.audio_to_text(
-                model="openai/whisper-small",
-                inputs=open(tmp_f, "rb"),
-            )
-            # HF returns {"text": "..."}
-            return result.get("text")
-
-        transcript = await asyncio.to_thread(_transcribe)
-        if transcript and isinstance(transcript, str) and transcript.strip():
-            logger.info(f"[Fallback YT 4] Whisper inference succeeded (len {len(transcript)})")
-            return transcript.strip()
-        else:
-            logger.warning("[Fallback YT 4] Whisper inference returned empty transcript")
-            return None
-
-    except Exception as e:
-        logger.error(f"[Fallback YT 4] Whisper inference error: {e}", exc_info=True)
-        return None
-
-    finally:
-        if tmp_f and os.path.exists(tmp_f):
-            try: os.remove(tmp_f)
-            except: pass
-
-# --- YouTube fallback 1: Apify default transcript actor (unique definition – delete any duplicates) ---
 async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
-    """
-    Fetch the YouTube transcript with Apify’s default “YouTube Transcript” actor
-    (ID `1s7eXiaukVuOr4Ueg`).
-    The helper `_run_apify_actor_for_web_content` supplies the correct `run_input`
-    (including the residential proxy group), handles retries, and parses the returned
-    dataset into plain text.
-
-    Parameters
-    ----------
-    video_url : str
-        Full YouTube URL supplied by the user.
-    api_token : str
-        Apify API token that has residential proxy credit.
-
-    Returns
-    -------
-    Optional[str]
-        Combined transcript text or `None` if all attempts fail.
-    """
+    """Fallback YT 1: Fetches YouTube transcript using default Apify Actor via generic function.""" # <<< UPDATED DOCSTRING & NUMBER
     global APIFY_ACTOR_ID
-
-    #
-
-        return None
-    if not api_token:
-        logger.error("[Apify YT] API token missing.")
-        return None
-
-    logger.info(f"[Apify YT] Attempting transcript fetch via actor {APIFY_ACTOR_ID}")
+    # The specific run_input logic is now handled within _run_apify_actor_for_web_content
+    # when it detects the actor_id matches APIFY_ACTOR_ID
+    # <<< UPDATED LOG MESSAGE >>>
+    logger.debug(f"[get_transcript_via_apify - Fallback 1] Calling generic runner for URL: {video_url}")
     return await _run_apify_actor_for_web_content(
-        url=video_url,
+        url=video_url, # Pass video_url as the 'url' parameter
         api_token=api_token,
         actor_id=APIFY_ACTOR_ID,
-
+        # <<< UPDATED ACTOR NAME IN LOGS >>>
+        actor_name="Apify YT Default (Fallback 1)"
     )
 
 async def get_transcript_via_apify_structured_extractor(video_url: str, api_token: str) -> Optional[str]:
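Review note: the deleted Whisper fallback bundled two reusable ideas, a throwaway temp file for the downloaded audio and `asyncio.to_thread` to keep the blocking download/transcription off the event loop. A minimal, self-contained sketch of that pattern (the `blocking_transcribe` stub stands in for the yt-dlp plus hosted-Whisper work and is not part of main.py):

```python
import asyncio
import os
import tempfile
from typing import Optional

def blocking_transcribe(audio_path: str) -> str:
    # Stand-in for blocking work (yt-dlp download + hosted speech-to-text call).
    return f"transcript for {audio_path}"

async def transcribe_async(video_url: str) -> Optional[str]:
    # Reserve a temp file name for the downloaded audio, mirroring the removed code.
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    tmp_path = tmp.name
    tmp.close()
    try:
        # Run the blocking call in a worker thread so the event loop stays responsive.
        text = await asyncio.to_thread(blocking_transcribe, tmp_path)
        return text.strip() or None
    finally:
        # Clean up the temp file on every path, success or failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

if __name__ == "__main__":
    print(asyncio.run(transcribe_async("https://www.youtube.com/watch?v=example")))
```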
@@ -356,68 +278,85 @@ async def get_transcript_via_apify_structured_extractor(video_url: str, api_token: str) -> Optional[str]:
 
 async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
     """
-    Fetches YouTube transcript using multiple fallback methods:
-    4. Whisper via HF Inference (Fallback 4)
+    Fetches YouTube transcript using multiple fallback methods in the specified order:
+    1. Apify Default Actor (1s7eXiaukVuOr4Ueg)
+    2. Apify Structured Actor (gpjTCWkGZS1lHc9pR)
+    3. Supadata API
     """
     global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
-        logger.error("get_youtube_transcript: No video_id provided")
-        return None
-
-    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
+    if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
 
+    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url}) - NEW Fallback Order")
     transcript_text: Optional[str] = None
 
-    # ---
-    logger.info("[
+    # --- Primary Method: REMOVED (youtube-transcript-api) ---
+    # logger.info("[Primary YT] Attempting youtube-transcript-api...") # Removed
+
+    # --- Fallback 1: Apify Default YT Actor (1s7eXiaukVuOr4Ueg) ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 1] Trying Apify REST API (Default YT Actor)...") # <<<< NEW Fallback 1
+        if _apify_token_exists:
+            transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
+            if transcript_text:
+                logger.info(f"[Fallback YT 1] Success via Apify Default YT Actor for {video_url}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 1] Apify Default YT Actor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
         else:
-        logger.warning(
+            logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping Default YT Actor.") # <<<< UPDATED NUMBER
+
+    # --- Fallback 2: Apify Structured Transcript Extractor (gpjTCWkGZS1lHc9pR) ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 2] Trying Apify Structured Transcript Extractor...") # <<<< NEW Fallback 2
+        if _apify_token_exists:
+            transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
+            if transcript_text:
+                logger.info(f"[Fallback YT 2] Success via Apify Structured Extractor for {video_url}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 2] Apify Structured Extractor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
         else:
-        logger.warning(
+            logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping Apify Structured Extractor.") # <<<< UPDATED NUMBER
+
+    # --- Fallback 3: Supadata API ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 3] Trying Supadata API...") # <<<< NEW Fallback 3
+        if SUPADATA_API_KEY:
+            transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
+            if transcript_text:
+                logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 3] Supadata failed or no content for {video_id}.") # <<<< UPDATED NUMBER
         else:
-        logger.warning(
-        if
-            return transcript_text
-        else:
-            logger.warning(f"[Fallback YT 4] Whisper Inference failed or returned empty for {video_id}")
+            logger.warning("[Fallback YT 3] Supadata API key unavailable. Skipping.") # <<<< UPDATED NUMBER
+
+    # --- Final Outcome ---
+    if transcript_text is None:
+        logger.error(f"All fallback methods failed for YT transcript: {video_id}")
+        return None # Explicitly return None if all failed
+
+    # This line should only be reached if a fallback succeeded but wasn't returned early (shouldn't happen).
+    return transcript_text
 
+async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
+    """Directly fetches URL content using httpx."""
+    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
+    try:
+        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
+            logger.debug(f"[Web Scrape Direct] Sending GET request to {url}")
+            response = await client.get(url)
+            logger.debug(f"[Web Scrape Direct] Received response {response.status_code} from {url}")
+            response.raise_for_status()
+            content_type = response.headers.get('content-type', '').lower()
+            if 'html' not in content_type: logger.warning(f"[Web Scrape Direct] Non-HTML content type received from {url}: {content_type}"); return None
+            try: return response.text
+            except Exception as e: logger.error(f"[Web Scrape Direct] Error decoding response text for {url}: {e}"); return None
+    except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Direct] HTTP error {e.response.status_code} fetching {url}: {e}")
+    except httpx.TimeoutException: logger.error(f"[Web Scrape Direct] Timeout error fetching {url}")
+    except httpx.TooManyRedirects: logger.error(f"[Web Scrape Direct] Too many redirects fetching {url}")
+    except httpx.RequestError as e: logger.error(f"[Web Scrape Direct] Request error fetching {url}: {e}")
+    except Exception as e: logger.error(f"[Web Scrape Direct] Unexpected error fetching {url}: {e}", exc_info=True)
     return None
 
 async def get_website_content(url: str) -> Optional[str]:
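The rewritten `get_youtube_transcript` above is a hand-unrolled "first provider that returns text wins" chain. A compact sketch of the same control flow, with stub providers standing in for the module's real Apify/Supadata helpers:

```python
import asyncio
from typing import Awaitable, Callable, List, Optional, Tuple

Provider = Callable[[str], Awaitable[Optional[str]]]

async def first_success(target: str, providers: List[Tuple[str, Provider]]) -> Optional[str]:
    for name, provider in providers:
        try:
            result = await provider(target)
        except Exception:
            result = None  # one broken provider must not break the chain
        if result and result.strip():
            return result.strip()  # first non-empty transcript wins
    return None  # every provider failed

# Stubs standing in for get_transcript_via_apify / ..._structured_extractor / ..._supadata.
async def apify_default(t: str) -> Optional[str]: return None
async def apify_structured(t: str) -> Optional[str]: return None
async def supadata(t: str) -> Optional[str]: return "example transcript"

if __name__ == "__main__":
    chain = [("Fallback YT 1", apify_default),
             ("Fallback YT 2", apify_structured),
             ("Fallback YT 3", supadata)]
    print(asyncio.run(first_success("dQw4w9WgXcQ", chain)))
```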
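And the new `fetch_url_content_for_scrape` boils down to: browser-like headers, follow redirects, reject non-HTML. A condensed, runnable version of that recipe (kept separate from main.py's logging and fine-grained exception handling):

```python
import asyncio
from typing import Optional

import httpx

HEADERS = {
    # Browser-like UA reduces trivial bot blocking; value mirrors the new helper.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
}

async def fetch_html(url: str, timeout: float = 25.0) -> Optional[str]:
    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=HEADERS) as client:
            response = await client.get(url)
            response.raise_for_status()
            # The caller expects HTML; skip PDFs, JSON, images, and so on.
            if "html" not in response.headers.get("content-type", "").lower():
                return None
            return response.text
    except httpx.HTTPError:
        return None  # timeouts, redirect loops, and bad statuses all land here

if __name__ == "__main__":
    html = asyncio.run(fetch_html("https://example.com"))
    print(len(html or ""))
```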
@@ -589,34 +528,20 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
             "channelHandleBoolean": False,
             "channelNameBoolean": False,
             "datePublishedBoolean": False,
-            "relativeDateTextBoolean": False
-            # --- ADDED RESIDENTIAL PROXY CONFIG ---
-            "proxyConfiguration": {
-                "useApifyProxy": True,
-                "apifyProxyGroups": ["RESIDENTIAL"]
-            }
-            # --- END ADDED PROXY CONFIG ---
+            "relativeDateTextBoolean": False
         }
-        logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})
-
-        #
+        logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
+    elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< --- MODIFIED THIS BLOCK --- >>>
+        # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
+        # Based on the error message "Field input.urls must be string"
         run_input = {
-            "
-            "proxyConfiguration": {
-                "useApifyProxy": True,
-                "apifyProxyGroups": ["RESIDENTIAL"],
-            },
-            "maxRetries": 5,
+            "urls": url # <<< STRING format needed here, not list
         }
-        logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({
-        # --- END ADDED PROXY CONFIG ---
-        # (Extra brace and redundant logger call removed from here)
+        logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
     elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
         # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
         run_input = {
             "urls": [url] # <<< Assume LIST format standard here
-            # Note: Proxy config not added here by default, could be added if needed
         }
         logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
     elif actor_id == APIFY_CRAWLER_ACTOR_ID:
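The `run_input` branching above exists because the actors disagree about input shape; the bug fixed here is that the structured extractor rejects a list ("Field input.urls must be string"). If the branching keeps growing, a table of builders is one way to tame it. A sketch under that assumption (the first two actor IDs are the ones named in the diff, the crawler ID is a hypothetical stand-in, and keys beyond those shown in the diff are illustrative):

```python
from typing import Any, Callable, Dict

STRUCTURED_YT_ACTOR = "gpjTCWkGZS1lHc9pR"   # wants "urls" as a single string
TEXT_SCRAPER_ACTOR = "2gbQiRSpJIIag2FdR"    # wants "urls" as a list
CRAWLER_ACTOR = "website-content-crawler"   # hypothetical stand-in for APIFY_CRAWLER_ACTOR_ID

INPUT_BUILDERS: Dict[str, Callable[[str], Dict[str, Any]]] = {
    STRUCTURED_YT_ACTOR: lambda url: {"urls": url},
    TEXT_SCRAPER_ACTOR: lambda url: {"urls": [url]},
    CRAWLER_ACTOR: lambda url: {"startUrls": [{"url": url}], "maxCrawlPages": 1},
}

def build_run_input(actor_id: str, url: str) -> Dict[str, Any]:
    # Unknown actors fall back to the list form, mirroring the module's default branch.
    return INPUT_BUILDERS.get(actor_id, lambda u: {"urls": [u]})(url)

print(build_run_input(STRUCTURED_YT_ACTOR, "https://youtu.be/xyz"))  # {'urls': 'https://youtu.be/xyz'}
```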
@@ -624,30 +549,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
         run_input = {
             "startUrls": [{"url": url}], # <<< Different structure entirely
             "maxCrawlPages": 1,
-            "crawlerType": "playwright:firefox"
-            # Note: Proxy config not added here by default, but Website Crawler often needs it.
-            # Example if needed:
-            # "proxyConfiguration": {
-            #     "useApifyProxy": True,
-            #     "apifyProxyGroups": ["RESIDENTIAL"] # Or other groups
-            # }
+            "crawlerType": "playwright:firefox" # Or adjust as needed
         }
         logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
     else:
         # Fallback default input if actor ID doesn't match known ones
+        # Using the simple {"urls": [url]} format seems safest for generic text/content extractors
         run_input = {"urls": [url]} # <<< Default to LIST
         logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
-
+
 
     headers = {"Content-Type": "application/json"}
     try:
-        logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent (using dumps for clarity)
             response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
             logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
 
-            # --- Start of response processing ---
+            # --- Start of response processing (Remains the same as before) ---
             if response.status_code in [200, 201]:
                 if response.status_code == 201:
                     logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
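For context on `sync_items_endpoint`: the call pattern is one POST that starts the actor, waits for the run, and returns the dataset items in the response body. A sketch of that round trip, assuming Apify's `run-sync-get-dataset-items` endpoint (verify the exact path and parameters against the Apify API docs before reuse; main.py builds the endpoint string elsewhere):

```python
import asyncio
import json
from typing import Any, Optional

import httpx

async def run_actor_sync(actor_id: str, token: str, run_input: dict) -> Optional[Any]:
    # Assumed endpoint shape for a synchronous run returning dataset items.
    endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
    async with httpx.AsyncClient(timeout=120.0) as client:
        response = await client.post(endpoint, params={"token": token},
                                     headers={"Content-Type": "application/json"},
                                     json=run_input)
        if response.status_code not in (200, 201):  # 201 also means "run created"
            return None
        try:
            return response.json()  # list of dataset items on success
        except json.JSONDecodeError:
            return None

# Example (needs a real token):
# items = asyncio.run(run_actor_sync("1s7eXiaukVuOr4Ueg", "<APIFY_TOKEN>", {"urls": "https://youtu.be/x"}))
```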
@@ -656,6 +575,8 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
                     results = response.json(); content = None
                     if isinstance(results, list) and len(results) > 0:
                         item = results[0]
+                        # Optional: Re-enable for deep debugging if needed
+                        # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
                         content = None # Reset content
 
                         # --- REFINED PARSING LOGIC (Handles output from various actors) ---
@@ -669,20 +590,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
                             logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                             content = item["markdown"]
                         elif "captions" in item and isinstance(item["captions"], str):
+                            # This case might still happen if the actor *sometimes* returns string
                             logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                             content = item["captions"]
-                        # --- MODIFIED LIST HANDLING FOR CAPTIONS ---
+                        # --- MODIFIED LIST HANDLING FOR CAPTIONS (Checks Documentation Examples) ---
                         elif "captions" in item and isinstance(item["captions"], list):
-                            logger.info(f"{log_prefix} Found 'captions' field as a list. Processing...")
+                            logger.info(f"{log_prefix} Found 'captions' field as a list. Processing based on documentation examples...")
                             transcript_parts = []
                             if not item["captions"]: # Handle empty list case
                                 logger.warning(f"{log_prefix} 'captions' field is an empty list.")
                             else:
+                                # Check the type of the *first* element to decide parsing strategy
                                 first_element = item["captions"][0]
                                 if isinstance(first_element, str):
+                                    # Assume list of strings (Example 1 in docs)
                                     logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
                                     transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
                                 elif isinstance(first_element, dict) and "text" in first_element:
+                                    # Assume list of dictionaries (Example 2 in docs)
                                     logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
                                     transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
                                 else:
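The captions branch above handles the two shapes the transcript actors return: a list of plain strings, or a list of `{"text": ...}` segment dicts. Extracted into a pure function the logic is easy to unit-test; this sketch reproduces it outside main.py:

```python
from typing import Any, List, Optional

def captions_to_text(captions: Any) -> Optional[str]:
    """Normalize a 'captions' field: plain string, list of strings, or list of dicts."""
    if isinstance(captions, str):
        return captions or None
    if isinstance(captions, list) and captions:
        if isinstance(captions[0], str):
            # List-of-strings shape (Example 1 in the actor docs).
            parts: List[str] = [seg for seg in captions if isinstance(seg, str)]
        elif isinstance(captions[0], dict):
            # List-of-dicts shape (Example 2 in the actor docs).
            parts = [seg.get("text", "") for seg in captions
                     if isinstance(seg, dict) and "text" in seg]
        else:
            return None
        joined = " ".join(p.strip() for p in parts if p and p.strip())
        return joined or None
    return None

assert captions_to_text(["hello", "world"]) == "hello world"
assert captions_to_text([{"text": "hello"}, {"text": "world"}]) == "hello world"
assert captions_to_text([]) is None
```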
@@ -710,15 +635,18 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
                             logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
                             return content.strip()
                         else:
+                            # Log failure after trying all parsing methods
                             content_len = len(content) if content and isinstance(content, str) else 0
                             item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
                             logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
                             return None # Return None if no valid content found
                     else:
+                        # Handle empty dataset list '[]' or non-list response
                         logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
                         return None
                 # --- End of success processing logic ---
                 except json.JSONDecodeError:
+                    # Check if the raw text looks like a transcript if JSON fails
                     raw_text = response.text
                     if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
                         logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
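The `JSONDecodeError` branch applies a small heuristic before giving up: a long, space-containing body is probably transcript text rather than an error page. Isolated for clarity:

```python
from typing import Optional

def salvage_raw_text(raw_text: str) -> Optional[str]:
    # Heuristic from the module: long enough and contains spaces -> treat as prose.
    if raw_text and len(raw_text) > 50 and " " in raw_text:
        return raw_text.strip()
    return None

assert salvage_raw_text("short") is None
assert salvage_raw_text("word " * 20) is not None
```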
@@ -731,6 +659,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
                         return None
             # Error handling for API call itself
             elif response.status_code == 400:
+                # Log the specific error message from the API response if available
                 error_msg = response.text[:200] # Default
                 try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
                 except Exception: pass
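The 400 handler tries the structured error body first and falls back to raw text. As a standalone helper it would look like this (a sketch only; main.py keeps it inline):

```python
import httpx

def extract_api_error(response: httpx.Response) -> str:
    """Prefer a structured {"error": {"message": ...}} body; fall back to raw text."""
    try:
        return response.json().get("error", {}).get("message", response.text[:200])
    except Exception:
        return response.text[:200]
```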
@@ -742,12 +671,22 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
             return None
     # Error handling for network/client issues
     except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
-    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None
+    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None # Should be caught by status code checks, but good practice
     except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
     except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
 
-
+async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
+    """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
+    global APIFY_ACTOR_ID
+    # The specific run_input logic is now handled within _run_apify_actor_for_web_content
+    # when it detects the actor_id matches APIFY_ACTOR_ID
+    logger.debug(f"[get_transcript_via_apify] Calling generic runner for URL: {video_url}")
+    return await _run_apify_actor_for_web_content(
+        url=video_url, # Pass video_url as the 'url' parameter
+        api_token=api_token,
+        actor_id=APIFY_ACTOR_ID,
+        actor_name="Apify YT" # Keep specific name for logging clarity
+    )
 
 async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
     """Fallback 4: Fetches website content using Apify Website Content Crawler."""
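Worth flagging in review: after this commit main.py defines `get_transcript_via_apify` twice, the "Fallback YT 1" version around new line 242 and the "Fallback YT 2" version around new line 678 (the previous version of the file even carried a comment saying to "delete any duplicates"). Python executes `def` statements top to bottom, so the second definition silently replaces the first:

```python
def get_transcript_via_apify():
    return "Fallback YT 1 version (new line 242)"

def get_transcript_via_apify():  # same name: rebinds, no error or warning
    return "Fallback YT 2 version (new line 678)"

print(get_transcript_via_apify())  # -> Fallback YT 2 version (new line 678)
```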