Update main.py
main.py
CHANGED
@@ -9,9 +9,6 @@ import contextlib
 import traceback
 import urllib.parse
 from typing import Optional, Dict, Any, Tuple
-import tempfile, os, asyncio
-from yt_dlp import YoutubeDL
-from huggingface_hub import InferenceClient
 
 # --- Frameworks ---
 from starlette.applications import Starlette
@@ -104,10 +101,6 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
 RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
 WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
 
-HUGGINGFACE_HUB_TOKEN = os.environ.get("HUGGINGFACE_HUB_TOKEN")
-# if you don’t set a token it still works on public models (with lower rate limits)
-_inference_client = InferenceClient(token=HUGGINGFACE_HUB_TOKEN)
-
 # --- Model Configurations (Specific April 2025 - Updated Order) ---
 # New Model Priority:
 # 1. Gemini 2.5 Flash Preview (NEW - Using specific date variant)
@@ -246,90 +239,19 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
         return None
     except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
 
-# ——— new function ———
-async def get_transcript_via_whisper_inference(video_url: str) -> Optional[str]:
-    """
-    Fallback YT 4: Download audio via yt-dlp and transcribe with HF's hosted Whisper.
-    """
-    # 1) download best audio to a temp file
-    tmp_f = None
-    try:
-        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
-        tmp_f = tmp.name
-        tmp.close()
-
-        ydl_opts = {
-            "format": "bestaudio/best",
-            "outtmpl": tmp_f,
-            "quiet": True,
-            "no_warnings": True,
-        }
-        with YoutubeDL(ydl_opts) as ydl:
-            ydl.download([video_url])
-
-        # 2) call the HF inference api in a thread (it’s blocking)
-        def _transcribe():
-            result = _inference_client.audio_to_text(
-                model="openai/whisper-small",
-                inputs=open(tmp_f, "rb"),
-            )
-            # HF returns {"text": "..."}
-            return result.get("text")
-
-        transcript = await asyncio.to_thread(_transcribe)
-        if transcript and isinstance(transcript, str) and transcript.strip():
-            logger.info(f"[Fallback YT 4] Whisper inference succeeded (len {len(transcript)})")
-            return transcript.strip()
-        else:
-            logger.warning("[Fallback YT 4] Whisper inference returned empty transcript")
-            return None
-
-    except Exception as e:
-        logger.error(f"[Fallback YT 4] Whisper inference error: {e}", exc_info=True)
-        return None
-
-    finally:
-        if tmp_f and os.path.exists(tmp_f):
-            try: os.remove(tmp_f)
-            except: pass
-
-# --- YouTube fallback 1: Apify default transcript actor (unique definition – delete any duplicates) ---
 async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
-    """
-    Fetch the YouTube transcript with Apify’s default “YouTube Transcript” actor
-    (ID `1s7eXiaukVuOr4Ueg`).
-    The helper `_run_apify_actor_for_web_content` supplies the correct `run_input`
-    (including the residential proxy group), handles retries, and parses the returned
-    dataset into plain text.
-
-    Parameters
-    ----------
-    video_url : str
-        Full YouTube URL supplied by the user.
-    api_token : str
-        Apify API token that has residential proxy credit.
-
-    Returns
-    -------
-    Optional[str]
-        Combined transcript text or `None` if all attempts fail.
-    """
+    """Fallback YT 1: Fetches YouTube transcript using default Apify Actor via generic function.""" # <<< UPDATED DOCSTRING & NUMBER
     global APIFY_ACTOR_ID
-
-    #
-
-
-    return None
-    if not api_token:
-        logger.error("[Apify YT] API token missing.")
-        return None
-
-    logger.info(f"[Apify YT] Attempting transcript fetch via actor {APIFY_ACTOR_ID}")
+    # The specific run_input logic is now handled within _run_apify_actor_for_web_content
+    # when it detects the actor_id matches APIFY_ACTOR_ID
+    # <<< UPDATED LOG MESSAGE >>>
+    logger.debug(f"[get_transcript_via_apify - Fallback 1] Calling generic runner for URL: {video_url}")
     return await _run_apify_actor_for_web_content(
-        url=video_url,
+        url=video_url, # Pass video_url as the 'url' parameter
         api_token=api_token,
         actor_id=APIFY_ACTOR_ID,
-
+        # <<< UPDATED ACTOR NAME IN LOGS >>>
+        actor_name="Apify YT Default (Fallback 1)"
     )
 
 async def get_transcript_via_apify_structured_extractor(video_url: str, api_token: str) -> Optional[str]:
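Two latent bugs in the removed fallback are worth noting for anyone reinstating it: huggingface_hub's InferenceClient exposes hosted ASR as automatic_speech_recognition (no audio_to_text method appears in the documented client API), and the file handle opened for the upload was never closed. A minimal standalone sketch with those points addressed, assuming openai/whisper-small is still served by the hosted Inference API:

    # Hypothetical cleanup of the removed fallback -- not part of this commit.
    import asyncio
    import os
    import tempfile
    from typing import Optional

    from huggingface_hub import InferenceClient
    from yt_dlp import YoutubeDL

    _inference_client = InferenceClient(token=os.environ.get("HUGGINGFACE_HUB_TOKEN"))

    async def transcribe_youtube_audio(video_url: str) -> Optional[str]:
        """Download best audio with yt-dlp, then transcribe via hosted Whisper."""
        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        tmp_path = tmp.name
        tmp.close()
        try:
            ydl_opts = {"format": "bestaudio/best", "outtmpl": tmp_path,
                        "quiet": True, "no_warnings": True}
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])

            def _transcribe() -> Optional[str]:
                # The client call is blocking, hence asyncio.to_thread below.
                # Depending on the huggingface_hub version the result is a plain
                # string or an output object carrying a .text attribute.
                result = _inference_client.automatic_speech_recognition(
                    tmp_path, model="openai/whisper-small"
                )
                return result if isinstance(result, str) else getattr(result, "text", None)

            transcript = await asyncio.to_thread(_transcribe)
            return transcript.strip() if transcript and transcript.strip() else None
        finally:
            # Always remove the temp file, success or not.
            if os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass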
@@ -356,68 +278,85 @@ async def get_transcript_via_apify_structured_extractor(video_url: str, api_toke
 
 async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
     """
-    Fetches YouTube transcript using multiple fallback methods:
-
-
-
-    4. Whisper via HF Inference (Fallback 4)
+    Fetches YouTube transcript using multiple fallback methods in the specified order:
+    1. Apify Default Actor (1s7eXiaukVuOr4Ueg)
+    2. Apify Structured Actor (gpjTCWkGZS1lHc9pR)
+    3. Supadata API
     """
     global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
+    if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
 
-    if not video_id:
-        logger.error("get_youtube_transcript: No video_id provided")
-        return None
-
-    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
+    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url}) - NEW Fallback Order")
     transcript_text: Optional[str] = None
 
-    # ---
-    logger.info("[
-
-
-
-
-
+    # --- Primary Method: REMOVED (youtube-transcript-api) ---
+    # logger.info("[Primary YT] Attempting youtube-transcript-api...") # Removed
+
+    # --- Fallback 1: Apify Default YT Actor (1s7eXiaukVuOr4Ueg) ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 1] Trying Apify REST API (Default YT Actor)...") # <<<< NEW Fallback 1
+        if _apify_token_exists:
+            transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
+            if transcript_text:
+                logger.info(f"[Fallback YT 1] Success via Apify Default YT Actor for {video_url}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 1] Apify Default YT Actor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
         else:
-            logger.warning(
-
-
-
-
-
-
-
-
-
+            logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping Default YT Actor.") # <<<< UPDATED NUMBER
+
+    # --- Fallback 2: Apify Structured Transcript Extractor (gpjTCWkGZS1lHc9pR) ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 2] Trying Apify Structured Transcript Extractor...") # <<<< NEW Fallback 2
+        if _apify_token_exists:
+            transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
+            if transcript_text:
+                logger.info(f"[Fallback YT 2] Success via Apify Structured Extractor for {video_url}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 2] Apify Structured Extractor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
         else:
-            logger.warning(
-
-
-
-
-
-
-
-
-
+            logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping Apify Structured Extractor.") # <<<< UPDATED NUMBER
+
+    # --- Fallback 3: Supadata API ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 3] Trying Supadata API...") # <<<< NEW Fallback 3
+        if SUPADATA_API_KEY:
+            transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
+            if transcript_text:
+                logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 3] Supadata failed or no content for {video_id}.") # <<<< UPDATED NUMBER
         else:
-            logger.warning(
-
-
-
-
-
-    if
-
-        return transcript_text
-    else:
-        logger.warning(f"[Fallback YT 4] Whisper Inference failed or returned empty for {video_id}")
+            logger.warning("[Fallback YT 3] Supadata API key unavailable. Skipping.") # <<<< UPDATED NUMBER
+
+    # --- Final Outcome ---
+    if transcript_text is None:
+        logger.error(f"All fallback methods failed for YT transcript: {video_id}")
+        return None # Explicitly return None if all failed
+
+    # This line should only be reached if a fallback succeeded but wasn't returned early (shouldn't happen).
+    return transcript_text
 
-
-
+async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
+    """Directly fetches URL content using httpx."""
+    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
+    try:
+        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
+            logger.debug(f"[Web Scrape Direct] Sending GET request to {url}")
+            response = await client.get(url)
+            logger.debug(f"[Web Scrape Direct] Received response {response.status_code} from {url}")
+            response.raise_for_status()
+            content_type = response.headers.get('content-type', '').lower()
+            if 'html' not in content_type: logger.warning(f"[Web Scrape Direct] Non-HTML content type received from {url}: {content_type}"); return None
+            try: return response.text
+            except Exception as e: logger.error(f"[Web Scrape Direct] Error decoding response text for {url}: {e}"); return None
+    except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Direct] HTTP error {e.response.status_code} fetching {url}: {e}")
+    except httpx.TimeoutException: logger.error(f"[Web Scrape Direct] Timeout error fetching {url}")
+    except httpx.TooManyRedirects: logger.error(f"[Web Scrape Direct] Too many redirects fetching {url}")
+    except httpx.RequestError as e: logger.error(f"[Web Scrape Direct] Request error fetching {url}: {e}")
+    except Exception as e: logger.error(f"[Web Scrape Direct] Unexpected error fetching {url}: {e}", exc_info=True)
     return None
 
 async def get_website_content(url: str) -> Optional[str]:
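The rewritten get_youtube_transcript above repeats one pattern three times: try a source, log the outcome, return early on success. Purely as an illustration, the same chain can be expressed data-driven; this hypothetical run_transcript_chain reuses the module's own fetchers, logger, and credential globals:

    # Hypothetical data-driven rendering of the same fallback chain.
    from typing import Awaitable, Callable, Optional, Sequence, Tuple

    async def run_transcript_chain(video_id: str, video_url: str) -> Optional[str]:
        # Each entry: (label, credentials available?, zero-arg coroutine factory).
        chain: Sequence[Tuple[str, bool, Callable[[], Awaitable[Optional[str]]]]] = (
            ("Apify Default YT Actor", _apify_token_exists,
             lambda: get_transcript_via_apify(video_url, APIFY_API_TOKEN)),
            ("Apify Structured Extractor", _apify_token_exists,
             lambda: get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)),
            ("Supadata", bool(SUPADATA_API_KEY),
             lambda: get_transcript_via_supadata(video_id, SUPADATA_API_KEY)),
        )
        for number, (label, available, fetch) in enumerate(chain, start=1):
            if not available:
                logger.warning(f"[Fallback YT {number}] {label}: credentials unavailable, skipping.")
                continue
            transcript = await fetch()
            if transcript:
                logger.info(f"[Fallback YT {number}] Success via {label}.")
                return transcript
            logger.warning(f"[Fallback YT {number}] {label} failed or returned no content.")
        logger.error(f"All fallback methods failed for YT transcript: {video_id}")
        return None

Adding or reordering sources then only touches the chain tuple, and the numbered log prefixes stay consistent automatically.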
@@ -589,34 +528,20 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
             "channelHandleBoolean": False,
             "channelNameBoolean": False,
             "datePublishedBoolean": False,
-            "relativeDateTextBoolean": False
-            # --- ADDED RESIDENTIAL PROXY CONFIG ---
-            "proxyConfiguration": {
-                "useApifyProxy": True,
-                "apifyProxyGroups": ["RESIDENTIAL"]
-            }
-            # --- END ADDED PROXY CONFIG ---
+            "relativeDateTextBoolean": False
         }
-        logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})
-
-
-        #
+        logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
+    elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< --- MODIFIED THIS BLOCK --- >>>
+        # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
+        # Based on the error message "Field input.urls must be string"
         run_input = {
-            "
-            "proxyConfiguration": {
-                "useApifyProxy": True,
-                "apifyProxyGroups": ["RESIDENTIAL"],
-            },
-            "maxRetries": 5,
+            "urls": url # <<< STRING format needed here, not list
         }
-        logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({
-        # --- END ADDED PROXY CONFIG ---
-        # (Extra brace and redundant logger call removed from here)
+        logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
     elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
         # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
         run_input = {
             "urls": [url] # <<< Assume LIST format standard here
-            # Note: Proxy config not added here by default, could be added if needed
         }
         logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
     elif actor_id == APIFY_CRAWLER_ACTOR_ID:
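The point of this hunk is that each actor expects a different input schema: the structured extractor requires "urls" as a single string, the text scraper takes a list, and the Website Content Crawler wants a "startUrls" list of objects. A sketch isolating just that mapping (the default YT actor's boolean flags are abbreviated, and its exact URL key is an assumption, since the hunk only shows its flag entries; the constants are the module's own):

    # Sketch of the per-actor input schemas only (illustration, not the commit's code).
    from typing import Any, Dict

    def build_run_input(actor_id: str, url: str) -> Dict[str, Any]:
        if actor_id == APIFY_ACTOR_ID:
            # Default YT transcript actor: assumed list of URLs plus boolean flags.
            return {"urls": [url], "datePublishedBoolean": False}
        if actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
            # Structured extractor rejects lists: "Field input.urls must be string".
            return {"urls": url}
        if actor_id == APIFY_CRAWLER_ACTOR_ID:
            # Website Content Crawler uses a list of start-URL objects.
            return {"startUrls": [{"url": url}], "maxCrawlPages": 1,
                    "crawlerType": "playwright:firefox"}
        # Safe default for text-scraper-style actors.
        return {"urls": [url]}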
@@ -624,30 +549,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
         run_input = {
             "startUrls": [{"url": url}], # <<< Different structure entirely
             "maxCrawlPages": 1,
-            "crawlerType": "playwright:firefox"
-            # Note: Proxy config not added here by default, but Website Crawler often needs it.
-            # Example if needed:
-            # "proxyConfiguration": {
-            #     "useApifyProxy": True,
-            #     "apifyProxyGroups": ["RESIDENTIAL"] # Or other groups
-            # }
+            "crawlerType": "playwright:firefox" # Or adjust as needed
         }
         logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
     else:
         # Fallback default input if actor ID doesn't match known ones
+        # Using the simple {"urls": [url]} format seems safest for generic text/content extractors
         run_input = {"urls": [url]} # <<< Default to LIST
         logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
-
+
 
     headers = {"Content-Type": "application/json"}
     try:
-
-
-        logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent (using dumps for clarity)
             response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
             logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
 
-        # --- Start of response processing ---
+            # --- Start of response processing (Remains the same as before) ---
             if response.status_code in [200, 201]:
                 if response.status_code == 201:
                     logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
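For context, sync_items_endpoint points at Apify's documented run-sync-get-dataset-items API, which starts an actor run and returns the run's dataset items in a single request. A minimal sketch of that call pattern with httpx, with the token passed as a query parameter as above:

    # Minimal sketch of the same call, assuming Apify's documented
    # run-sync-get-dataset-items endpoint and token-as-query-parameter auth.
    from typing import Any, List

    import httpx

    async def run_actor_sync(actor_id: str, api_token: str, run_input: dict) -> List[Any]:
        endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                endpoint,
                headers={"Content-Type": "application/json"},
                params={"token": api_token},
                json=run_input,
            )
            response.raise_for_status()  # run-sync endpoints answer 200 or 201
            return response.json()       # dataset items arrive as a JSON list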
@@ -656,6 +575,8 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                     results = response.json(); content = None
                     if isinstance(results, list) and len(results) > 0:
                         item = results[0]
+                        # Optional: Re-enable for deep debugging if needed
+                        # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
                         content = None # Reset content
 
                         # --- REFINED PARSING LOGIC (Handles output from various actors) ---
@@ -669,20 +590,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                             logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                             content = item["markdown"]
                         elif "captions" in item and isinstance(item["captions"], str):
+                            # This case might still happen if the actor *sometimes* returns string
                            logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                            content = item["captions"]
-                        # --- MODIFIED LIST HANDLING FOR CAPTIONS ---
+                        # --- MODIFIED LIST HANDLING FOR CAPTIONS (Checks Documentation Examples) ---
                         elif "captions" in item and isinstance(item["captions"], list):
-                            logger.info(f"{log_prefix} Found 'captions' field as a list. Processing...")
+                            logger.info(f"{log_prefix} Found 'captions' field as a list. Processing based on documentation examples...")
                             transcript_parts = []
                             if not item["captions"]: # Handle empty list case
                                 logger.warning(f"{log_prefix} 'captions' field is an empty list.")
                             else:
+                                # Check the type of the *first* element to decide parsing strategy
                                 first_element = item["captions"][0]
                                 if isinstance(first_element, str):
+                                    # Assume list of strings (Example 1 in docs)
                                    logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
                                    transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
                                 elif isinstance(first_element, dict) and "text" in first_element:
+                                    # Assume list of dictionaries (Example 2 in docs)
                                    logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
                                    transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
                                 else:
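The captions handling above covers the two dataset shapes the actor can return, a list of plain strings and a list of {"text": ...} dicts. The same normalization as a small pure helper; the single-space join is an assumption, since the hunk does not show how transcript_parts are combined:

    # Pure-function sketch of the captions normalization (illustration only).
    from typing import Any, List, Optional

    def captions_to_text(captions: Any) -> Optional[str]:
        if isinstance(captions, str):
            return captions.strip() or None          # already plain text
        if isinstance(captions, list) and captions:
            first = captions[0]
            if isinstance(first, str):               # list-of-strings shape
                parts: List[str] = [seg for seg in captions if isinstance(seg, str)]
            elif isinstance(first, dict):            # list of {"text": ...} dicts
                parts = [seg.get("text", "") for seg in captions
                         if isinstance(seg, dict) and "text" in seg]
            else:
                return None                          # unknown element type
            joined = " ".join(p.strip() for p in parts if p and p.strip())
            return joined or None
        return None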
@@ -710,15 +635,18 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                             logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
                             return content.strip()
                         else:
+                            # Log failure after trying all parsing methods
                            content_len = len(content) if content and isinstance(content, str) else 0
                            item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
                            logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
                            return None # Return None if no valid content found
                     else:
+                        # Handle empty dataset list '[]' or non-list response
                        logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
                        return None
                 # --- End of success processing logic ---
                 except json.JSONDecodeError:
+                    # Check if the raw text looks like a transcript if JSON fails
                    raw_text = response.text
                    if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
                        logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
@@ -731,6 +659,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                        return None
             # Error handling for API call itself
             elif response.status_code == 400:
+                # Log the specific error message from the API response if available
                error_msg = response.text[:200] # Default
                try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
                except Exception: pass
@@ -742,12 +671,22 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                return None
    # Error handling for network/client issues
    except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
-    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None
+    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None # Should be caught by status code checks, but good practice
    except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
    except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
 
-
-
+async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
+    """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
+    global APIFY_ACTOR_ID
+    # The specific run_input logic is now handled within _run_apify_actor_for_web_content
+    # when it detects the actor_id matches APIFY_ACTOR_ID
+    logger.debug(f"[get_transcript_via_apify] Calling generic runner for URL: {video_url}")
+    return await _run_apify_actor_for_web_content(
+        url=video_url, # Pass video_url as the 'url' parameter
+        api_token=api_token,
+        actor_id=APIFY_ACTOR_ID,
+        actor_name="Apify YT" # Keep specific name for logging clarity
+    )
 
 async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
     """Fallback 4: Fetches website content using Apify Website Content Crawler."""
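One caveat with this last hunk: the file now appears to contain two definitions of get_transcript_via_apify, the "Fallback YT 1" version earlier in the module and this "Fallback YT 2" version, while the old comment insisting on a unique definition was deleted. Python binds def at execution time, so the later definition silently replaces the earlier one:

    # Demonstration of the shadowing: a later def rebinds the module-level name,
    # so only the second get_transcript_via_apify is live once the module loads.
    def fetch() -> str:
        return "first definition"

    def fetch() -> str:  # rebinding: this definition wins
        return "second definition"

    assert fetch() == "second definition"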