fmab777 committed · Commit 5da930e (verified) · Parent: e4d1389

Update main.py

Files changed (1): main.py (+116, -177)
main.py CHANGED
@@ -9,9 +9,6 @@ import contextlib
 import traceback
 import urllib.parse
 from typing import Optional, Dict, Any, Tuple
-import tempfile, os, asyncio
-from yt_dlp import YoutubeDL
-from huggingface_hub import InferenceClient

 # --- Frameworks ---
 from starlette.applications import Starlette
@@ -104,10 +101,6 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
 RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
 WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')

-HUGGINGFACE_HUB_TOKEN = os.environ.get("HUGGINGFACE_HUB_TOKEN")
-# if you don’t set a token it still works on public models (with lower rate limits)
-_inference_client = InferenceClient(token=HUGGINGFACE_HUB_TOKEN)
-
 # --- Model Configurations (Specific April 2025 - Updated Order) ---
 # New Model Priority:
 # 1. Gemini 2.5 Flash Preview (NEW - Using specific date variant)
@@ -246,90 +239,19 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
         return None
     except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None

-# ——— new function ———
-async def get_transcript_via_whisper_inference(video_url: str) -> Optional[str]:
-    """
-    Fallback YT 4: Download audio via yt-dlp and transcribe with HF's hosted Whisper.
-    """
-    # 1) download best audio to a temp file
-    tmp_f = None
-    try:
-        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
-        tmp_f = tmp.name
-        tmp.close()
-
-        ydl_opts = {
-            "format": "bestaudio/best",
-            "outtmpl": tmp_f,
-            "quiet": True,
-            "no_warnings": True,
-        }
-        with YoutubeDL(ydl_opts) as ydl:
-            ydl.download([video_url])
-
-        # 2) call the HF inference api in a thread (it’s blocking)
-        def _transcribe():
-            result = _inference_client.audio_to_text(
-                model="openai/whisper-small",
-                inputs=open(tmp_f, "rb"),
-            )
-            # HF returns {"text": "..."}
-            return result.get("text")
-
-        transcript = await asyncio.to_thread(_transcribe)
-        if transcript and isinstance(transcript, str) and transcript.strip():
-            logger.info(f"[Fallback YT 4] Whisper inference succeeded (len {len(transcript)})")
-            return transcript.strip()
-        else:
-            logger.warning("[Fallback YT 4] Whisper inference returned empty transcript")
-            return None
-
-    except Exception as e:
-        logger.error(f"[Fallback YT 4] Whisper inference error: {e}", exc_info=True)
-        return None
-
-    finally:
-        if tmp_f and os.path.exists(tmp_f):
-            try: os.remove(tmp_f)
-            except: pass
-
-# --- YouTube fallback 1: Apify default transcript actor (unique definition – delete any duplicates) ---
 async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
-    """
-    Fetch the YouTube transcript with Apify’s default “YouTube Transcript” actor
-    (ID `1s7eXiaukVuOr4Ueg`).
-    The helper `_run_apify_actor_for_web_content` supplies the correct `run_input`
-    (including the residential proxy group), handles retries, and parses the returned
-    dataset into plain text.
-
-    Parameters
-    ----------
-    video_url : str
-        Full YouTube URL supplied by the user.
-    api_token : str
-        Apify API token that has residential proxy credit.
-
-    Returns
-    -------
-    Optional[str]
-        Combined transcript text or `None` if all attempts fail.
-    """
+    """Fallback YT 1: Fetches YouTube transcript using default Apify Actor via generic function.""" # <<< UPDATED DOCSTRING & NUMBER
     global APIFY_ACTOR_ID
-
-    # Validate arguments
-    if not video_url:
-        logger.error("[Apify YT] No video_url provided")
-        return None
-    if not api_token:
-        logger.error("[Apify YT] API token missing.")
-        return None
-
-    logger.info(f"[Apify YT] Attempting transcript fetch via actor {APIFY_ACTOR_ID}")
+    # The specific run_input logic is now handled within _run_apify_actor_for_web_content
+    # when it detects the actor_id matches APIFY_ACTOR_ID
+    # <<< UPDATED LOG MESSAGE >>>
+    logger.debug(f"[get_transcript_via_apify - Fallback 1] Calling generic runner for URL: {video_url}")
     return await _run_apify_actor_for_web_content(
-        url=video_url,
+        url=video_url, # Pass video_url as the 'url' parameter
         api_token=api_token,
         actor_id=APIFY_ACTOR_ID,
-        actor_name="Apify YT Default (Fallback 1)"
+        # <<< UPDATED ACTOR NAME IN LOGS >>>
+        actor_name="Apify YT Default (Fallback 1)"
     )

 async def get_transcript_via_apify_structured_extractor(video_url: str, api_token: str) -> Optional[str]:
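Aside on the removed Whisper fallback: it called `_inference_client.audio_to_text(...)`, while the method huggingface_hub documents for this task on `InferenceClient` is `automatic_speech_recognition()`. If the fallback is ever restored, a minimal sketch along those lines could look like the following (the names `transcribe_via_whisper` and `_client` are illustrative, not part of main.py):

# Illustrative sketch only, not part of this commit. Assumes yt-dlp and
# huggingface_hub are installed; the token is optional for public models.
import asyncio, os, tempfile
from typing import Optional

from huggingface_hub import InferenceClient
from yt_dlp import YoutubeDL

_client = InferenceClient(token=os.environ.get("HUGGINGFACE_HUB_TOKEN"))

async def transcribe_via_whisper(video_url: str) -> Optional[str]:
    fd, tmp_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    try:
        # Download the best available audio stream into the temp file.
        ydl_opts = {"format": "bestaudio/best", "outtmpl": tmp_path, "quiet": True, "no_warnings": True}
        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])

        # The inference call blocks, so run it in a worker thread.
        def _run() -> Optional[str]:
            out = _client.automatic_speech_recognition(tmp_path, model="openai/whisper-small")
            return out.text  # AutomaticSpeechRecognitionOutput exposes .text

        text = await asyncio.to_thread(_run)
        return text.strip() if text and text.strip() else None
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

Using `tempfile.mkstemp` plus a `finally` block keeps the cleanup behavior of the deleted code while avoiding the bare `except:` it wrapped around `os.remove`.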
@@ -356,68 +278,85 @@ async def get_transcript_via_apify_structured_extracto

 async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
     """
-    Fetches YouTube transcript using multiple fallback methods:
-    1. Apify Default Actor (Fallback 1)
-    2. Apify Structured Actor (Fallback 2)
-    3. Supadata API (Fallback 3)
-    4. Whisper via HF Inference (Fallback 4)
+    Fetches YouTube transcript using multiple fallback methods in the specified order:
+    1. Apify Default Actor (1s7eXiaukVuOr4Ueg)
+    2. Apify Structured Actor (gpjTCWkGZS1lHc9pR)
+    3. Supadata API
     """
     global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
+    if not video_id: logger.error("get_youtube_transcript: No video_id"); return None

-    if not video_id:
-        logger.error("get_youtube_transcript: No video_id provided")
-        return None
-
-    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
+    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url}) - NEW Fallback Order")
     transcript_text: Optional[str] = None

-    # --- Fallback 1: Apify Default Actor ---
-    logger.info("[Fallback YT 1] Trying Apify Default Actor")
-    if _apify_token_exists:
-        transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
-        if transcript_text:
-            logger.info(f"[Fallback YT 1] Success via Apify Default Actor for {video_url}")
-            return transcript_text
+    # --- Primary Method: REMOVED (youtube-transcript-api) ---
+    # logger.info("[Primary YT] Attempting youtube-transcript-api...") # Removed
+
+    # --- Fallback 1: Apify Default YT Actor (1s7eXiaukVuOr4Ueg) ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 1] Trying Apify REST API (Default YT Actor)...") # <<<< NEW Fallback 1
+        if _apify_token_exists:
+            transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
+            if transcript_text:
+                logger.info(f"[Fallback YT 1] Success via Apify Default YT Actor for {video_url}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 1] Apify Default YT Actor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
         else:
-            logger.warning(f"[Fallback YT 1] Apify Default Actor failed or returned no content for {video_url}")
-    else:
-        logger.warning("[Fallback YT 1] APIFY_API_TOKEN unavailable. Skipping Apify Default Actor.")
-
-    # --- Fallback 2: Apify Structured Actor ---
-    logger.info("[Fallback YT 2] Trying Apify Structured Actor")
-    if _apify_token_exists:
-        transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
-        if transcript_text:
-            logger.info(f"[Fallback YT 2] Success via Apify Structured Actor for {video_url}")
-            return transcript_text
+            logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping Default YT Actor.") # <<<< UPDATED NUMBER
+
+    # --- Fallback 2: Apify Structured Transcript Extractor (gpjTCWkGZS1lHc9pR) ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 2] Trying Apify Structured Transcript Extractor...") # <<<< NEW Fallback 2
+        if _apify_token_exists:
+            transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
+            if transcript_text:
+                logger.info(f"[Fallback YT 2] Success via Apify Structured Extractor for {video_url}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 2] Apify Structured Extractor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
         else:
-            logger.warning(f"[Fallback YT 2] Apify Structured Actor failed or returned no content for {video_url}")
-    else:
-        logger.warning("[Fallback YT 2] APIFY_API_TOKEN unavailable. Skipping Apify Structured Actor.")
-
-    # --- Fallback 3: Supadata API ---
-    logger.info("[Fallback YT 3] Trying Supadata API")
-    if SUPADATA_API_KEY:
-        transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
-        if transcript_text:
-            logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}")
-            return transcript_text
+            logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping Apify Structured Extractor.") # <<<< UPDATED NUMBER
+
+    # --- Fallback 3: Supadata API ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 3] Trying Supadata API...") # <<<< NEW Fallback 3
+        if SUPADATA_API_KEY:
+            transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
+            if transcript_text:
+                logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 3] Supadata failed or no content for {video_id}.") # <<<< UPDATED NUMBER
         else:
-            logger.warning(f"[Fallback YT 3] Supadata failed or returned no content for {video_id}")
-    else:
-        logger.warning("[Fallback YT 3] SUPADATA_API_KEY unavailable. Skipping Supadata API.")
-
-    # --- Fallback 4: Whisper via HF Inference ---
-    logger.info("[Fallback YT 4] Trying audio transcription via Whisper Inference API")
-    transcript_text = await get_transcript_via_whisper_inference(video_url)
-    if transcript_text:
-        logger.info(f"[Fallback YT 4] Success via Whisper Inference for {video_id}")
-        return transcript_text
-    else:
-        logger.warning(f"[Fallback YT 4] Whisper Inference failed or returned empty for {video_id}")
+            logger.warning("[Fallback YT 3] Supadata API key unavailable. Skipping.") # <<<< UPDATED NUMBER
+
+    # --- Final Outcome ---
+    if transcript_text is None:
+        logger.error(f"All fallback methods failed for YT transcript: {video_id}")
+        return None # Explicitly return None if all failed
+
+    # This line should only be reached if a fallback succeeded but wasn't returned early (shouldn't happen).
+    return transcript_text

-    # --- All methods failed ---
-    logger.error(f"All fallback methods failed for YT transcript: {video_id}")
+async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
+    """Directly fetches URL content using httpx."""
+    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
+    try:
+        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
+            logger.debug(f"[Web Scrape Direct] Sending GET request to {url}")
+            response = await client.get(url)
+            logger.debug(f"[Web Scrape Direct] Received response {response.status_code} from {url}")
+            response.raise_for_status()
+            content_type = response.headers.get('content-type', '').lower()
+            if 'html' not in content_type: logger.warning(f"[Web Scrape Direct] Non-HTML content type received from {url}: {content_type}"); return None
+            try: return response.text
+            except Exception as e: logger.error(f"[Web Scrape Direct] Error decoding response text for {url}: {e}"); return None
+    except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Direct] HTTP error {e.response.status_code} fetching {url}: {e}")
+    except httpx.TimeoutException: logger.error(f"[Web Scrape Direct] Timeout error fetching {url}")
+    except httpx.TooManyRedirects: logger.error(f"[Web Scrape Direct] Too many redirects fetching {url}")
+    except httpx.RequestError as e: logger.error(f"[Web Scrape Direct] Request error fetching {url}: {e}")
+    except Exception as e: logger.error(f"[Web Scrape Direct] Unexpected error fetching {url}: {e}", exc_info=True)
     return None

 async def get_website_content(url: str) -> Optional[str]:
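The three fallback blocks in the new version share one shape (log, attempt, return on success, warn on failure), so the chain could be made data-driven. A hypothetical refactor sketch, assuming the module-level `logger` that main.py already defines; `first_successful` is not an existing helper:

from typing import Awaitable, Callable, List, Optional, Tuple

async def first_successful(attempts: List[Tuple[str, Callable[[], Awaitable[Optional[str]]]]]) -> Optional[str]:
    """Run each named attempt in order and return the first non-empty result."""
    for name, attempt in attempts:
        logger.info(f"[{name}] Trying...")
        result = await attempt()
        if result:
            logger.info(f"[{name}] Success (length {len(result)})")
            return result
        logger.warning(f"[{name}] Failed or returned no content.")
    return None

# Wiring for the order this commit establishes:
# transcript = await first_successful([
#     ("Fallback YT 1", lambda: get_transcript_via_apify(video_url, APIFY_API_TOKEN)),
#     ("Fallback YT 2", lambda: get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)),
#     ("Fallback YT 3", lambda: get_transcript_via_supadata(video_id, SUPADATA_API_KEY)),
# ])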
@@ -589,34 +528,20 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
             "channelHandleBoolean": False,
             "channelNameBoolean": False,
             "datePublishedBoolean": False,
-            "relativeDateTextBoolean": False,
-            # --- ADDED RESIDENTIAL PROXY CONFIG ---
-            "proxyConfiguration": {
-                "useApifyProxy": True,
-                "apifyProxyGroups": ["RESIDENTIAL"]
-            }
-            # --- END ADDED PROXY CONFIG ---
+            "relativeDateTextBoolean": False
         }
-        logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID}) with Residential Proxy")
-    # --- CORRECTED BLOCK START (Ensure this 'elif' has same indentation as the 'if' above) ---
-    elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
-        # Input for the Structured YT extractor actor must use a single "url" string
+        logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
+    elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< --- MODIFIED THIS BLOCK --- >>>
+        # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
+        # Based on the error message "Field input.urls must be string"
         run_input = {
-            "url": url,
-            "proxyConfiguration": {
-                "useApifyProxy": True,
-                "apifyProxyGroups": ["RESIDENTIAL"],
-            },
-            "maxRetries": 5,
+            "urls": url # <<< STRING format needed here, not list
         }
-        logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({actor_id}) with Residential Proxy")
-    # --- END ADDED PROXY CONFIG ---
-    # (Extra brace and redundant logger call removed from here)
+        logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
     elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
         # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
         run_input = {
             "urls": [url] # <<< Assume LIST format standard here
-            # Note: Proxy config not added here by default, could be added if needed
         }
         logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
     elif actor_id == APIFY_CRAWLER_ACTOR_ID:
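Worth noting from this hunk: the Structured YT Actor takes `"urls"` as a single string while the other actors take lists, which is exactly the mismatch behind the "Field input.urls must be string" error quoted above. For reference, the input shapes visible in this diff collected in one place (a sketch; the field names and IDs come straight from the branches above):

url = "https://www.youtube.com/watch?v=..."  # placeholder

structured_input = {"urls": url}        # gpjTCWkGZS1lHc9pR expects a single string
text_scraper_input = {"urls": [url]}    # 2gbQiRSpJIIag2FdR expects a list of strings
crawler_input = {                       # Website Content Crawler uses startUrls objects
    "startUrls": [{"url": url}],
    "maxCrawlPages": 1,
    "crawlerType": "playwright:firefox",
}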
@@ -624,30 +549,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
         run_input = {
             "startUrls": [{"url": url}], # <<< Different structure entirely
             "maxCrawlPages": 1,
-            "crawlerType": "playwright:firefox", # Or adjust as needed
-            # Note: Proxy config not added here by default, but Website Crawler often needs it.
-            # Example if needed:
-            # "proxyConfiguration": {
-            #     "useApifyProxy": True,
-            #     "apifyProxyGroups": ["RESIDENTIAL"] # Or other groups
-            # }
+            "crawlerType": "playwright:firefox" # Or adjust as needed
         }
         logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
     else:
         # Fallback default input if actor ID doesn't match known ones
+        # Using the simple {"urls": [url]} format seems safest for generic text/content extractors
         run_input = {"urls": [url]} # <<< Default to LIST
         logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
-    # --- END OF if/elif/else block for run_input ---
+

     headers = {"Content-Type": "application/json"}
     try:
-        # Increased timeout for potentially longer residential proxy connections/actor runs
-        async with httpx.AsyncClient(timeout=180.0) as client:
-            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent (using dumps for clarity)
             response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
             logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")

-            # --- Start of response processing ---
+            # --- Start of response processing (Remains the same as before) ---
             if response.status_code in [200, 201]:
                 if response.status_code == 201:
                     logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
@@ -656,6 +575,8 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                 results = response.json(); content = None
                 if isinstance(results, list) and len(results) > 0:
                     item = results[0]
+                    # Optional: Re-enable for deep debugging if needed
+                    # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
                     content = None # Reset content

                     # --- REFINED PARSING LOGIC (Handles output from various actors) ---
@@ -669,20 +590,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                         logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                         content = item["markdown"]
                     elif "captions" in item and isinstance(item["captions"], str):
+                        # This case might still happen if the actor *sometimes* returns string
                         logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                         content = item["captions"]
-                    # --- MODIFIED LIST HANDLING FOR CAPTIONS ---
+                    # --- MODIFIED LIST HANDLING FOR CAPTIONS (Checks Documentation Examples) ---
                     elif "captions" in item and isinstance(item["captions"], list):
-                        logger.info(f"{log_prefix} Found 'captions' field as a list. Processing...")
+                        logger.info(f"{log_prefix} Found 'captions' field as a list. Processing based on documentation examples...")
                         transcript_parts = []
                         if not item["captions"]: # Handle empty list case
                             logger.warning(f"{log_prefix} 'captions' field is an empty list.")
                         else:
+                            # Check the type of the *first* element to decide parsing strategy
                             first_element = item["captions"][0]
                             if isinstance(first_element, str):
+                                # Assume list of strings (Example 1 in docs)
                                 logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
                                 transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
                             elif isinstance(first_element, dict) and "text" in first_element:
+                                # Assume list of dictionaries (Example 2 in docs)
                                 logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
                                 transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
                             else:
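The captions branch above accepts three shapes (a plain string, a list of strings, and a list of `{"text": ...}` dicts). Factored out as a standalone helper, the same normalization reads as follows (a sketch; `normalize_captions` is illustrative, not a function in main.py):

from typing import Any, Optional

def normalize_captions(captions: Any) -> Optional[str]:
    """Collapse the known caption shapes into one whitespace-joined string."""
    if isinstance(captions, str):
        return captions.strip() or None
    if isinstance(captions, list) and captions:
        if isinstance(captions[0], str):
            parts = [seg for seg in captions if isinstance(seg, str)]
        elif isinstance(captions[0], dict):
            parts = [seg.get("text", "") for seg in captions if isinstance(seg, dict)]
        else:
            return None  # unrecognized element type
        joined = " ".join(p.strip() for p in parts if p and p.strip())
        return joined or None
    return None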
@@ -710,15 +635,18 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                         logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
                         return content.strip()
                     else:
+                        # Log failure after trying all parsing methods
                         content_len = len(content) if content and isinstance(content, str) else 0
                         item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
                         logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
                         return None # Return None if no valid content found
                 else:
+                    # Handle empty dataset list '[]' or non-list response
                     logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
                     return None
                 # --- End of success processing logic ---
             except json.JSONDecodeError:
+                # Check if the raw text looks like a transcript if JSON fails
                 raw_text = response.text
                 if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
                     logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
@@ -731,6 +659,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                     return None
             # Error handling for API call itself
             elif response.status_code == 400:
+                # Log the specific error message from the API response if available
                 error_msg = response.text[:200] # Default
                 try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
                 except Exception: pass
@@ -742,12 +671,22 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                 return None
     # Error handling for network/client issues
     except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
-    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None
+    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None # Should be caught by status code checks, but good practice
     except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
     except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None

-# ... (Rest of your main.py code below this function) ...
-
+async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
+    """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
+    global APIFY_ACTOR_ID
+    # The specific run_input logic is now handled within _run_apify_actor_for_web_content
+    # when it detects the actor_id matches APIFY_ACTOR_ID
+    logger.debug(f"[get_transcript_via_apify] Calling generic runner for URL: {video_url}")
+    return await _run_apify_actor_for_web_content(
+        url=video_url, # Pass video_url as the 'url' parameter
+        api_token=api_token,
+        actor_id=APIFY_ACTOR_ID,
+        actor_name="Apify YT" # Keep specific name for logging clarity
+    )

 async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
     """Fallback 4: Fetches website content using Apify Website Content Crawler."""
 