Update main.py
main.py
CHANGED
@@ -9,9 +9,6 @@ import contextlib
 import traceback
 import urllib.parse
 from typing import Optional, Dict, Any, Tuple
-import tempfile, os, asyncio
-from yt_dlp import YoutubeDL
-from huggingface_hub import InferenceClient
 
 # --- Frameworks ---
 from starlette.applications import Starlette
@@ -104,10 +101,6 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
 RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
 WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
 
-HUGGINGFACE_HUB_TOKEN = os.environ.get("HUGGINGFACE_HUB_TOKEN")
-# if you don’t set a token it still works on public models (with lower rate limits)
-_inference_client = InferenceClient(token=HUGGINGFACE_HUB_TOKEN)
-
 # --- Model Configurations (Specific April 2025 - Updated Order) ---
 # New Model Priority:
 # 1. Gemini 2.5 Flash Preview (NEW - Using specific date variant)
@@ -246,90 +239,19 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
         return None
     except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
 
-# ——— new function ———
-async def get_transcript_via_whisper_inference(video_url: str) -> Optional[str]:
-    """
-    Fallback YT 4: Download audio via yt-dlp and transcribe with HF's hosted Whisper.
-    """
-    # 1) download best audio to a temp file
-    tmp_f = None
-    try:
-        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
-        tmp_f = tmp.name
-        tmp.close()
-
-        ydl_opts = {
-            "format": "bestaudio/best",
-            "outtmpl": tmp_f,
-            "quiet": True,
-            "no_warnings": True,
-        }
-        with YoutubeDL(ydl_opts) as ydl:
-            ydl.download([video_url])
-
-        # 2) call the HF inference api in a thread (it’s blocking)
-        def _transcribe():
-            result = _inference_client.audio_to_text(
-                model="openai/whisper-small",
-                inputs=open(tmp_f, "rb"),
-            )
-            # HF returns {"text": "..."}
-            return result.get("text")
-
-        transcript = await asyncio.to_thread(_transcribe)
-        if transcript and isinstance(transcript, str) and transcript.strip():
-            logger.info(f"[Fallback YT 4] Whisper inference succeeded (len {len(transcript)})")
-            return transcript.strip()
-        else:
-            logger.warning("[Fallback YT 4] Whisper inference returned empty transcript")
-            return None
-
-    except Exception as e:
-        logger.error(f"[Fallback YT 4] Whisper inference error: {e}", exc_info=True)
-        return None
-
-    finally:
-        if tmp_f and os.path.exists(tmp_f):
-            try: os.remove(tmp_f)
-            except: pass
-
-# --- YouTube fallback 1: Apify default transcript actor (unique definition – delete any duplicates) ---
 async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
-    """
-    Fetch the YouTube transcript with Apify’s default “YouTube Transcript” actor
-    (ID `1s7eXiaukVuOr4Ueg`).
-    The helper `_run_apify_actor_for_web_content` supplies the correct `run_input`
-    (including the residential proxy group), handles retries, and parses the returned
-    dataset into plain text.
-
-    Parameters
-    ----------
-    video_url : str
-        Full YouTube URL supplied by the user.
-    api_token : str
-        Apify API token that has residential proxy credit.
-
-    Returns
-    -------
-    Optional[str]
-        Combined transcript text or `None` if all attempts fail.
-    """
+    """Fallback YT 1: Fetches YouTube transcript using default Apify Actor via generic function.""" # <<< UPDATED DOCSTRING & NUMBER
     global APIFY_ACTOR_ID
-
-    #
-
-
-    return None
-    if not api_token:
-        logger.error("[Apify YT] API token missing.")
-        return None
-
-    logger.info(f"[Apify YT] Attempting transcript fetch via actor {APIFY_ACTOR_ID}")
+    # The specific run_input logic is now handled within _run_apify_actor_for_web_content
+    # when it detects the actor_id matches APIFY_ACTOR_ID
+    # <<< UPDATED LOG MESSAGE >>>
+    logger.debug(f"[get_transcript_via_apify - Fallback 1] Calling generic runner for URL: {video_url}")
     return await _run_apify_actor_for_web_content(
-        url=video_url,
+        url=video_url, # Pass video_url as the 'url' parameter
         api_token=api_token,
         actor_id=APIFY_ACTOR_ID,
-
+        # <<< UPDATED ACTOR NAME IN LOGS >>>
+        actor_name="Apify YT Default (Fallback 1)"
     )
 
 async def get_transcript_via_apify_structured_extractor(video_url: str, api_token: str) -> Optional[str]:
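Two latent bugs in the removed fallback are worth noting for anyone reinstating it: huggingface_hub's InferenceClient exposes hosted ASR as automatic_speech_recognition (no audio_to_text method appears in the documented client API), and the file handle opened for the upload was never closed. A minimal standalone sketch with those points addressed, assuming openai/whisper-small is still served by the hosted Inference API:

    # Hypothetical cleanup of the removed fallback -- not part of this commit.
    import asyncio
    import os
    import tempfile
    from typing import Optional

    from huggingface_hub import InferenceClient
    from yt_dlp import YoutubeDL

    _inference_client = InferenceClient(token=os.environ.get("HUGGINGFACE_HUB_TOKEN"))

    async def transcribe_youtube_audio(video_url: str) -> Optional[str]:
        """Download best audio with yt-dlp, then transcribe via hosted Whisper."""
        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        tmp_path = tmp.name
        tmp.close()
        try:
            ydl_opts = {"format": "bestaudio/best", "outtmpl": tmp_path,
                        "quiet": True, "no_warnings": True}
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])

            def _transcribe() -> Optional[str]:
                # The client call is blocking, hence asyncio.to_thread below.
                # Depending on the huggingface_hub version the result is a plain
                # string or an output object carrying a .text attribute.
                result = _inference_client.automatic_speech_recognition(
                    tmp_path, model="openai/whisper-small"
                )
                return result if isinstance(result, str) else getattr(result, "text", None)

            transcript = await asyncio.to_thread(_transcribe)
            return transcript.strip() if transcript and transcript.strip() else None
        finally:
            # Always remove the temp file, success or not.
            if os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass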
@@ -356,68 +278,85 @@ async def get_transcript_via_apify_structured_extractor(video_url: str, api_toke
 
 async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
     """
-    Fetches YouTube transcript using multiple fallback methods:
-
-
-
-    4. Whisper via HF Inference (Fallback 4)
+    Fetches YouTube transcript using multiple fallback methods in the specified order:
+    1. Apify Default Actor (1s7eXiaukVuOr4Ueg)
+    2. Apify Structured Actor (gpjTCWkGZS1lHc9pR)
+    3. Supadata API
     """
     global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
+    if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
 
-    if not video_id:
-        logger.error("get_youtube_transcript: No video_id provided")
-        return None
-
-    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
+    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url}) - NEW Fallback Order")
     transcript_text: Optional[str] = None
 
-    # ---
-    logger.info("[
-
-
-
-
-
+    # --- Primary Method: REMOVED (youtube-transcript-api) ---
+    # logger.info("[Primary YT] Attempting youtube-transcript-api...") # Removed
+
+    # --- Fallback 1: Apify Default YT Actor (1s7eXiaukVuOr4Ueg) ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 1] Trying Apify REST API (Default YT Actor)...") # <<<< NEW Fallback 1
+        if _apify_token_exists:
+            transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
+            if transcript_text:
+                logger.info(f"[Fallback YT 1] Success via Apify Default YT Actor for {video_url}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 1] Apify Default YT Actor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
         else:
-            logger.warning(
-
-
-
-
-
-
-
-
-
+            logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping Default YT Actor.") # <<<< UPDATED NUMBER
+
+    # --- Fallback 2: Apify Structured Transcript Extractor (gpjTCWkGZS1lHc9pR) ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 2] Trying Apify Structured Transcript Extractor...") # <<<< NEW Fallback 2
+        if _apify_token_exists:
+            transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
+            if transcript_text:
+                logger.info(f"[Fallback YT 2] Success via Apify Structured Extractor for {video_url}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 2] Apify Structured Extractor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
         else:
-            logger.warning(
-
-
-
-
-
-
-
-
-
+            logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping Apify Structured Extractor.") # <<<< UPDATED NUMBER
+
+    # --- Fallback 3: Supadata API ---
+    if transcript_text is None:
+        logger.info("[Fallback YT 3] Trying Supadata API...") # <<<< NEW Fallback 3
+        if SUPADATA_API_KEY:
+            transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
+            if transcript_text:
+                logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}") # <<<< UPDATED NUMBER
+                return transcript_text # Return on success
+            else:
+                logger.warning(f"[Fallback YT 3] Supadata failed or no content for {video_id}.") # <<<< UPDATED NUMBER
         else:
-            logger.warning(
-
-
-
-
-
-    if
-
-        return transcript_text
-    else:
-        logger.warning(f"[Fallback YT 4] Whisper Inference failed or returned empty for {video_id}")
+            logger.warning("[Fallback YT 3] Supadata API key unavailable. Skipping.") # <<<< UPDATED NUMBER
+
+    # --- Final Outcome ---
+    if transcript_text is None:
+        logger.error(f"All fallback methods failed for YT transcript: {video_id}")
+        return None # Explicitly return None if all failed
+
+    # This line should only be reached if a fallback succeeded but wasn't returned early (shouldn't happen).
+    return transcript_text
 
-
-
+async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
+    """Directly fetches URL content using httpx."""
+    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
+    try:
+        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
+            logger.debug(f"[Web Scrape Direct] Sending GET request to {url}")
+            response = await client.get(url)
+            logger.debug(f"[Web Scrape Direct] Received response {response.status_code} from {url}")
+            response.raise_for_status()
+            content_type = response.headers.get('content-type', '').lower()
+            if 'html' not in content_type: logger.warning(f"[Web Scrape Direct] Non-HTML content type received from {url}: {content_type}"); return None
+            try: return response.text
+            except Exception as e: logger.error(f"[Web Scrape Direct] Error decoding response text for {url}: {e}"); return None
+    except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Direct] HTTP error {e.response.status_code} fetching {url}: {e}")
+    except httpx.TimeoutException: logger.error(f"[Web Scrape Direct] Timeout error fetching {url}")
+    except httpx.TooManyRedirects: logger.error(f"[Web Scrape Direct] Too many redirects fetching {url}")
+    except httpx.RequestError as e: logger.error(f"[Web Scrape Direct] Request error fetching {url}: {e}")
+    except Exception as e: logger.error(f"[Web Scrape Direct] Unexpected error fetching {url}: {e}", exc_info=True)
     return None
 
 async def get_website_content(url: str) -> Optional[str]:
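The rewritten get_youtube_transcript above repeats one pattern three times: try a source, log the outcome, return early on success. Purely as an illustration, the same chain can be expressed data-driven; this hypothetical run_transcript_chain reuses the module's own fetchers, logger, and credential globals:

    # Hypothetical data-driven rendering of the same fallback chain.
    from typing import Awaitable, Callable, Optional, Sequence, Tuple

    async def run_transcript_chain(video_id: str, video_url: str) -> Optional[str]:
        # Each entry: (label, credentials available?, zero-arg coroutine factory).
        chain: Sequence[Tuple[str, bool, Callable[[], Awaitable[Optional[str]]]]] = (
            ("Apify Default YT Actor", _apify_token_exists,
             lambda: get_transcript_via_apify(video_url, APIFY_API_TOKEN)),
            ("Apify Structured Extractor", _apify_token_exists,
             lambda: get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)),
            ("Supadata", bool(SUPADATA_API_KEY),
             lambda: get_transcript_via_supadata(video_id, SUPADATA_API_KEY)),
        )
        for number, (label, available, fetch) in enumerate(chain, start=1):
            if not available:
                logger.warning(f"[Fallback YT {number}] {label}: credentials unavailable, skipping.")
                continue
            transcript = await fetch()
            if transcript:
                logger.info(f"[Fallback YT {number}] Success via {label}.")
                return transcript
            logger.warning(f"[Fallback YT {number}] {label} failed or returned no content.")
        logger.error(f"All fallback methods failed for YT transcript: {video_id}")
        return None

Adding or reordering sources then only touches the chain tuple, and the numbered log prefixes stay consistent automatically.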
@@ -589,34 +528,20 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
             "channelHandleBoolean": False,
             "channelNameBoolean": False,
             "datePublishedBoolean": False,
-            "relativeDateTextBoolean": False
-            # --- ADDED RESIDENTIAL PROXY CONFIG ---
-            "proxyConfiguration": {
-                "useApifyProxy": True,
-                "apifyProxyGroups": ["RESIDENTIAL"]
-            }
-            # --- END ADDED PROXY CONFIG ---
+            "relativeDateTextBoolean": False
         }
-        logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})
-
-
-        #
+        logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
+    elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< --- MODIFIED THIS BLOCK --- >>>
+        # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
+        # Based on the error message "Field input.urls must be string"
         run_input = {
-            "
-            "proxyConfiguration": {
-                "useApifyProxy": True,
-                "apifyProxyGroups": ["RESIDENTIAL"],
-            },
-            "maxRetries": 5,
+            "urls": url # <<< STRING format needed here, not list
         }
-        logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({
-        # --- END ADDED PROXY CONFIG ---
-        # (Extra brace and redundant logger call removed from here)
+        logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
     elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
         # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
         run_input = {
             "urls": [url] # <<< Assume LIST format standard here
-            # Note: Proxy config not added here by default, could be added if needed
         }
         logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
     elif actor_id == APIFY_CRAWLER_ACTOR_ID:
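The point of this hunk is that each actor expects a different input schema: the structured extractor requires "urls" as a single string, the text scraper takes a list, and the Website Content Crawler wants a "startUrls" list of objects. A sketch isolating just that mapping (the default YT actor's boolean flags are abbreviated, and its exact URL key is an assumption, since the hunk only shows its flag entries; the constants are the module's own):

    # Sketch of the per-actor input schemas only (illustration, not the commit's code).
    from typing import Any, Dict

    def build_run_input(actor_id: str, url: str) -> Dict[str, Any]:
        if actor_id == APIFY_ACTOR_ID:
            # Default YT transcript actor: assumed list of URLs plus boolean flags.
            return {"urls": [url], "datePublishedBoolean": False}
        if actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
            # Structured extractor rejects lists: "Field input.urls must be string".
            return {"urls": url}
        if actor_id == APIFY_CRAWLER_ACTOR_ID:
            # Website Content Crawler uses a list of start-URL objects.
            return {"startUrls": [{"url": url}], "maxCrawlPages": 1,
                    "crawlerType": "playwright:firefox"}
        # Safe default for text-scraper-style actors.
        return {"urls": [url]}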
@@ -624,30 +549,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
         run_input = {
             "startUrls": [{"url": url}], # <<< Different structure entirely
             "maxCrawlPages": 1,
-            "crawlerType": "playwright:firefox"
-            # Note: Proxy config not added here by default, but Website Crawler often needs it.
-            # Example if needed:
-            # "proxyConfiguration": {
-            #     "useApifyProxy": True,
-            #     "apifyProxyGroups": ["RESIDENTIAL"] # Or other groups
-            # }
+            "crawlerType": "playwright:firefox" # Or adjust as needed
         }
         logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
     else:
         # Fallback default input if actor ID doesn't match known ones
+        # Using the simple {"urls": [url]} format seems safest for generic text/content extractors
         run_input = {"urls": [url]} # <<< Default to LIST
         logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
-
+
 
     headers = {"Content-Type": "application/json"}
     try:
-
-
-        logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent (using dumps for clarity)
             response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
             logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
 
-        # --- Start of response processing ---
+            # --- Start of response processing (Remains the same as before) ---
             if response.status_code in [200, 201]:
                 if response.status_code == 201:
                     logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
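For context, sync_items_endpoint points at Apify's documented run-sync-get-dataset-items API, which starts an actor run and returns the run's dataset items in a single request. A minimal sketch of that call pattern with httpx, with the token passed as a query parameter as above:

    # Minimal sketch of the same call, assuming Apify's documented
    # run-sync-get-dataset-items endpoint and token-as-query-parameter auth.
    from typing import Any, List

    import httpx

    async def run_actor_sync(actor_id: str, api_token: str, run_input: dict) -> List[Any]:
        endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                endpoint,
                headers={"Content-Type": "application/json"},
                params={"token": api_token},
                json=run_input,
            )
            response.raise_for_status()  # run-sync endpoints answer 200 or 201
            return response.json()       # dataset items arrive as a JSON list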
@@ -656,6 +575,8 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                     results = response.json(); content = None
                     if isinstance(results, list) and len(results) > 0:
                         item = results[0]
+                        # Optional: Re-enable for deep debugging if needed
+                        # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
                         content = None # Reset content
 
                         # --- REFINED PARSING LOGIC (Handles output from various actors) ---
@@ -669,20 +590,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                             logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                             content = item["markdown"]
                         elif "captions" in item and isinstance(item["captions"], str):
+                            # This case might still happen if the actor *sometimes* returns string
                            logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                            content = item["captions"]
-                        # --- MODIFIED LIST HANDLING FOR CAPTIONS ---
+                        # --- MODIFIED LIST HANDLING FOR CAPTIONS (Checks Documentation Examples) ---
                         elif "captions" in item and isinstance(item["captions"], list):
-                            logger.info(f"{log_prefix} Found 'captions' field as a list. Processing...")
+                            logger.info(f"{log_prefix} Found 'captions' field as a list. Processing based on documentation examples...")
                             transcript_parts = []
                             if not item["captions"]: # Handle empty list case
                                 logger.warning(f"{log_prefix} 'captions' field is an empty list.")
                             else:
+                                # Check the type of the *first* element to decide parsing strategy
                                 first_element = item["captions"][0]
                                 if isinstance(first_element, str):
+                                    # Assume list of strings (Example 1 in docs)
                                    logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
                                    transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
                                 elif isinstance(first_element, dict) and "text" in first_element:
+                                    # Assume list of dictionaries (Example 2 in docs)
                                    logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
                                    transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
                                 else:
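The captions handling above covers the two dataset shapes the actor can return, a list of plain strings and a list of {"text": ...} dicts. The same normalization as a small pure helper; the single-space join is an assumption, since the hunk does not show how transcript_parts are combined:

    # Pure-function sketch of the captions normalization (illustration only).
    from typing import Any, List, Optional

    def captions_to_text(captions: Any) -> Optional[str]:
        if isinstance(captions, str):
            return captions.strip() or None          # already plain text
        if isinstance(captions, list) and captions:
            first = captions[0]
            if isinstance(first, str):               # list-of-strings shape
                parts: List[str] = [seg for seg in captions if isinstance(seg, str)]
            elif isinstance(first, dict):            # list of {"text": ...} dicts
                parts = [seg.get("text", "") for seg in captions
                         if isinstance(seg, dict) and "text" in seg]
            else:
                return None                          # unknown element type
            joined = " ".join(p.strip() for p in parts if p and p.strip())
            return joined or None
        return None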
@@ -710,15 +635,18 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                             logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
                             return content.strip()
                         else:
+                            # Log failure after trying all parsing methods
                            content_len = len(content) if content and isinstance(content, str) else 0
                            item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
                            logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
                            return None # Return None if no valid content found
                     else:
+                        # Handle empty dataset list '[]' or non-list response
                        logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
                        return None
                 # --- End of success processing logic ---
                 except json.JSONDecodeError:
+                    # Check if the raw text looks like a transcript if JSON fails
                    raw_text = response.text
                    if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
                        logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
@@ -731,6 +659,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                        return None
             # Error handling for API call itself
             elif response.status_code == 400:
+                # Log the specific error message from the API response if available
                error_msg = response.text[:200] # Default
                try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
                except Exception: pass
@@ -742,12 +671,22 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                return None
    # Error handling for network/client issues
    except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
-    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None
+    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None # Should be caught by status code checks, but good practice
    except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
    except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
 
-
-
+async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
+    """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
+    global APIFY_ACTOR_ID
+    # The specific run_input logic is now handled within _run_apify_actor_for_web_content
+    # when it detects the actor_id matches APIFY_ACTOR_ID
+    logger.debug(f"[get_transcript_via_apify] Calling generic runner for URL: {video_url}")
+    return await _run_apify_actor_for_web_content(
+        url=video_url, # Pass video_url as the 'url' parameter
+        api_token=api_token,
+        actor_id=APIFY_ACTOR_ID,
+        actor_name="Apify YT" # Keep specific name for logging clarity
+    )
 
 async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
     """Fallback 4: Fetches website content using Apify Website Content Crawler."""
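One caveat with this last hunk: the file now appears to contain two definitions of get_transcript_via_apify, the "Fallback YT 1" version earlier in the module and this "Fallback YT 2" version, while the old comment insisting on a unique definition was deleted. Python binds def at execution time, so the later definition silently replaces the earlier one:

    # Demonstration of the shadowing: a later def rebinds the module-level name,
    # so only the second get_transcript_via_apify is live once the module loads.
    def fetch() -> str:
        return "first definition"

    def fetch() -> str:  # rebinding: this definition wins
        return "second definition"

    assert fetch() == "second definition"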