Spaces:

fmab777
/

telegram-summary-bot

Running

App Files Files Community

fmab777 committed on Apr 22

Commit

f25ac38

verified ·

1 Parent(s): f406730

Update main.py

Browse files

Files changed (1) hide show

main.py +117 -78

main.py CHANGED Viewed

@@ -9,6 +9,9 @@ import contextlib
 import traceback
 import urllib.parse
 from typing import Optional, Dict, Any, Tuple
 # --- Frameworks ---
 from starlette.applications import Starlette
@@ -101,6 +104,10 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
 RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
 WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
 # --- Model Configurations (Specific April 2025 - Updated Order) ---
 # New Model Priority:
 # 1. Gemini 2.5 Flash Preview (NEW - Using specific date variant)
@@ -239,6 +246,53 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
         return None
     except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
 # --- YouTube fallback 1: Apify default transcript actor (unique definition – delete any duplicates) ---
 async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
     """
@@ -302,85 +356,68 @@ async def get_transcript_via_apify_structured_extractor(video_url: str, api_toke
 async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
     """
-    Fetches YouTube transcript using multiple fallback methods in the specified order:
-    1. Apify Default Actor (1s7eXiaukVuOr4Ueg)
-    2. Apify Structured Actor (gpjTCWkGZS1lHc9pR)
-    3. Supadata API
     """
     global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
-    if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
-    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url}) - NEW Fallback Order")
     transcript_text: Optional[str] = None
-    # --- Primary Method: REMOVED (youtube-transcript-api) ---
-    # logger.info("[Primary YT] Attempting youtube-transcript-api...") # Removed
-    # --- Fallback 1: Apify Default YT Actor (1s7eXiaukVuOr4Ueg) ---
-    if transcript_text is None:
-        logger.info("[Fallback YT 1] Trying Apify REST API (Default YT Actor)...") # <<<< NEW Fallback 1
-        if _apify_token_exists:
-            transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
-            if transcript_text:
-                logger.info(f"[Fallback YT 1] Success via Apify Default YT Actor for {video_url}") # <<<< UPDATED NUMBER
-                return transcript_text # Return on success
-            else:
-                logger.warning(f"[Fallback YT 1] Apify Default YT Actor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
         else:
-            logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping Default YT Actor.") # <<<< UPDATED NUMBER
-    # --- Fallback 2: Apify Structured Transcript Extractor (gpjTCWkGZS1lHc9pR) ---
-    if transcript_text is None:
-        logger.info("[Fallback YT 2] Trying Apify Structured Transcript Extractor...") # <<<< NEW Fallback 2
-        if _apify_token_exists:
-            transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
-            if transcript_text:
-                logger.info(f"[Fallback YT 2] Success via Apify Structured Extractor for {video_url}") # <<<< UPDATED NUMBER
-                return transcript_text # Return on success
-            else:
-                logger.warning(f"[Fallback YT 2] Apify Structured Extractor failed or no content for {video_url}.") # <<<< UPDATED NUMBER
         else:
-            logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping Apify Structured Extractor.") # <<<< UPDATED NUMBER
-    # --- Fallback 3: Supadata API ---
-    if transcript_text is None:
-        logger.info("[Fallback YT 3] Trying Supadata API...") # <<<< NEW Fallback 3
-        if SUPADATA_API_KEY:
-            transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
-            if transcript_text:
-                logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}") # <<<< UPDATED NUMBER
-                return transcript_text # Return on success
-            else:
-                logger.warning(f"[Fallback YT 3] Supadata failed or no content for {video_id}.") # <<<< UPDATED NUMBER
         else:
-            logger.warning("[Fallback YT 3] Supadata API key unavailable. Skipping.") # <<<< UPDATED NUMBER
-    # --- Final Outcome ---
-    if transcript_text is None:
-        logger.error(f"All fallback methods failed for YT transcript: {video_id}")
-        return None # Explicitly return None if all failed
-    # This line should only be reached if a fallback succeeded but wasn't returned early (shouldn't happen).
-    return transcript_text
-async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
-    """Directly fetches URL content using httpx."""
-    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
-    try:
-        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
-            logger.debug(f"[Web Scrape Direct] Sending GET request to {url}")
-            response = await client.get(url)
-            logger.debug(f"[Web Scrape Direct] Received response {response.status_code} from {url}")
-            response.raise_for_status()
-            content_type = response.headers.get('content-type', '').lower()
-            if 'html' not in content_type: logger.warning(f"[Web Scrape Direct] Non-HTML content type received from {url}: {content_type}"); return None
-            try: return response.text
-            except Exception as e: logger.error(f"[Web Scrape Direct] Error decoding response text for {url}: {e}"); return None
-    except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Direct] HTTP error {e.response.status_code} fetching {url}: {e}")
-    except httpx.TimeoutException: logger.error(f"[Web Scrape Direct] Timeout error fetching {url}")
-    except httpx.TooManyRedirects: logger.error(f"[Web Scrape Direct] Too many redirects fetching {url}")
-    except httpx.RequestError as e: logger.error(f"[Web Scrape Direct] Request error fetching {url}: {e}")
-    except Exception as e: logger.error(f"[Web Scrape Direct] Unexpected error fetching {url}: {e}", exc_info=True)
     return None
 async def get_website_content(url: str) -> Optional[str]:
@@ -562,14 +599,16 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
         }
         logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID}) with Residential Proxy")
     elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
-        # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
-        run_input = {
-            "urls": url, # <<< STRING format needed here, not list
-             # --- ADDED RESIDENTIAL PROXY CONFIG ---
-            "proxyConfiguration": {
-                "useApifyProxy": True,
-                "apifyProxyGroups": ["RESIDENTIAL"]
-            }
             # --- END ADDED PROXY CONFIG ---
         }
         logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID}) with Residential Proxy")

 import traceback
 import urllib.parse
 from typing import Optional, Dict, Any, Tuple
+import tempfile, os, asyncio
+from yt_dlp import YoutubeDL
+from huggingface_hub import InferenceClient
 # --- Frameworks ---
 from starlette.applications import Starlette
 RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
 WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
+HUGGINGFACE_HUB_TOKEN = os.environ.get("HUGGINGFACE_HUB_TOKEN")
+# if you don’t set a token it still works on public models (with lower rate limits)
+_inference_client = InferenceClient(token=HUGGINGFACE_HUB_TOKEN)
 # --- Model Configurations (Specific April 2025 - Updated Order) ---
 # New Model Priority:
 # 1. Gemini 2.5 Flash Preview (NEW - Using specific date variant)
         return None
     except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
+# ——— new function ———
+async def get_transcript_via_whisper_inference(video_url: str) -> Optional[str]:
+    """
+    Fallback YT 4: Download audio via yt-dlp and transcribe with HF's hosted Whisper.
+
+    Args:
+        video_url: URL of the video whose audio track should be transcribed.
+
+    Returns:
+        The stripped transcript text, or None if the download or the
+        transcription failed or produced no text.
+    """
+    def _download_and_transcribe() -> Optional[str]:
+        # yt-dlp and the HF client are both blocking, so the whole pipeline runs
+        # in a worker thread instead of stalling the event loop.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            ydl_opts = {
+                "format": "bestaudio/best",
+                # Let yt-dlp create the file itself: by default it skips the
+                # download when the target file already exists, so a pre-created
+                # (empty) temp file would be uploaded untouched. The real
+                # extension (webm/m4a/...) is also preserved this way.
+                "outtmpl": os.path.join(tmp_dir, "audio.%(ext)s"),
+                "quiet": True,
+                "no_warnings": True,
+            }
+            with YoutubeDL(ydl_opts) as ydl:
+                ydl.download([video_url])
+            candidates = [
+                os.path.join(tmp_dir, name)
+                for name in os.listdir(tmp_dir)
+                if not name.endswith(".part")
+            ]
+            if not candidates:
+                return None
+            # Only one file is expected; pick the largest in case of side files.
+            audio_path = max(candidates, key=os.path.getsize)
+            # The client reads the file from the path, so no handle is left open.
+            # This must happen before the temporary directory is removed.
+            result = _inference_client.automatic_speech_recognition(
+                audio_path,
+                model="openai/whisper-small",
+            )
+        # Depending on the huggingface_hub version the result is a plain string,
+        # a mapping with a "text" key, or an object with a `.text` attribute.
+        if isinstance(result, str):
+            return result
+        text = getattr(result, "text", None)
+        if text is None and isinstance(result, dict):
+            text = result.get("text")
+        return text
+    try:
+        transcript = await asyncio.to_thread(_download_and_transcribe)
+    except Exception as e:
+        # Best-effort fallback: any download/inference/cleanup failure means "no transcript".
+        logger.error(f"[Fallback YT 4] Whisper inference error: {e}", exc_info=True)
+        return None
+    if isinstance(transcript, str) and transcript.strip():
+        logger.info(f"[Fallback YT 4] Whisper inference succeeded (len {len(transcript)})")
+        return transcript.strip()
+    logger.warning("[Fallback YT 4] Whisper inference returned empty transcript")
+    return None
 # --- YouTube fallback 1: Apify default transcript actor (unique definition – delete any duplicates) ---
 async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
     """
 async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
     """
+    Fetches YouTube transcript using multiple fallback methods:
+     1. Apify Default Actor (Fallback 1)
+     2. Apify Structured Actor (Fallback 2)
+     3. Supadata API            (Fallback 3)
+     4. Whisper via HF Inference (Fallback 4)
+
+    The methods are tried in this order and the first non-empty transcript is
+    returned immediately. Fallbacks 1 and 2 are skipped when
+    `_apify_token_exists` is falsy, Fallback 3 is skipped when
+    `SUPADATA_API_KEY` is falsy, and Fallback 4 is always attempted.
+
+    Args:
+        video_id: Video ID; passed to Supadata and used in log messages.
+        video_url: Video URL; passed to both Apify actors and the Whisper fallback.
+
+    Returns:
+        The transcript from the first method that yields one, or None when
+        `video_id` is empty or every method fails.
     """
+    # NOTE: these module-level names are only read in this function, never reassigned.
     global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
+    if not video_id:
+        logger.error("get_youtube_transcript: No video_id provided")
+        return None
+    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
     transcript_text: Optional[str] = None
+    # --- Fallback 1: Apify Default Actor ---
+    logger.info("[Fallback YT 1] Trying Apify Default Actor")
+    if _apify_token_exists:
+        transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
+        if transcript_text:
+            logger.info(f"[Fallback YT 1] Success via Apify Default Actor for {video_url}")
+            return transcript_text
         else:
+            logger.warning(f"[Fallback YT 1] Apify Default Actor failed or returned no content for {video_url}")
+    else:
+        logger.warning("[Fallback YT 1] APIFY_API_TOKEN unavailable. Skipping Apify Default Actor.")
+    # --- Fallback 2: Apify Structured Actor ---
+    logger.info("[Fallback YT 2] Trying Apify Structured Actor")
+    if _apify_token_exists:
+        transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
+        if transcript_text:
+            logger.info(f"[Fallback YT 2] Success via Apify Structured Actor for {video_url}")
+            return transcript_text
         else:
+            logger.warning(f"[Fallback YT 2] Apify Structured Actor failed or returned no content for {video_url}")
+    else:
+        logger.warning("[Fallback YT 2] APIFY_API_TOKEN unavailable. Skipping Apify Structured Actor.")
+    # --- Fallback 3: Supadata API ---
+    logger.info("[Fallback YT 3] Trying Supadata API")
+    if SUPADATA_API_KEY:
+        transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
+        if transcript_text:
+            logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}")
+            return transcript_text
         else:
+            logger.warning(f"[Fallback YT 3] Supadata failed or returned no content for {video_id}")
+    else:
+        logger.warning("[Fallback YT 3] SUPADATA_API_KEY unavailable. Skipping Supadata API.")
+    # --- Fallback 4: Whisper via HF Inference ---
+    # No credential gate here: this fallback is attempted unconditionally.
+    logger.info("[Fallback YT 4] Trying audio transcription via Whisper Inference API")
+    transcript_text = await get_transcript_via_whisper_inference(video_url)
+    if transcript_text:
+        logger.info(f"[Fallback YT 4] Success via Whisper Inference for {video_id}")
+        return transcript_text
+    else:
+        logger.warning(f"[Fallback YT 4] Whisper Inference failed or returned empty for {video_id}")
+    # --- All methods failed ---
+    logger.error(f"All fallback methods failed for YT transcript: {video_id}")
     return None
 async def get_website_content(url: str) -> Optional[str]:
         }
         logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID}) with Residential Proxy")
     elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
+    # Input specific to the Structured YT Actor – wrap in a list even for a single URL
+    run_input = {
+        "urls": [ url ],                         # ← wrap your URL in a list
+        "proxyConfiguration": {
+            "useApifyProxy": True,
+            "apifyProxyGroups": ["RESIDENTIAL"],
+        },
+        "maxRetries": 5,
+    }
+    logger.debug(f"{log_prefix} Using list input format for Structured YT Actor ({actor_id})")
             # --- END ADDED PROXY CONFIG ---
         }
         logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID}) with Residential Proxy")