Spaces:

fmab777
/

telegram-summary-bot

Running

App Files Files Community

fmab777 committed on Apr 4

Commit

fc34c28

verified ·

1 Parent(s): 08e0d82

Update main.py

Browse files

Files changed (1) hide show

main.py +54 -42

main.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# main.py (Changing Default Apify Actor ID)
 import os
 import re
 import logging
@@ -71,7 +71,7 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
 WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
 OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free")
-# *** FIX: Change default actor back to pocesar/youtube-scraper ***
 APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "pocesar/youtube-scraper")
 if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
@@ -84,7 +84,7 @@ if not WEBHOOK_SECRET: logger.info("Optional secret 'WEBHOOK_SECRET' not found.
 logger.info("Secret loading and configuration check finished.")
 logger.info(f"Using OpenRouter Model: {OPENROUTER_MODEL}")
-logger.info(f"Using Apify Actor (via REST): {APIFY_ACTOR_ID}") # Will show pocesar now if default
 _apify_token_exists = bool(APIFY_API_TOKEN)
@@ -160,24 +160,24 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
         return None
     except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
 async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
     """Fetches YouTube transcript using Apify REST API (async start + poll + dataset fetch)."""
-    global APIFY_ACTOR_ID # Uses the globally defined actor ID
     if not video_url: logger.error("[Apify Async] No video_url provided"); return None
     if not api_token: logger.error("[Apify Async] API token missing."); return None
     logger.info(f"[Apify Async] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
     start_run_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/runs"
     params_base = {"token": api_token}
-    # Adjust payload based on the specific actor (pocesar vs karamelo) if necessary
-    # This payload seems generic enough for pocesar/youtube-scraper too, might need adjustment
     payload = {
-        "startUrls": [{"url": video_url}], # pocesar often uses startUrls
-        # "urls": [video_url], # karamelo used urls
-        "proxyConfiguration": {"useApifyProxy": True}, # Common setting
-        "subtitles": True, # Explicitly request if pocesar supports it like this
-        "maxResultStreams": 0, "maxResults": 1, # Limit results
-        # Check pocesar/youtube-scraper docs for exact options
     }
     headers = {"Content-Type": "application/json"}
@@ -189,7 +189,9 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
     try:
         async with httpx.AsyncClient(timeout=30.0) as client:
             # 1. Start the run
-            logger.debug(f"[Apify Async] Starting actor run for {video_url} using actor {APIFY_ACTOR_ID}")
             response_start = await client.post(start_run_endpoint, headers=headers, params=params_base, json=payload)
             logger.debug(f"[Apify Async] Start run status: {response_start.status_code}")
@@ -202,10 +204,7 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
                     logger.info(f"[Apify Async] Run started. Run ID: {run_id}, Dataset ID: {dataset_id}")
                 except Exception as e: logger.error(f"[Apify Async] Error parsing start run response: {e}. Response: {response_start.text[:200]}", exc_info=True); return None
             else:
-                # Log specific error if available from Apify response
-                error_info = ""
-                try: error_info = response_start.json().get("error", {}).get("message", "")
-                except Exception: pass
                 logger.error(f"[Apify Async] Failed to start run. Status: {response_start.status_code}. Error: {error_info} Resp: {response_start.text[:200]}")
                 return None
@@ -229,7 +228,7 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
             # 3. Fetch dataset items
             logger.info(f"[Apify Async] Run {run_id} succeeded. Fetching items from dataset {dataset_id}")
             dataset_endpoint = f"https://api.apify.com/v2/datasets/{dataset_id}/items"
-            params_dataset = {"token": api_token, "format": "json", "limit": 5}
             response_dataset = await client.get(dataset_endpoint, params=params_dataset)
             logger.debug(f"[Apify Async] Dataset fetch status: {response_dataset.status_code}")
             response_dataset.raise_for_status()
@@ -237,26 +236,26 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
             results = response_dataset.json()
             if isinstance(results, list) and len(results) > 0:
                 item = results[0]
-                # --- Adapt parsing for pocesar/youtube-scraper ---
-                # This actor often puts the transcript directly under a 'subtitles' key,
-                # possibly as a single string or sometimes structured.
-                content = item.get("subtitles") # Check 'subtitles' first
-                if not content: content = item.get("text") # Fallback check
-                if not content: content = item.get("transcript") # Another fallback
-                # If subtitles is a list of dicts (like karamelo's output sometimes)
-                if isinstance(content, list) and len(content) > 0 and isinstance(content[0], dict) and 'lines' in content[0]:
-                     logger.info("[Apify Async] Processing structured subtitles format.")
-                     content = " ".join(line.get("text", "") for line in content[0].get('lines', []) if line.get("text"))
-                elif isinstance(content, list): # Handle simple list of strings if found
-                     logger.info("[Apify Async] Processing list of strings format.")
-                     content = " ".join(content)
-                # Final check if we have a non-empty string
                 if content and isinstance(content, str):
                     logger.info(f"[Apify Async] Success via ASYNC REST for {video_url}. Length: {len(content)}")
                     return content.strip()
-                else: logger.warning(f"[Apify Async] Dataset item found but transcript empty/not found for {video_url}. Item: {item}"); return None
             else: logger.warning(f"[Apify Async] Dataset {dataset_id} was empty for {video_url}. Response: {results}"); return None
     except httpx.TimeoutException as e: logger.error(f"[Apify Async] Timeout during API interaction for {video_url}: {e}"); return None
@@ -264,7 +263,7 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
     except httpx.RequestError as e: logger.error(f"[Apify Async] Request error during API interaction for {video_url}: {e}"); return None
     except Exception as e: logger.error(f"[Apify Async] Unexpected error during Apify Async REST call for {video_url}: {e}", exc_info=True); return None
-# (get_youtube_transcript, get_website_content, get_website_content_via_api, generate_summary remain the same)
 async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
     global SUPADATA_API_KEY, APIFY_API_TOKEN
     if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
@@ -346,6 +345,8 @@ async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
     except httpx.RequestError as e: logger.error(f"[Fallback Web API] Request error connecting to urltotext.com API for {url}: {e}"); return None
     except Exception as e: logger.error(f"[Fallback Web API] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
 async def generate_summary(text: str, summary_type: str) -> str:
     global OPENROUTER_API_KEY, OPENROUTER_MODEL
     logger.info(f"Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
@@ -356,13 +357,19 @@ async def generate_summary(text: str, summary_type: str) -> str:
     if len(text) > MAX_INPUT_LENGTH: logger.warning(f"Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH}). Truncating."); text = text[:MAX_INPUT_LENGTH] + "... (Content truncated)"
     full_prompt = f"{prompt}\n\n{text}"
     headers = { "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json" }; payload = { "model": OPENROUTER_MODEL, "messages": [{"role": "user", "content": full_prompt}] }; openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
-    api_timeout = 180.0; response = None
     try:
-        async with httpx.AsyncClient(timeout=api_timeout) as client:
-            logger.info(f"Sending request to OpenRouter ({OPENROUTER_MODEL}) with timeout {api_timeout}s...")
             response = await client.post(openrouter_api_endpoint, headers=headers, json=payload)
             if response: logger.info(f"Received response from OpenRouter. Status code: {response.status_code}")
             else: logger.error("No response received from OpenRouter after await completed (unexpected)."); return "Sorry, communication with the AI service failed unexpectedly."
             if response.status_code == 200:
                 try:
                     data = response.json()
@@ -381,13 +388,19 @@ async def generate_summary(text: str, summary_type: str) -> str:
             elif response.status_code == 429: logger.warning("OpenRouter Rate Limit Exceeded (429)."); return "Sorry, AI model is busy. Try again."
             elif response.status_code == 500: logger.error(f"OpenRouter Internal Server Error (500). Resp:{response.text[:500]}"); return "Sorry, AI service internal error."
             else: logger.error(f"Unexpected status {response.status_code} from OpenRouter. Resp:{response.text[:500]}"); return f"Sorry, AI service returned unexpected status ({response.status_code})."
-    except httpx.TimeoutException: logger.error(f"Timeout error ({api_timeout}s) connecting to OpenRouter API."); return f"Sorry, the request to the AI model timed out after {api_timeout} seconds. The content might be too long or the service busy. Please try again later or with shorter content."
     except httpx.RequestError as e: logger.error(f"Request error connecting to OpenRouter API: {e}"); return "Sorry, there was an error connecting to the AI model service."
     except Exception as e:
         logger.error(f"Unexpected error in generate_summary (OpenRouter request phase): {e}", exc_info=True)
         if response: logger.error(f"--> Last response status before error: {response.status_code}")
         return "Sorry, an unexpected error occurred while trying to generate the summary."
 # --- Background Task Processing ---
 # (process_summary_task remains the same)
 async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
@@ -521,6 +534,7 @@ async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> N
     logger.error("Exception while handling an update:", exc_info=context.error)
 # --- Bot Setup ---
 async def setup_bot_config() -> Application:
     logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
     if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
@@ -576,7 +590,6 @@ async def lifespan(app: Starlette):
         else: logger.info("PTB application not initialized or failed.")
         logger.info("ASGI Lifespan: Shutdown complete.")
 # --- Starlette Route Handlers ---
 # (health_check and telegram_webhook remain the same)
 async def health_check(request: Request) -> PlainTextResponse:
@@ -601,7 +614,6 @@ async def telegram_webhook(request: Request) -> Response:
     except json.JSONDecodeError: logger.error("Webhook invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
     except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # OK despite error
 # --- Create Starlette ASGI Application ---
 # (app definition remains the same)
 app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )

+# main.py (Updating Apify Actor ID format, simplifying payload, explicit read timeout)
 import os
 import re
 import logging
 WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
 OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free")
+# *** FIX: Try actor ID with '/' instead of '~' ***
 APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "pocesar/youtube-scraper")
 if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
 logger.info("Secret loading and configuration check finished.")
 logger.info(f"Using OpenRouter Model: {OPENROUTER_MODEL}")
+logger.info(f"Using Apify Actor (via REST): {APIFY_ACTOR_ID}")
 _apify_token_exists = bool(APIFY_API_TOKEN)
         return None
     except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
 async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
     """Fetches YouTube transcript using Apify REST API (async start + poll + dataset fetch)."""
+    global APIFY_ACTOR_ID
     if not video_url: logger.error("[Apify Async] No video_url provided"); return None
     if not api_token: logger.error("[Apify Async] API token missing."); return None
     logger.info(f"[Apify Async] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
+    # Use the standard /runs endpoint to start
     start_run_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/runs"
     params_base = {"token": api_token}
+    # *** FIX: Simplified payload, ensure it matches pocesar/youtube-scraper expectations ***
     payload = {
+        "startUrls": [{"url": video_url}],
+        # "proxyConfiguration": {"useApifyProxy": True}, # Keep proxy if needed
+        # Remove other less critical options for simplicity during debugging
+        # "subtitles": True, # Might be implicitly true or named differently
+        # "maxResults": 1
     }
     headers = {"Content-Type": "application/json"}
     try:
         async with httpx.AsyncClient(timeout=30.0) as client:
             # 1. Start the run
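+            # *** FIX: Log exact URL, headers (partial), and payload before sending ***
+            # NOTE(review): params_base holds the Apify API token, so logging "Params" below writes the token to the debug log; redact it before enabling DEBUG in production.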
+            log_headers = {k: v for k, v in headers.items() if k.lower() != 'authorization'} # Don't log token implicitly
+            logger.debug(f"[Apify Async] POST Request Details:\nURL: {start_run_endpoint}\nParams: {params_base}\nHeaders: {log_headers}\nPayload: {json.dumps(payload)}")
             response_start = await client.post(start_run_endpoint, headers=headers, params=params_base, json=payload)
             logger.debug(f"[Apify Async] Start run status: {response_start.status_code}")
                     logger.info(f"[Apify Async] Run started. Run ID: {run_id}, Dataset ID: {dataset_id}")
                 except Exception as e: logger.error(f"[Apify Async] Error parsing start run response: {e}. Response: {response_start.text[:200]}", exc_info=True); return None
             else:
+                error_info = ""; try: error_info = response_start.json().get("error", {}).get("message", "") except Exception: pass
                 logger.error(f"[Apify Async] Failed to start run. Status: {response_start.status_code}. Error: {error_info} Resp: {response_start.text[:200]}")
                 return None
             # 3. Fetch dataset items
             logger.info(f"[Apify Async] Run {run_id} succeeded. Fetching items from dataset {dataset_id}")
             dataset_endpoint = f"https://api.apify.com/v2/datasets/{dataset_id}/items"
+            params_dataset = {"token": api_token, "format": "json", "limit": 5} # Limit items fetched
             response_dataset = await client.get(dataset_endpoint, params=params_dataset)
             logger.debug(f"[Apify Async] Dataset fetch status: {response_dataset.status_code}")
             response_dataset.raise_for_status()
             results = response_dataset.json()
             if isinstance(results, list) and len(results) > 0:
                 item = results[0]
+                # Try different keys based on pocesar/youtube-scraper output possibilities
+                content = item.get("subtitles") # Primary key for this actor usually
+                if not content: content = item.get("text")
+                if not content: content = item.get("transcript")
+                if isinstance(content, list): # Handle if it returns list of lines/segments
+                     logger.info("[Apify Async] Processing list format from subtitles.")
+                     # Check if list of dicts with 'text' key
+                     if len(content) > 0 and isinstance(content[0], dict) and 'text' in content[0]:
+                         content = " ".join(line.get("text", "") for line in content if line.get("text"))
+                     # Check if list of simple strings
+                     elif len(content) > 0 and isinstance(content[0], str):
+                         content = " ".join(content)
+                     else: # Unknown list format
+                          content = None
                 if content and isinstance(content, str):
                     logger.info(f"[Apify Async] Success via ASYNC REST for {video_url}. Length: {len(content)}")
                     return content.strip()
+                else: logger.warning(f"[Apify Async] Dataset item found but transcript empty/invalid format for {video_url}. Item: {item}"); return None
             else: logger.warning(f"[Apify Async] Dataset {dataset_id} was empty for {video_url}. Response: {results}"); return None
     except httpx.TimeoutException as e: logger.error(f"[Apify Async] Timeout during API interaction for {video_url}: {e}"); return None
     except httpx.RequestError as e: logger.error(f"[Apify Async] Request error during API interaction for {video_url}: {e}"); return None
     except Exception as e: logger.error(f"[Apify Async] Unexpected error during Apify Async REST call for {video_url}: {e}", exc_info=True); return None
+# (get_youtube_transcript, get_website_content, get_website_content_via_api remain the same)
 async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
     global SUPADATA_API_KEY, APIFY_API_TOKEN
     if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
     except httpx.RequestError as e: logger.error(f"[Fallback Web API] Request error connecting to urltotext.com API for {url}: {e}"); return None
     except Exception as e: logger.error(f"[Fallback Web API] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
+# --- Summarization Function ---
 async def generate_summary(text: str, summary_type: str) -> str:
     global OPENROUTER_API_KEY, OPENROUTER_MODEL
     logger.info(f"Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
     if len(text) > MAX_INPUT_LENGTH: logger.warning(f"Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH}). Truncating."); text = text[:MAX_INPUT_LENGTH] + "... (Content truncated)"
     full_prompt = f"{prompt}\n\n{text}"
     headers = { "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json" }; payload = { "model": OPENROUTER_MODEL, "messages": [{"role": "user", "content": full_prompt}] }; openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
+    # *** FIX: Define an explicit httpx.Timeout object (default 15s for connect; separate read/write/pool limits) ***
+    api_timeouts = httpx.Timeout(15.0, read=180.0, write=15.0, pool=60.0) # Connect, Read, Write, Pool
+    response = None
     try:
+        # *** FIX: Pass timeouts object to AsyncClient ***
+        async with httpx.AsyncClient(timeout=api_timeouts) as client:
+            logger.info(f"Sending request to OpenRouter ({OPENROUTER_MODEL}) with read timeout {api_timeouts.read}s...")
             response = await client.post(openrouter_api_endpoint, headers=headers, json=payload)
             if response: logger.info(f"Received response from OpenRouter. Status code: {response.status_code}")
             else: logger.error("No response received from OpenRouter after await completed (unexpected)."); return "Sorry, communication with the AI service failed unexpectedly."
+            # Process response (status checks etc.)
             if response.status_code == 200:
                 try:
                     data = response.json()
             elif response.status_code == 429: logger.warning("OpenRouter Rate Limit Exceeded (429)."); return "Sorry, AI model is busy. Try again."
             elif response.status_code == 500: logger.error(f"OpenRouter Internal Server Error (500). Resp:{response.text[:500]}"); return "Sorry, AI service internal error."
             else: logger.error(f"Unexpected status {response.status_code} from OpenRouter. Resp:{response.text[:500]}"); return f"Sorry, AI service returned unexpected status ({response.status_code})."
+    except httpx.ReadTimeout: # Catch specific ReadTimeout
+        logger.error(f"Read Timeout error ({api_timeouts.read}s) waiting for OpenRouter API response.")
+        return f"Sorry, the request to the AI model timed out after {api_timeouts.read} seconds while waiting for a response. The content might be too long or the service busy. Please try again later or with shorter content."
+    except httpx.TimeoutException as e: # Catch other timeouts (connect, write, pool)
+        logger.error(f"Timeout error ({type(e)}) connecting to/writing to OpenRouter API: {e}")
+        return "Sorry, the request to the AI model timed out. Please try again."
     except httpx.RequestError as e: logger.error(f"Request error connecting to OpenRouter API: {e}"); return "Sorry, there was an error connecting to the AI model service."
     except Exception as e:
         logger.error(f"Unexpected error in generate_summary (OpenRouter request phase): {e}", exc_info=True)
         if response: logger.error(f"--> Last response status before error: {response.status_code}")
         return "Sorry, an unexpected error occurred while trying to generate the summary."
 # --- Background Task Processing ---
 # (process_summary_task remains the same)
 async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
     logger.error("Exception while handling an update:", exc_info=context.error)
 # --- Bot Setup ---
+# (setup_bot_config remains the same)
 async def setup_bot_config() -> Application:
     logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
     if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
         else: logger.info("PTB application not initialized or failed.")
         logger.info("ASGI Lifespan: Shutdown complete.")
 # --- Starlette Route Handlers ---
 # (health_check and telegram_webhook remain the same)
 async def health_check(request: Request) -> PlainTextResponse:
     except json.JSONDecodeError: logger.error("Webhook invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
     except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # OK despite error
 # --- Create Starlette ASGI Application ---
 # (app definition remains the same)
 app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )