fmab777 committed on
Commit
fc34c28
·
verified ·
1 Parent(s): 08e0d82

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +54 -42
main.py CHANGED
@@ -1,4 +1,4 @@
1
- # main.py (Changing Default Apify Actor ID)
2
  import os
3
  import re
4
  import logging
@@ -71,7 +71,7 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
71
  WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
72
 
73
  OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free")
74
- # *** FIX: Change default actor back to pocesar/youtube-scraper ***
75
  APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "pocesar/youtube-scraper")
76
 
77
  if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
@@ -84,7 +84,7 @@ if not WEBHOOK_SECRET: logger.info("Optional secret 'WEBHOOK_SECRET' not found.
84
 
85
  logger.info("Secret loading and configuration check finished.")
86
  logger.info(f"Using OpenRouter Model: {OPENROUTER_MODEL}")
87
- logger.info(f"Using Apify Actor (via REST): {APIFY_ACTOR_ID}") # Will show pocesar now if default
88
 
89
  _apify_token_exists = bool(APIFY_API_TOKEN)
90
 
@@ -160,24 +160,24 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
160
  return None
161
  except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
162
 
 
163
  async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
164
  """Fetches YouTube transcript using Apify REST API (async start + poll + dataset fetch)."""
165
- global APIFY_ACTOR_ID # Uses the globally defined actor ID
166
  if not video_url: logger.error("[Apify Async] No video_url provided"); return None
167
  if not api_token: logger.error("[Apify Async] API token missing."); return None
168
  logger.info(f"[Apify Async] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
169
 
 
170
  start_run_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/runs"
171
  params_base = {"token": api_token}
172
- # Adjust payload based on the specific actor (pocesar vs karamelo) if necessary
173
- # This payload seems generic enough for pocesar/youtube-scraper too, might need adjustment
174
  payload = {
175
- "startUrls": [{"url": video_url}], # pocesar often uses startUrls
176
- # "urls": [video_url], # karamelo used urls
177
- "proxyConfiguration": {"useApifyProxy": True}, # Common setting
178
- "subtitles": True, # Explicitly request if pocesar supports it like this
179
- "maxResultStreams": 0, "maxResults": 1, # Limit results
180
- # Check pocesar/youtube-scraper docs for exact options
181
  }
182
  headers = {"Content-Type": "application/json"}
183
 
@@ -189,7 +189,9 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
189
  try:
190
  async with httpx.AsyncClient(timeout=30.0) as client:
191
  # 1. Start the run
192
- logger.debug(f"[Apify Async] Starting actor run for {video_url} using actor {APIFY_ACTOR_ID}")
 
 
193
  response_start = await client.post(start_run_endpoint, headers=headers, params=params_base, json=payload)
194
  logger.debug(f"[Apify Async] Start run status: {response_start.status_code}")
195
 
@@ -202,10 +204,7 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
202
  logger.info(f"[Apify Async] Run started. Run ID: {run_id}, Dataset ID: {dataset_id}")
203
  except Exception as e: logger.error(f"[Apify Async] Error parsing start run response: {e}. Response: {response_start.text[:200]}", exc_info=True); return None
204
  else:
205
- # Log specific error if available from Apify response
206
- error_info = ""
207
- try: error_info = response_start.json().get("error", {}).get("message", "")
208
- except Exception: pass
209
  logger.error(f"[Apify Async] Failed to start run. Status: {response_start.status_code}. Error: {error_info} Resp: {response_start.text[:200]}")
210
  return None
211
 
@@ -229,7 +228,7 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
229
  # 3. Fetch dataset items
230
  logger.info(f"[Apify Async] Run {run_id} succeeded. Fetching items from dataset {dataset_id}")
231
  dataset_endpoint = f"https://api.apify.com/v2/datasets/{dataset_id}/items"
232
- params_dataset = {"token": api_token, "format": "json", "limit": 5}
233
  response_dataset = await client.get(dataset_endpoint, params=params_dataset)
234
  logger.debug(f"[Apify Async] Dataset fetch status: {response_dataset.status_code}")
235
  response_dataset.raise_for_status()
@@ -237,26 +236,26 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
237
  results = response_dataset.json()
238
  if isinstance(results, list) and len(results) > 0:
239
  item = results[0]
240
- # --- Adapt parsing for pocesar/youtube-scraper ---
241
- # This actor often puts the transcript directly under a 'subtitles' key,
242
- # possibly as a single string or sometimes structured.
243
- content = item.get("subtitles") # Check 'subtitles' first
244
- if not content: content = item.get("text") # Fallback check
245
- if not content: content = item.get("transcript") # Another fallback
246
-
247
- # If subtitles is a list of dicts (like karamelo's output sometimes)
248
- if isinstance(content, list) and len(content) > 0 and isinstance(content[0], dict) and 'lines' in content[0]:
249
- logger.info("[Apify Async] Processing structured subtitles format.")
250
- content = " ".join(line.get("text", "") for line in content[0].get('lines', []) if line.get("text"))
251
- elif isinstance(content, list): # Handle simple list of strings if found
252
- logger.info("[Apify Async] Processing list of strings format.")
253
- content = " ".join(content)
254
-
255
- # Final check if we have a non-empty string
256
  if content and isinstance(content, str):
257
  logger.info(f"[Apify Async] Success via ASYNC REST for {video_url}. Length: {len(content)}")
258
  return content.strip()
259
- else: logger.warning(f"[Apify Async] Dataset item found but transcript empty/not found for {video_url}. Item: {item}"); return None
260
  else: logger.warning(f"[Apify Async] Dataset {dataset_id} was empty for {video_url}. Response: {results}"); return None
261
 
262
  except httpx.TimeoutException as e: logger.error(f"[Apify Async] Timeout during API interaction for {video_url}: {e}"); return None
@@ -264,7 +263,7 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
264
  except httpx.RequestError as e: logger.error(f"[Apify Async] Request error during API interaction for {video_url}: {e}"); return None
265
  except Exception as e: logger.error(f"[Apify Async] Unexpected error during Apify Async REST call for {video_url}: {e}", exc_info=True); return None
266
 
267
- # (get_youtube_transcript, get_website_content, get_website_content_via_api, generate_summary remain the same)
268
  async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
269
  global SUPADATA_API_KEY, APIFY_API_TOKEN
270
  if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
@@ -346,6 +345,8 @@ async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
346
  except httpx.RequestError as e: logger.error(f"[Fallback Web API] Request error connecting to urltotext.com API for {url}: {e}"); return None
347
  except Exception as e: logger.error(f"[Fallback Web API] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
348
 
 
 
349
  async def generate_summary(text: str, summary_type: str) -> str:
350
  global OPENROUTER_API_KEY, OPENROUTER_MODEL
351
  logger.info(f"Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
@@ -356,13 +357,19 @@ async def generate_summary(text: str, summary_type: str) -> str:
356
  if len(text) > MAX_INPUT_LENGTH: logger.warning(f"Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH}). Truncating."); text = text[:MAX_INPUT_LENGTH] + "... (Content truncated)"
357
  full_prompt = f"{prompt}\n\n{text}"
358
  headers = { "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json" }; payload = { "model": OPENROUTER_MODEL, "messages": [{"role": "user", "content": full_prompt}] }; openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
359
- api_timeout = 180.0; response = None
 
 
 
 
360
  try:
361
- async with httpx.AsyncClient(timeout=api_timeout) as client:
362
- logger.info(f"Sending request to OpenRouter ({OPENROUTER_MODEL}) with timeout {api_timeout}s...")
 
363
  response = await client.post(openrouter_api_endpoint, headers=headers, json=payload)
364
  if response: logger.info(f"Received response from OpenRouter. Status code: {response.status_code}")
365
  else: logger.error("No response received from OpenRouter after await completed (unexpected)."); return "Sorry, communication with the AI service failed unexpectedly."
 
366
  if response.status_code == 200:
367
  try:
368
  data = response.json()
@@ -381,13 +388,19 @@ async def generate_summary(text: str, summary_type: str) -> str:
381
  elif response.status_code == 429: logger.warning("OpenRouter Rate Limit Exceeded (429)."); return "Sorry, AI model is busy. Try again."
382
  elif response.status_code == 500: logger.error(f"OpenRouter Internal Server Error (500). Resp:{response.text[:500]}"); return "Sorry, AI service internal error."
383
  else: logger.error(f"Unexpected status {response.status_code} from OpenRouter. Resp:{response.text[:500]}"); return f"Sorry, AI service returned unexpected status ({response.status_code})."
384
- except httpx.TimeoutException: logger.error(f"Timeout error ({api_timeout}s) connecting to OpenRouter API."); return f"Sorry, the request to the AI model timed out after {api_timeout} seconds. The content might be too long or the service busy. Please try again later or with shorter content."
 
 
 
 
 
385
  except httpx.RequestError as e: logger.error(f"Request error connecting to OpenRouter API: {e}"); return "Sorry, there was an error connecting to the AI model service."
386
  except Exception as e:
387
  logger.error(f"Unexpected error in generate_summary (OpenRouter request phase): {e}", exc_info=True)
388
  if response: logger.error(f"--> Last response status before error: {response.status_code}")
389
  return "Sorry, an unexpected error occurred while trying to generate the summary."
390
 
 
391
  # --- Background Task Processing ---
392
  # (process_summary_task remains the same)
393
  async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
@@ -521,6 +534,7 @@ async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> N
521
  logger.error("Exception while handling an update:", exc_info=context.error)
522
 
523
  # --- Bot Setup ---
 
524
  async def setup_bot_config() -> Application:
525
  logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
526
  if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
@@ -576,7 +590,6 @@ async def lifespan(app: Starlette):
576
  else: logger.info("PTB application not initialized or failed.")
577
  logger.info("ASGI Lifespan: Shutdown complete.")
578
 
579
-
580
  # --- Starlette Route Handlers ---
581
  # (health_check and telegram_webhook remain the same)
582
  async def health_check(request: Request) -> PlainTextResponse:
@@ -601,7 +614,6 @@ async def telegram_webhook(request: Request) -> Response:
601
  except json.JSONDecodeError: logger.error("Webhook invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
602
  except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # OK despite error
603
 
604
-
605
  # --- Create Starlette ASGI Application ---
606
  # (app definition remains the same)
607
  app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )
 
1
+ # main.py (Updating Apify Actor ID format, simplifying payload, explicit read timeout)
2
  import os
3
  import re
4
  import logging
 
71
  WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
72
 
73
  OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free")
74
+ # *** FIX: Try actor ID with '/' instead of '~' ***
75
  APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "pocesar/youtube-scraper")
76
 
77
  if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
 
84
 
85
  logger.info("Secret loading and configuration check finished.")
86
  logger.info(f"Using OpenRouter Model: {OPENROUTER_MODEL}")
87
+ logger.info(f"Using Apify Actor (via REST): {APIFY_ACTOR_ID}")
88
 
89
  _apify_token_exists = bool(APIFY_API_TOKEN)
90
 
 
160
  return None
161
  except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
162
 
163
+
164
  async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
165
  """Fetches YouTube transcript using Apify REST API (async start + poll + dataset fetch)."""
166
+ global APIFY_ACTOR_ID
167
  if not video_url: logger.error("[Apify Async] No video_url provided"); return None
168
  if not api_token: logger.error("[Apify Async] API token missing."); return None
169
  logger.info(f"[Apify Async] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
170
 
171
+ # Use the standard /runs endpoint to start
172
  start_run_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/runs"
173
  params_base = {"token": api_token}
174
+ # *** FIX: Simplified payload, ensure it matches pocesar/youtube-scraper expectations ***
 
175
  payload = {
176
+ "startUrls": [{"url": video_url}],
177
+ # "proxyConfiguration": {"useApifyProxy": True}, # Keep proxy if needed
178
+ # Remove other less critical options for simplicity during debugging
179
+ # "subtitles": True, # Might be implicitly true or named differently
180
+ # "maxResults": 1
 
181
  }
182
  headers = {"Content-Type": "application/json"}
183
 
 
189
  try:
190
  async with httpx.AsyncClient(timeout=30.0) as client:
191
  # 1. Start the run
192
+ # *** FIX: Log exact URL, headers (partial), and payload before sending ***
193
+ log_headers = {k: v for k, v in headers.items() if k.lower() != 'authorization'} # Don't log token implicitly
194
+ logger.debug(f"[Apify Async] POST Request Details:\nURL: {start_run_endpoint}\nParams: {params_base}\nHeaders: {log_headers}\nPayload: {json.dumps(payload)}")
195
  response_start = await client.post(start_run_endpoint, headers=headers, params=params_base, json=payload)
196
  logger.debug(f"[Apify Async] Start run status: {response_start.status_code}")
197
 
 
204
  logger.info(f"[Apify Async] Run started. Run ID: {run_id}, Dataset ID: {dataset_id}")
205
  except Exception as e: logger.error(f"[Apify Async] Error parsing start run response: {e}. Response: {response_start.text[:200]}", exc_info=True); return None
206
  else:
207
+ error_info = ""; try: error_info = response_start.json().get("error", {}).get("message", "") except Exception: pass
 
 
 
208
  logger.error(f"[Apify Async] Failed to start run. Status: {response_start.status_code}. Error: {error_info} Resp: {response_start.text[:200]}")
209
  return None
210
 
 
228
  # 3. Fetch dataset items
229
  logger.info(f"[Apify Async] Run {run_id} succeeded. Fetching items from dataset {dataset_id}")
230
  dataset_endpoint = f"https://api.apify.com/v2/datasets/{dataset_id}/items"
231
+ params_dataset = {"token": api_token, "format": "json", "limit": 5} # Limit items fetched
232
  response_dataset = await client.get(dataset_endpoint, params=params_dataset)
233
  logger.debug(f"[Apify Async] Dataset fetch status: {response_dataset.status_code}")
234
  response_dataset.raise_for_status()
 
236
  results = response_dataset.json()
237
  if isinstance(results, list) and len(results) > 0:
238
  item = results[0]
239
+ # Try different keys based on pocesar/youtube-scraper output possibilities
240
+ content = item.get("subtitles") # Primary key for this actor usually
241
+ if not content: content = item.get("text")
242
+ if not content: content = item.get("transcript")
243
+
244
+ if isinstance(content, list): # Handle if it returns list of lines/segments
245
+ logger.info("[Apify Async] Processing list format from subtitles.")
246
+ # Check if list of dicts with 'text' key
247
+ if len(content) > 0 and isinstance(content[0], dict) and 'text' in content[0]:
248
+ content = " ".join(line.get("text", "") for line in content if line.get("text"))
249
+ # Check if list of simple strings
250
+ elif len(content) > 0 and isinstance(content[0], str):
251
+ content = " ".join(content)
252
+ else: # Unknown list format
253
+ content = None
254
+
255
  if content and isinstance(content, str):
256
  logger.info(f"[Apify Async] Success via ASYNC REST for {video_url}. Length: {len(content)}")
257
  return content.strip()
258
+ else: logger.warning(f"[Apify Async] Dataset item found but transcript empty/invalid format for {video_url}. Item: {item}"); return None
259
  else: logger.warning(f"[Apify Async] Dataset {dataset_id} was empty for {video_url}. Response: {results}"); return None
260
 
261
  except httpx.TimeoutException as e: logger.error(f"[Apify Async] Timeout during API interaction for {video_url}: {e}"); return None
 
263
  except httpx.RequestError as e: logger.error(f"[Apify Async] Request error during API interaction for {video_url}: {e}"); return None
264
  except Exception as e: logger.error(f"[Apify Async] Unexpected error during Apify Async REST call for {video_url}: {e}", exc_info=True); return None
265
 
266
+ # (get_youtube_transcript, get_website_content, get_website_content_via_api remain the same)
267
  async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
268
  global SUPADATA_API_KEY, APIFY_API_TOKEN
269
  if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
 
345
  except httpx.RequestError as e: logger.error(f"[Fallback Web API] Request error connecting to urltotext.com API for {url}: {e}"); return None
346
  except Exception as e: logger.error(f"[Fallback Web API] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
347
 
348
+
349
+ # --- Summarization Function ---
350
  async def generate_summary(text: str, summary_type: str) -> str:
351
  global OPENROUTER_API_KEY, OPENROUTER_MODEL
352
  logger.info(f"Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
 
357
  if len(text) > MAX_INPUT_LENGTH: logger.warning(f"Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH}). Truncating."); text = text[:MAX_INPUT_LENGTH] + "... (Content truncated)"
358
  full_prompt = f"{prompt}\n\n{text}"
359
  headers = { "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json" }; payload = { "model": OPENROUTER_MODEL, "messages": [{"role": "user", "content": full_prompt}] }; openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
360
+
361
+ # *** FIX: Define explicit per-phase timeouts using an httpx.Timeout object ***
362
+ api_timeouts = httpx.Timeout(15.0, read=180.0, write=15.0, pool=60.0) # Connect, Read, Write, Pool
363
+ response = None
364
+
365
  try:
366
+ # *** FIX: Pass timeouts object to AsyncClient ***
367
+ async with httpx.AsyncClient(timeout=api_timeouts) as client:
368
+ logger.info(f"Sending request to OpenRouter ({OPENROUTER_MODEL}) with read timeout {api_timeouts.read}s...")
369
  response = await client.post(openrouter_api_endpoint, headers=headers, json=payload)
370
  if response: logger.info(f"Received response from OpenRouter. Status code: {response.status_code}")
371
  else: logger.error("No response received from OpenRouter after await completed (unexpected)."); return "Sorry, communication with the AI service failed unexpectedly."
372
+ # Process response (status checks etc.)
373
  if response.status_code == 200:
374
  try:
375
  data = response.json()
 
388
  elif response.status_code == 429: logger.warning("OpenRouter Rate Limit Exceeded (429)."); return "Sorry, AI model is busy. Try again."
389
  elif response.status_code == 500: logger.error(f"OpenRouter Internal Server Error (500). Resp:{response.text[:500]}"); return "Sorry, AI service internal error."
390
  else: logger.error(f"Unexpected status {response.status_code} from OpenRouter. Resp:{response.text[:500]}"); return f"Sorry, AI service returned unexpected status ({response.status_code})."
391
+ except httpx.ReadTimeout: # Catch specific ReadTimeout
392
+ logger.error(f"Read Timeout error ({api_timeouts.read}s) waiting for OpenRouter API response.")
393
+ return f"Sorry, the request to the AI model timed out after {api_timeouts.read} seconds while waiting for a response. The content might be too long or the service busy. Please try again later or with shorter content."
394
+ except httpx.TimeoutException as e: # Catch other timeouts (connect, write, pool)
395
+ logger.error(f"Timeout error ({type(e)}) connecting to/writing to OpenRouter API: {e}")
396
+ return "Sorry, the request to the AI model timed out. Please try again."
397
  except httpx.RequestError as e: logger.error(f"Request error connecting to OpenRouter API: {e}"); return "Sorry, there was an error connecting to the AI model service."
398
  except Exception as e:
399
  logger.error(f"Unexpected error in generate_summary (OpenRouter request phase): {e}", exc_info=True)
400
  if response: logger.error(f"--> Last response status before error: {response.status_code}")
401
  return "Sorry, an unexpected error occurred while trying to generate the summary."
402
 
403
+
404
  # --- Background Task Processing ---
405
  # (process_summary_task remains the same)
406
  async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
 
534
  logger.error("Exception while handling an update:", exc_info=context.error)
535
 
536
  # --- Bot Setup ---
537
+ # (setup_bot_config remains the same)
538
  async def setup_bot_config() -> Application:
539
  logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
540
  if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
 
590
  else: logger.info("PTB application not initialized or failed.")
591
  logger.info("ASGI Lifespan: Shutdown complete.")
592
 
 
593
  # --- Starlette Route Handlers ---
594
  # (health_check and telegram_webhook remain the same)
595
  async def health_check(request: Request) -> PlainTextResponse:
 
614
  except json.JSONDecodeError: logger.error("Webhook invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
615
  except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # OK despite error
616
 
 
617
  # --- Create Starlette ASGI Application ---
618
  # (app definition remains the same)
619
  app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )