fmab777 committed on
Commit
019a608
·
verified ·
1 Parent(s): 4629ea6

Fix Apify status

Browse files
Files changed (1) hide show
  1. main.py +162 -531
main.py CHANGED
@@ -1,4 +1,4 @@
1
- # main.py (Revised for Hugging Face - Fix Event Loop Issue)
2
  import os
3
  import re
4
  import logging
@@ -29,10 +29,6 @@ if _apify_token_exists:
29
  else:
30
  ApifyClient = None
31
 
32
- # NO nest_asyncio needed here usually when using native async framework integration
33
- # import nest_asyncio
34
- # nest_asyncio.apply()
35
-
36
  # --- Logging Setup ---
37
  logging.basicConfig(
38
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
@@ -42,14 +38,13 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
42
  if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
43
  logging.getLogger("telegram.ext").setLevel(logging.DEBUG)
44
  logging.getLogger('telegram.bot').setLevel(logging.DEBUG)
45
- logging.getLogger("urllib3").setLevel(logging.INFO)
46
  logging.getLogger('gunicorn.error').setLevel(logging.INFO)
47
  logger = logging.getLogger(__name__)
48
  logger.info("Logging configured (DEBUG level).")
49
 
50
  # --- Environment Variable Loading ---
51
  logger.info("Attempting to load secrets from environment variables...")
52
- # (Keep the get_secret function and secret loading as before)
53
  def get_secret(secret_name):
54
  logger.debug(f"Attempting to read secret: {secret_name}")
55
  value = os.environ.get(secret_name)
@@ -65,8 +60,7 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
65
  logger.info("Secret loading attempt finished.")
66
 
67
  # --- Bot Logic Functions ---
68
- # [PASTE ALL YOUR BOT LOGIC FUNCTIONS HERE - FROM is_youtube_url to generate_summary]
69
- # --- [ Ensure all functions from previous main.py are here ] ---
70
  # Helper Functions
71
  def is_youtube_url(url):
72
  """Checks if the URL is a valid YouTube video or shorts URL."""
@@ -86,16 +80,20 @@ def extract_youtube_id(url):
86
  async def get_transcript_via_supadata(video_id: str, api_key: str):
87
  """Fetches YouTube transcript via Supadata API."""
88
  if not video_id: logger.error("[Supadata] get_transcript_via_supadata called with no video_id"); return None
89
- if not api_key: logger.error("[Supadata] API key is missing."); return None # Already checked before calling
90
  logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}")
91
  api_endpoint = f"https://api.supadata.net/v1/youtube/transcript"
92
  params = {"videoId": video_id, "format": "text"}
93
  headers = {"X-API-Key": api_key}
94
  try:
95
- # Use asyncio.to_thread to run blocking requests.get in a separate thread
96
- response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30)
 
 
 
97
  logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
98
  if response.status_code == 200:
 
99
  try:
100
  data = response.json()
101
  content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
@@ -105,7 +103,7 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
105
  else:
106
  logger.warning(f"[Supadata] API success but content empty/invalid for {video_id}. Response: {data}")
107
  return None
108
- except json.JSONDecodeError: # Handle cases where API might return plain text on success
109
  if response.text:
110
  logger.info(f"[Supadata] Successfully fetched transcript (plain text) for {video_id}. Length: {len(response.text)}")
111
  return response.text.strip()
@@ -117,7 +115,7 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
117
  return None
118
  elif response.status_code in [401, 403]:
119
  logger.error(f"[Supadata] Authentication error ({response.status_code}). Check API key.")
120
- return None # Don't retry if key is bad
121
  elif response.status_code == 404:
122
  logger.warning(f"[Supadata] Transcript not found ({response.status_code}) for {video_id}.")
123
  return None
@@ -129,6 +127,9 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
129
  return None
130
  except requests.exceptions.RequestException as e:
131
  logger.error(f"[Supadata] Request error connecting to API for {video_id}: {e}")
 
 
 
132
  return None
133
  except Exception as e:
134
  logger.error(f"[Supadata] Unexpected error during API call for {video_id}: {e}", exc_info=True)
@@ -138,7 +139,7 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
138
  async def get_transcript_via_apify(video_url: str, api_token: str):
139
  """Fetches YouTube transcript via Apify API."""
140
  if not video_url: logger.error("[Apify] get_transcript_via_apify called with no video_url"); return None
141
- if not api_token: logger.error("[Apify] API token is missing."); return None # Already checked
142
  if not ApifyClient: logger.error("[Apify] ApifyClient not available/imported."); return None
143
 
144
  logger.info(f"[Apify] Attempting fetch for URL: {video_url}")
@@ -157,39 +158,42 @@ async def get_transcript_via_apify(video_url: str, api_token: str):
157
  headers = {"Content-Type": "application/json"}
158
  try:
159
  logger.debug(f"[Apify] Sending request to run actor {actor_id} synchronously for {video_url}")
160
- response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, params=params, data=payload, timeout=90) # Longer timeout for actor run
161
  logger.debug(f"[Apify] Received status code {response.status_code} for {video_url}")
162
- if response.status_code == 200:
 
 
 
163
  try:
164
  results = response.json()
165
  if isinstance(results, list) and len(results) > 0:
166
  item = results[0]
167
  content = item.get("text") or item.get("transcript") or item.get("captions_concatenated")
168
- if not content and item.get("captions") and isinstance(item["captions"], list): # Handle 'captions' format if primary keys fail
169
  logger.info("[Apify] Processing 'captions' format.")
170
  content = " ".join(cap.get("text", "") for cap in item["captions"] if cap.get("text"))
171
  if content and isinstance(content, str):
172
- logger.info(f"[Apify] Successfully fetched transcript for {video_url}. Length: {len(content)}")
173
  return content.strip()
174
  else:
175
- logger.warning(f"[Apify] Actor run successful but transcript content not found/empty in result for {video_url}. Result item: {item}")
176
  return None
177
  else:
178
- logger.warning(f"[Apify] Actor run successful but dataset was empty for {video_url}. Response: {results}")
179
  return None
180
  except json.JSONDecodeError:
181
- logger.error(f"[Apify] Failed to decode JSON response for {video_url}. Status: {response.status_code}. Response text: {response.text[:200]}...")
182
  return None
183
  except Exception as e:
184
- logger.error(f"[Apify] Error processing successful response for {video_url}: {e}", exc_info=True)
185
  return None
186
  elif response.status_code == 400:
187
- logger.error(f"[Apify] Bad Request (400) for {video_url}. Check input payload. Response: {response.text[:200]}...")
188
  return None
189
  elif response.status_code == 401:
190
  logger.error("[Apify] Authentication error (401). Check API token.")
191
- return None # Don't retry if token is bad
192
- else:
193
  logger.error(f"[Apify] Unexpected status code {response.status_code} for {video_url}. Response: {response.text[:200]}...")
194
  return None
195
  except requests.exceptions.Timeout:
@@ -202,6 +206,12 @@ async def get_transcript_via_apify(video_url: str, api_token: str):
202
  logger.error(f"[Apify] Unexpected error during Apify call for {video_url}: {e}", exc_info=True)
203
  return None
204
 
 
 
 
 
 
 
205
  # Combined YouTube Transcript Function (with Fallbacks)
206
  async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: str | None, apify_token: str | None):
207
  """Fetches YouTube transcript using library, then Supadata, then Apify."""
@@ -212,28 +222,28 @@ async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: st
212
  # 1. Primary Method: youtube-transcript-api
213
  logger.info("[Primary YT] Attempting youtube-transcript-api...")
214
  try:
215
- # Run synchronous library call in a thread
216
  transcript_list = await asyncio.to_thread(
217
  YouTubeTranscriptApi.get_transcript,
218
  video_id,
219
- languages=['en', 'en-GB', 'en-US'] # Prioritize English variations
220
  )
221
  if transcript_list:
222
  transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
223
  if transcript_text:
224
  logger.info(f"[Primary YT] Successfully fetched transcript via library for {video_id} (length: {len(transcript_text)})")
225
- return transcript_text # Return immediately on success
226
  else:
227
  logger.warning(f"[Primary YT] Joined transcript text is empty for {video_id}")
228
- transcript_text = None # Ensure it's None if empty after join
229
  else:
230
  logger.warning(f"[Primary YT] Transcript list empty for {video_id}")
231
  transcript_text = None
232
  except Exception as e:
233
  logger.warning(f"[Primary YT] Error getting transcript via library for {video_id}: {e}")
234
- if "No transcript found" in str(e): logger.warning(f"[Primary YT] No transcript found for {video_id}. May be unavailable/private.")
 
235
  elif "disabled" in str(e): logger.warning(f"[Primary YT] Transcripts disabled for {video_id}.")
236
- transcript_text = None # Ensure it's None on error
237
 
238
  # 2. Fallback 1: Supadata API
239
  if transcript_text is None:
@@ -242,7 +252,7 @@ async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: st
242
  transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
243
  if transcript_text:
244
  logger.info(f"[Fallback YT 1] Successfully fetched transcript via Supadata for {video_id}")
245
- return transcript_text # Return on success
246
  else:
247
  logger.warning(f"[Fallback YT 1] Supadata API failed or returned no content for {video_id}.")
248
  else:
@@ -255,7 +265,7 @@ async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: st
255
  transcript_text = await get_transcript_via_apify(video_url, apify_token)
256
  if transcript_text:
257
  logger.info(f"[Fallback YT 2] Successfully fetched transcript via Apify for {video_url}")
258
- return transcript_text # Return on success
259
  else:
260
  logger.warning(f"[Fallback YT 2] Apify API failed or returned no content for {video_url}.")
261
  else:
@@ -266,7 +276,6 @@ async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: st
266
  logger.error(f"All methods failed to fetch transcript for video ID: {video_id}")
267
  return None
268
 
269
- # Should not be reached if logic above is correct, but as a safeguard
270
  return transcript_text
271
 
272
  # Website Content via Requests/BS4 (Primary Method for Simplified Bot)
@@ -276,93 +285,45 @@ async def get_website_content_via_requests(url):
276
  logger.info(f"[Web Scraper - Requests/BS4] Fetching website content for: {url}")
277
  try:
278
  headers = {
279
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', # Updated UA
280
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
281
  'Accept-Language': 'en-US,en;q=0.9',
282
- 'Connection': 'keep-alive',
283
- 'DNT': '1', # Do Not Track
284
- 'Upgrade-Insecure-Requests': '1'
285
- }
286
  logger.debug(f"[Web Scraper - Requests/BS4] Sending request to {url}")
287
- # Run blocking I/O in a separate thread
288
  response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
289
- response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
290
  logger.debug(f"[Web Scraper - Requests/BS4] Received response {response.status_code} from {url}")
291
-
292
  content_type = response.headers.get('content-type', '').lower()
293
  if 'html' not in content_type:
294
  logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received from {url}: {content_type}")
295
- return None # Don't try to parse non-html
296
-
297
- # Use html.parser, it's built-in
298
  soup = BeautifulSoup(response.text, 'html.parser')
299
-
300
- # Remove common unwanted tags more aggressively
301
- for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]):
302
- element.extract()
303
-
304
- # Try finding common main content containers
305
- main_content = soup.find('main') or \
306
- soup.find('article') or \
307
- soup.find(id='content') or \
308
- soup.find(class_='content') or \
309
- soup.find(id='main-content') or \
310
- soup.find(class_='main-content') or \
311
- soup.find(role='main')
312
-
313
- # Fallback to body if no specific container found
314
  target_element = main_content if main_content else soup.body
315
-
316
  if not target_element:
317
  logger.warning(f"[Web Scraper - Requests/BS4] Could not find body or main content container for parsing {url}")
318
- return None # Nothing to parse
319
-
320
- # Get text, joining lines smartly
321
  lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
322
- text = "\n".join(lines) # Join with newlines to preserve some structure
323
-
324
- # Basic length check
325
- if not text or len(text) < 50: # Arbitrary short length check
326
  logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is very short or empty after cleaning for {url} (Length: {len(text)})")
327
- # Consider returning None if too short, depends on use case
328
- # return None
329
-
330
  logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped content for {url} (final length: {len(text)})")
331
  return text
332
-
333
- except requests.exceptions.Timeout:
334
- logger.error(f"[Web Scraper - Requests/BS4] Timeout error scraping website: {url}")
335
- return None
336
- except requests.exceptions.TooManyRedirects:
337
- logger.error(f"[Web Scraper - Requests/BS4] Too many redirects error scraping website: {url}")
338
- return None
339
- except requests.exceptions.RequestException as e:
340
- # This catches ConnectTimeout, HTTPError, ConnectionError etc.
341
- logger.error(f"[Web Scraper - Requests/BS4] Request error scraping website {url}: {e}")
342
- return None
343
- except Exception as e:
344
- # Catch-all for unexpected errors during parsing etc.
345
- logger.error(f"[Web Scraper - Requests/BS4] Error scraping or parsing website {url}: {e}", exc_info=True)
346
- return None
347
 
348
  # Website Content via URLToText API (Fallback Method)
349
  async def get_website_content_via_urltotext_api(url: str, api_key: str):
350
  """Fetches website content using the URLToText API (Fallback)."""
351
  if not url: logger.error("[Web Scraper - URLToText API] called with no URL"); return None
352
- if not api_key: logger.error("[Web Scraper - URLToText API] API key is missing."); return None # Already checked
353
  logger.info(f"[Web Scraper - URLToText API] Attempting to fetch content for: {url}")
354
  api_endpoint = "https://urltotext.com/api/v1/urltotext/"
355
- payload = json.dumps({
356
- "url": url,
357
- "output_format": "text",
358
- "extract_main_content": True,
359
- "render_javascript": True, # Often needed for modern sites
360
- "residential_proxy": False, # Start with standard
361
- })
362
- headers = {
363
- "Authorization": f"Token {api_key}",
364
- "Content-Type": "application/json"
365
- }
366
  try:
367
  logger.debug(f"[Web Scraper - URLToText API] Sending request for {url}")
368
  response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=45)
@@ -374,490 +335,160 @@ async def get_website_content_via_urltotext_api(url: str, api_key: str):
374
  credits = data.get("credits_used", "N/A")
375
  warning = data.get("data", {}).get("warning")
376
  if warning: logger.warning(f"[Web Scraper - URLToText API] Warning for {url}: {warning}")
377
- if content:
378
- logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API for {url}. Length: {len(content)}. Credits: {credits}")
379
- return content.strip()
380
- else:
381
- logger.warning(f"[Web Scraper - URLToText API] API returned success but content was empty for {url}. Response: {data}")
382
- return None
383
- except json.JSONDecodeError:
384
- logger.error(f"[Web Scraper - URLToText API] Failed to decode JSON response for {url}. Response: {response.text[:500]}...")
385
- return None
386
- except Exception as e:
387
- logger.error(f"[Web Scraper - URLToText API] Error processing successful API response for {url}: {e}", exc_info=True)
388
- return None
389
- elif response.status_code in [400, 402, 422, 500]: # Known client/server errors
390
- logger.error(f"[Web Scraper - URLToText API] Error {response.status_code} from API for {url}. Response: {response.text[:200]}...")
391
- return None
392
- else: # Other unexpected codes
393
- logger.error(f"[Web Scraper - URLToText API] Unexpected status {response.status_code} from API for {url}. Response: {response.text[:200]}...")
394
- return None
395
- except requests.exceptions.Timeout:
396
- logger.error(f"[Web Scraper - URLToText API] Timeout error connecting to API for {url}")
397
- return None
398
- except requests.exceptions.RequestException as e:
399
- logger.error(f"[Web Scraper - URLToText API] Request error connecting to API for {url}: {e}")
400
- return None
401
- except Exception as e:
402
- logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call for {url}: {e}", exc_info=True)
403
- return None
404
 
405
  # DeepSeek Summary Function (via OpenRouter)
406
  async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
407
  """Generates summary using DeepSeek via OpenRouter API."""
408
  logger.info(f"Generating {summary_type} summary using DeepSeek/OpenRouter. Input text length: {len(text)}")
409
- if not api_key:
410
- logger.error("OpenRouter API key was not provided to generate_summary.")
411
- return "Error: AI model configuration key (OpenRouter) is missing."
412
-
413
  openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
414
- # Check OpenRouter docs for the latest recommended free/low-cost models
415
  model_name = "deepseek/deepseek-chat:free"
416
-
417
- if summary_type == "paragraph":
418
- prompt = "You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be: • Clear and simple language suitable for someone unfamiliar with the topic. • Uses British English spellings throughout. • Straightforward and understandable vocabulary; avoid complex terms. • Presented as ONE SINGLE PARAGRAPH. • No more than 85 words maximum; but does not have to be exactly 85. • Considers the entire text content equally. • Uses semicolons (;) instead of em dashes (– or —). Here is the text to summarise:"
419
- else: # points summary
420
- prompt = """You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this Markdown format:
421
- • For each distinct topic or section identified in the text, create a heading.
422
- • Each heading MUST be enclosed in double asterisks for bolding (e.g., **Section Title**).
423
- • Immediately following each heading, list the key points as a bulleted list.
424
- • Each bullet point MUST start with a hyphen and a space (`- `) on a new line.
425
- • The text within each bullet point should NOT contain any bold formatting.
426
- • Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.
427
- • Use British English spellings throughout.
428
- • Avoid overly complex or advanced vocabulary.
429
- • Keep bullet points concise.
430
- • Ensure the entire summary takes no more than two minutes to read.
431
- • Consider the entire text's content, not just the beginning or a few topics.
432
- • Use semicolons (;) instead of em dashes (– or —).
433
-
434
- Here is the text to summarise:"""
435
-
436
- MAX_INPUT_LENGTH = 500000 # Truncate long inputs to avoid high costs/errors
437
- if len(text) > MAX_INPUT_LENGTH:
438
- logger.warning(f"Input text length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH}). Truncating.")
439
- text = text[:MAX_INPUT_LENGTH] + "... (Content truncated)"
440
  full_prompt = f"{prompt}\n\n{text}"
441
-
442
- headers = {
443
- "Authorization": f"Bearer {api_key}",
444
- "Content-Type": "application/json",
445
- # Recommended headers for OpenRouter identification
446
- "HTTP-Referer": "https://huggingface.co/spaces/", # Identify source as HF Space
447
- "X-Title": "Telegram Summary Bot (HF Space)", # Identify app
448
- }
449
- payload = json.dumps({
450
- "model": model_name,
451
- "messages": [
452
- {"role": "user", "content": full_prompt}
453
- ],
454
- # Optional: Add max_tokens if needed, check model defaults
455
- # "max_tokens": 1024,
456
- })
457
-
458
  try:
459
  logger.debug(f"Sending request to OpenRouter ({model_name})...")
460
- # Run blocking request in thread
461
  response = await asyncio.to_thread(requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=60)
462
  logger.debug(f"Received status code {response.status_code} from OpenRouter.")
463
-
464
  if response.status_code == 200:
465
  try:
466
  data = response.json()
467
- if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0:
468
  message = data["choices"][0].get("message")
469
- if message and isinstance(message, dict):
470
- summary = message.get("content")
471
- if summary:
472
- logger.info(f"Successfully generated summary via OpenRouter. Output length: {len(summary)}")
473
- return summary.strip()
474
- else:
475
- logger.warning(f"OpenRouter response successful, but content was empty. Response: {data}")
476
- return "Sorry, the AI model returned an empty summary."
477
- else:
478
- logger.error(f"Unexpected message structure in OpenRouter response: {message}. Full response: {data}")
479
- return "Sorry, could not parse the AI model's response (unexpected format)."
480
  else:
481
- # Handle cases like moderation flags, empty choices list
482
  if data.get("error"): logger.error(f"OpenRouter API Error: {data['error']}")
483
- else: logger.error(f"Unexpected choices structure in OpenRouter response: {data.get('choices')}. Full response: {data}")
484
- return "Sorry, could not parse the AI model's response (choices missing/invalid or API error)."
485
-
486
- except json.JSONDecodeError:
487
- logger.error(f"Failed to decode JSON response from OpenRouter. Status: {response.status_code}. Response text: {response.text[:500]}...")
488
- return "Sorry, failed to understand the response from the AI model."
489
- except Exception as e:
490
- logger.error(f"Error processing successful OpenRouter response: {e}", exc_info=True)
491
- return "Sorry, an error occurred while processing the AI model's response."
492
-
493
- elif response.status_code == 401:
494
- logger.error("OpenRouter API key is invalid (401 Unauthorized). Check HF Space Secrets.")
495
- return "Error: The AI model configuration key (OpenRouter) is invalid."
496
- elif response.status_code == 402:
497
- logger.error("OpenRouter Payment Required (402). Check credits/limits on OpenRouter.")
498
- return "Sorry, there might be an issue with the AI model service limits or payment. Please try again later or check OpenRouter account."
499
- elif response.status_code == 429:
500
- logger.warning("OpenRouter Rate Limit Exceeded (429).")
501
- return "Sorry, the AI model is currently busy due to high demand. Please try again in a moment."
502
- elif response.status_code == 500:
503
- logger.error(f"OpenRouter Internal Server Error (500). Response: {response.text[:500]}...")
504
- return "Sorry, the AI model service encountered an internal error. Please try again later."
505
  else:
506
- # Handle other potential errors (e.g., 400 Bad Request, 404 Not Found for model)
507
- logger.error(f"Unexpected status code {response.status_code} from OpenRouter. Response: {response.text[:500]}...")
508
- try: # Try to parse error message from response body
509
- error_data = response.json()
510
- error_msg = error_data.get("error", {}).get("message", response.text[:100])
511
- return f"Sorry, the AI model service returned an error ({response.status_code}): {error_msg}"
512
- except: # Fallback if parsing fails
513
- return f"Sorry, the AI model service returned an unexpected status ({response.status_code})."
514
-
515
- except requests.exceptions.Timeout:
516
- logger.error("Timeout error connecting to OpenRouter API.")
517
- return "Sorry, the request to the AI model timed out. Please try again."
518
- except requests.exceptions.RequestException as e:
519
- logger.error(f"Request error connecting to OpenRouter API: {e}")
520
- return "Sorry, there was an error connecting to the AI model service."
521
- except Exception as e:
522
- logger.error(f"Unexpected error in generate_summary (OpenRouter): {e}", exc_info=True)
523
- return "Sorry, an unexpected error occurred while generating the summary."
524
-
525
-
526
-
527
- # --- Telegram Bot Handlers (Command, Message, CallbackQuery) ---
528
-
529
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
530
- """Sends a welcome message when the /start command is issued."""
531
- user = update.effective_user
532
- logger.info(f"User {user.id} ({user.username or 'NoUsername'}) used /start.")
533
  mention = user.mention_html() if user.username else user.first_name
534
- await update.message.reply_html(
535
- f"👋 Hello {mention}! I can summarize YouTube links or website URLs.\n\n"
536
- "Just send me a link anytime!",
537
- # Optional: disable_web_page_preview=True
538
- )
539
-
540
  async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
541
- """Sends a help message when the /help command is issued."""
542
  logger.info(f"User {update.effective_user.id} used /help.")
543
- await update.message.reply_text(
544
- "🔍 **How to use this bot:**\n\n"
545
- "1. Send me any YouTube video link or website URL.\n"
546
- "2. I'll ask how you want it summarized (paragraph or points).\n"
547
- "3. Click the button for your choice.\n"
548
- "4. Wait for the summary!\n\n"
549
- "I use multiple methods if the first fails (especially for YT transcripts & website content).\n\n"
550
- "**Commands:**\n"
551
- "/start - Display welcome message\n"
552
- "/help - Show this help message",
553
- parse_mode=ParseMode.MARKDOWN
554
- )
555
-
556
  async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
557
- """Handles text messages, checks for URLs, and asks for summary type."""
558
- if not update.message or not update.message.text: return # Ignore empty messages
559
- url = update.message.text.strip()
560
- user = update.effective_user
561
  logger.info(f"User {user.id} ({user.username or 'NoUsername'}) sent potential URL: {url}")
562
-
563
- # Basic URL validation
564
- if not (url.startswith('http://') or url.startswith('https://')) or '.' not in url[8:]:
565
- logger.debug(f"Ignoring non-URL message from user {user.id}: {url}")
566
- # Optional: Reply if you want to guide the user
567
- # await update.message.reply_text("Please send a valid URL starting with http:// or https://")
568
- return
569
-
570
- # Store URL in user_data (simple state management)
571
- context.user_data['url_to_summarize'] = url
572
- logger.debug(f"Stored URL '{url}' for user {user.id} in user_data")
573
-
574
- # Ask for summary type with Inline Keyboard
575
- keyboard = [
576
- [
577
- InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"),
578
- InlineKeyboardButton("Points Summary", callback_data="points")
579
- ]
580
- ]
581
  reply_markup = InlineKeyboardMarkup(keyboard)
582
- await update.message.reply_text(
583
- f"Okay, I see this link:\n{url}\n\nHow would you like it summarized?",
584
- reply_markup=reply_markup,
585
- disable_web_page_preview=True
586
- )
587
-
588
  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
589
- """Handles button presses for summary type selection."""
590
- query = update.callback_query
591
- if not query: return
592
- # --- Acknowledge callback query ---
 
 
 
 
 
 
 
 
 
593
  try:
594
- await query.answer() # Acknowledge button press immediately
595
- logger.debug(f"Callback query {query.id} answered.")
596
- except Exception as e:
597
- logger.error(f"Failed to answer callback query {query.id}: {e}", exc_info=True)
598
- # Proceed anyway, but logging the error is important
599
-
600
- summary_type = query.data
601
- user = update.effective_user or query.from_user # Get user info
602
- url = context.user_data.get('url_to_summarize', None) # Retrieve stored URL
603
-
604
- logger.info(f"User {user.id} chose '{summary_type}' summary. Checking for URL '{url}' in context.")
605
-
606
- # Check if URL is still in context (it might expire or be lost)
607
- if not url:
608
- logger.warning(f"User {user.id} pressed button, but NO URL found in user_data context.")
609
- try:
610
- # Edit the message where the button was, informing the user
611
- await query.edit_message_text(text="Sorry, I couldn't find the URL associated with this request. Please send the link again.")
612
- except Exception as edit_err:
613
- # If editing fails (e.g., message too old), log it but don't crash
614
- logger.warning(f"Failed to edit message for missing URL context: {edit_err}")
615
- # Maybe send a new message as a fallback? Depends on desired behavior.
616
- # await context.bot.send_message(chat_id=user.id, text="Sorry, context lost. Please send link again.")
617
- return # Stop processing if URL is missing
618
-
619
- # Clear the URL from context now that we're processing it
620
- context.user_data.pop('url_to_summarize', None)
621
- logger.debug(f"Retrieved and cleared URL {url} from user_data for user {user.id}")
622
-
623
- # --- Get API Keys (Read fresh from environment - cheap operation) ---
624
- logger.debug("Reading API keys from environment variables within handler...")
625
- current_openrouter_key = os.environ.get('OPENROUTER_API_KEY')
626
- current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
627
- current_supadata_key = os.environ.get('SUPADATA_API_KEY')
628
- current_apify_token = os.environ.get('APIFY_API_TOKEN')
629
- logger.debug(f"Keys read: OpenRouter={'Yes' if current_openrouter_key else 'No'}, URLToText={'Yes' if current_urltotext_key else 'No'}, Supadata={'Yes' if current_supadata_key else 'No'}, Apify={'Yes' if current_apify_token else 'No'}")
630
-
631
-
632
- # Check *essential* key for summarization
633
- if not current_openrouter_key:
634
- logger.error("OpenRouter API key missing in handler. Cannot generate summary.")
635
- # Inform user and clean up the button message
636
- await context.bot.send_message(chat_id=user.id, text="Error: AI model configuration key (OpenRouter) is missing. Cannot generate summary.")
637
- try: await query.delete_message() # Delete the message with buttons
638
- except Exception: pass
639
- return
640
-
641
- # --- Start Processing ---
642
- processing_message = f"Got it! Generating '{summary_type}' summary for:\n{url}\n\nThis might take a moment..."
643
- message_to_delete_later = None # In case editing fails
644
- try:
645
- # Edit the message to show processing status
646
- await query.edit_message_text(processing_message)
647
- logger.debug(f"Edited message for query {query.id} to show processing status.")
648
- except Exception as e:
649
- # If editing fails (e.g., message too old), send a new status message
650
- logger.warning(f"Could not edit original message for query {query.id}: {e}. Sending new status message.")
651
- try:
652
- message_to_delete_later = await context.bot.send_message(chat_id=user.id, text=processing_message)
653
- except Exception as send_err:
654
- # If even sending fails, log and give up on this request
655
- logger.error(f"Failed to send status message after edit failure: {send_err}")
656
- return
657
-
658
- content = None
659
- user_feedback_message = None # Stores error messages for the user
660
- success = False
661
- is_youtube = is_youtube_url(url)
662
-
663
- try:
664
- # Show "typing..." status in Telegram chat
665
- logger.debug(f"Sending 'typing' action for chat {user.id}")
666
- await context.bot.send_chat_action(chat_id=user.id, action='typing')
667
-
668
- # --- Content Fetching Logic ---
669
  if is_youtube:
670
  video_id = extract_youtube_id(url)
671
  if video_id:
672
- # Fetch YT transcript using the function with fallbacks
673
- logger.info(f"Fetching YouTube transcript for video_id: {video_id}")
674
- content = await get_youtube_transcript(
675
- video_id,
676
- url, # Pass full URL for Apify
677
- current_supadata_key,
678
- current_apify_token
679
- )
680
- # Set feedback message only if content fetching failed
681
- user_feedback_message = None if content else "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
682
- logger.info(f"YouTube transcript fetch completed. Content found: {bool(content)}")
683
- else:
684
- user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
685
- else: # Website Logic (Requests/BS4 -> URLToText API)
686
- logger.info(f"Attempting website scrape (Requests/BS4) for {url}")
687
- content = await get_website_content_via_requests(url)
688
-
689
- if content:
690
- logger.info("Primary website scraping (Requests/BS4) successful.")
691
- user_feedback_message = None
692
  else:
693
- logger.warning(f"Primary web scraping failed for {url}. Attempting fallback API (URLToText).")
694
  if current_urltotext_key:
695
- await context.bot.send_chat_action(chat_id=user.id, action='typing') # Show activity for fallback
696
- content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
697
- if content:
698
- user_feedback_message = None
699
- logger.info("Fallback URLToText API scraping successful.")
700
- else:
701
- user_feedback_message = "Sorry, I couldn't fetch content from that website using either method (blocked/inaccessible/empty?)."
702
- logger.error(f"Both primary (Requests/BS4) and fallback API failed for website {url}.")
703
- else:
704
- # Primary failed, and fallback key is missing
705
- user_feedback_message = "Sorry, I couldn't fetch content from that website (blocked/inaccessible/empty?). The fallback method is not configured."
706
- logger.warning(f"Primary web scraping failed for {url}, and fallback API key (URLToText) is missing.")
707
- # --- End Content Fetching ---
708
-
709
- # --- Generate Summary if Content was Fetched ---
710
  if content:
711
- logger.info("Content fetched successfully, proceeding to generate summary.")
712
- await context.bot.send_chat_action(chat_id=user.id, action='typing')
713
- # Pass the OpenRouter key to the summary function
714
  summary = await generate_summary(content, summary_type, current_openrouter_key)
715
-
716
- # Check if summary generation returned an error message
717
- if summary.startswith("Error:") or summary.startswith("Sorry,"):
718
- user_feedback_message = summary # Use the error from the summary function
719
- success = False
720
- logger.warning(f"Summary generation failed or returned error: {summary}")
721
- else:
722
- # Send the successful summary
723
- logger.info("Summary generated successfully. Sending response.")
724
- await context.bot.send_message(
725
- chat_id=user.id,
726
- text=summary,
727
- parse_mode=ParseMode.MARKDOWN,
728
- disable_web_page_preview=True
729
- )
730
- success = True
731
- user_feedback_message = None # Clear any previous failure message from fetching stage
732
- elif not user_feedback_message:
733
- # If content is None but no specific error message was set during fetching
734
- user_feedback_message = "Sorry, couldn't retrieve any content to summarize from the provided link."
735
- logger.warning(f"Content fetching resulted in None for {url}, but no specific user feedback message was set.")
736
-
737
- # --- Send Feedback if any step failed ---
738
- if user_feedback_message and not success:
739
- logger.warning(f"Sending failure feedback to user: {user_feedback_message}")
740
- await context.bot.send_message(chat_id=user.id, text=user_feedback_message)
741
-
742
- except Exception as e:
743
- # Catch unexpected errors during the whole process
744
- logger.error(f"Unexpected error during processing callback for {url}: {e}", exc_info=True)
745
- try:
746
- # Send a generic error message to the user
747
- await context.bot.send_message(chat_id=user.id, text="Oops! Something went really wrong while processing your request. Please try again later.")
748
- except Exception as final_err:
749
- # If even sending the error message fails... log it.
750
- logger.error(f"Failed to send final error message to user {user.id}: {final_err}")
751
  finally:
752
- # --- Cleanup ---
753
- # Delete the "Processing..." status message or the original message with buttons
754
- logger.debug("Cleaning up status message...")
755
  try:
756
- if message_to_delete_later: # If we sent a separate status message
757
- await context.bot.delete_message(chat_id=user.id, message_id=message_to_delete_later.message_id)
758
- logger.debug("Deleted separate status message.")
759
- elif query: # Otherwise, delete the original message with the buttons
760
- await query.delete_message()
761
- logger.debug(f"Deleted original message for query {query.id}.")
762
- except Exception as del_e:
763
- # Log if deletion fails, but don't let it stop anything
764
- logger.warning(f"Could not delete status/button message: {del_e}")
765
-
766
-
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Log the exception that was raised while an update was being handled.

    Args:
        update: The object that was being processed when the error occurred
            (not used here).
        context: Handler context; its ``error`` attribute holds the exception.
    """
    failure = context.error
    # Passing the exception object as exc_info makes the logger emit its traceback.
    logger.error(f"Exception while handling an update: {context.error}", exc_info=failure)
    # An optional developer notification could be added here if desired.
772
-
773
 
774
  # --- Bot Application Setup Function ---
async def setup_bot():
    """Build and initialize the PTB Application, then register its handlers.

    Returns:
        The initialized ``Application`` instance, or ``None`` when
        ``TELEGRAM_TOKEN`` is not set.
    """
    logger.info("Setting up Telegram Application...")
    if not TELEGRAM_TOKEN:
        logger.critical("Cannot initialize PTB Application: TELEGRAM_TOKEN not found.")
        return None

    bot_app = Application.builder().token(TELEGRAM_TOKEN).build()

    # The application is initialized explicitly here because updates are fed
    # to it manually from the webhook route rather than by a polling loop.
    logger.info("Running application.initialize()...")
    await bot_app.initialize()
    logger.info("Finished application.initialize().")

    update_handlers = (
        CommandHandler("start", start),
        CommandHandler("help", help_command),
        MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url),
        CallbackQueryHandler(handle_summary_type_callback),
    )
    for update_handler in update_handlers:
        bot_app.add_handler(update_handler)
    bot_app.add_error_handler(error_handler)
    logger.info("Telegram handlers registered.")

    return bot_app
799
-
# --- Run Setup and Store Application Instance ---
# Executed at import time; ptb_app is None when setup_bot() could not build
# the application (missing token).
# NOTE(review): asyncio.run() closes its event loop when it returns, while the
# webhook route later awaits ptb_app.process_update() — confirm that the
# application does not hold resources bound to the closed loop.
logger.info("Running bot setup...")
ptb_app = asyncio.run(setup_bot())
logger.info(f"Bot setup finished. Application instance: {'OK' if ptb_app else 'Failed'}")


# --- Flask App Setup ---
# Web server instance that the route decorators below attach to.
app = Flask(__name__)
logger.info("Flask app created.")
809
-
810
 
811
  # --- Webhook Endpoint ---
@app.route('/webhook', methods=['POST'])
async def webhook() -> Response:
    """Receive a Telegram update over the webhook and dispatch it to PTB.

    Returns:
        ``200 ok`` once the update has been processed; ``400`` when the
        request is not JSON or its body is not a usable JSON object; ``500``
        when the bot failed to initialize or processing raised.
    """
    logger.info("Webhook request received...")
    if not ptb_app:  # PTB initialization failed during setup
        logger.error("Telegram application not initialized. Cannot process update.")
        return Response('Bot not configured properly', status=500)

    if not request.is_json:
        logger.warning("Received non-JSON request to webhook endpoint.")
        return Response('Bad Request - Expected JSON', status=400)

    # Flask's get_json() raises BadRequest (not json.JSONDecodeError) on a
    # malformed body, which the generic handler below would turn into a 500.
    # silent=True returns None instead so the client gets the intended 400.
    update_data = request.get_json(silent=True)
    if not isinstance(update_data, dict) or not update_data:
        logger.error("Failed to decode JSON from Telegram webhook.")
        return Response('Bad Request - Invalid JSON', status=400)

    try:
        # Use PTB's built-in deserialization
        update = Update.de_json(update_data, ptb_app.bot)
        logger.debug(f"Processing update ID: {update.update_id}")

        # Hand the update to PTB's dispatcher; setup_bot() already awaited
        # application.initialize().
        logger.debug("Directly awaiting process_update...")
        await ptb_app.process_update(update)
        logger.debug("Finished awaiting process_update.")

        # Acknowledge receipt to Telegram
        return Response('ok', status=200)
    except Exception as e:
        # Errors that bubble up past PTB's own error handler
        logger.error(f"Error processing update in webhook handler (outside PTB handler?): {e}", exc_info=True)
        return Response('Internal Server Error', status=500)
845
-
@app.route('/')
def index():
    """Health check: report whether the PTB application was initialized."""
    logger.debug("Health check endpoint '/' accessed.")
    if ptb_app:
        bot_status = "PTB App Initialized"
    else:
        bot_status = "PTB App FAILED Initialization"
    return f"Hello! Telegram Bot Webhook Listener ({bot_status}) is running."
852
-
853
 
# --- Main Execution Block ---
# Runs only when the module is executed directly (e.g. local testing); it is
# usually not reached when the app is served by Gunicorn.
if __name__ == '__main__':
    if not ptb_app:
        logger.critical("Aborting Flask server start (local test?) because Telegram App failed initialization.")
    else:
        logger.info("Starting Flask web server directly (for local testing?)...")
        port = int(os.environ.get('PORT', 5000))
        # The server binds to every interface, so the interactive debugger must
        # not be on by default: it allows arbitrary code execution for anyone
        # who can reach the port. Opt in explicitly with FLASK_DEBUG=1.
        debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
        app.run(host='0.0.0.0', port=port, debug=debug_enabled)
 
1
+ # main.py (Revised: Apify 201 fix + Supadata verify=False TEST)
2
  import os
3
  import re
4
  import logging
 
29
  else:
30
  ApifyClient = None
31
 
 
 
 
 
32
  # --- Logging Setup ---
33
  logging.basicConfig(
34
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 
38
  if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
39
  logging.getLogger("telegram.ext").setLevel(logging.DEBUG)
40
  logging.getLogger('telegram.bot').setLevel(logging.DEBUG)
41
+ logging.getLogger("urllib3").setLevel(logging.INFO) # Reduce requests noise slightly
42
  logging.getLogger('gunicorn.error').setLevel(logging.INFO)
43
  logger = logging.getLogger(__name__)
44
  logger.info("Logging configured (DEBUG level).")
45
 
46
  # --- Environment Variable Loading ---
47
  logger.info("Attempting to load secrets from environment variables...")
 
48
  def get_secret(secret_name):
49
  logger.debug(f"Attempting to read secret: {secret_name}")
50
  value = os.environ.get(secret_name)
 
60
  logger.info("Secret loading attempt finished.")
61
 
62
  # --- Bot Logic Functions ---
63
+
 
64
  # Helper Functions
65
  def is_youtube_url(url):
66
  """Checks if the URL is a valid YouTube video or shorts URL."""
 
80
  async def get_transcript_via_supadata(video_id: str, api_key: str):
81
  """Fetches YouTube transcript via Supadata API."""
82
  if not video_id: logger.error("[Supadata] get_transcript_via_supadata called with no video_id"); return None
83
+ if not api_key: logger.error("[Supadata] API key is missing."); return None
84
  logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}")
85
  api_endpoint = f"https://api.supadata.net/v1/youtube/transcript"
86
  params = {"videoId": video_id, "format": "text"}
87
  headers = {"X-API-Key": api_key}
88
  try:
89
+ # ---!!! INSECURE TEST - DISABLES SSL VERIFICATION !!!---
90
+ logger.warning("[Supadata] Making request with verify=False (INSECURE TEST)")
91
+ response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30, verify=False)
92
+ # ---!!! END INSECURE TEST --- (Remember to remove verify=False later) ---
93
+
94
  logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
95
  if response.status_code == 200:
96
+ # (Rest of the success handling code remains the same)
97
  try:
98
  data = response.json()
99
  content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
 
103
  else:
104
  logger.warning(f"[Supadata] API success but content empty/invalid for {video_id}. Response: {data}")
105
  return None
106
+ except json.JSONDecodeError:
107
  if response.text:
108
  logger.info(f"[Supadata] Successfully fetched transcript (plain text) for {video_id}. Length: {len(response.text)}")
109
  return response.text.strip()
 
115
  return None
116
  elif response.status_code in [401, 403]:
117
  logger.error(f"[Supadata] Authentication error ({response.status_code}). Check API key.")
118
+ return None
119
  elif response.status_code == 404:
120
  logger.warning(f"[Supadata] Transcript not found ({response.status_code}) for {video_id}.")
121
  return None
 
127
  return None
128
  except requests.exceptions.RequestException as e:
129
  logger.error(f"[Supadata] Request error connecting to API for {video_id}: {e}")
130
+ # Log the specific SSLError if verify=False wasn't the only issue
131
+ if isinstance(e, requests.exceptions.SSLError):
132
+ logger.error(f"[Supadata] SSL Error details: {e}")
133
  return None
134
  except Exception as e:
135
  logger.error(f"[Supadata] Unexpected error during API call for {video_id}: {e}", exc_info=True)
 
139
  async def get_transcript_via_apify(video_url: str, api_token: str):
140
  """Fetches YouTube transcript via Apify API."""
141
  if not video_url: logger.error("[Apify] get_transcript_via_apify called with no video_url"); return None
142
+ if not api_token: logger.error("[Apify] API token is missing."); return None
143
  if not ApifyClient: logger.error("[Apify] ApifyClient not available/imported."); return None
144
 
145
  logger.info(f"[Apify] Attempting fetch for URL: {video_url}")
 
158
  headers = {"Content-Type": "application/json"}
159
  try:
160
  logger.debug(f"[Apify] Sending request to run actor {actor_id} synchronously for {video_url}")
161
+ response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, params=params, data=payload, timeout=90)
162
  logger.debug(f"[Apify] Received status code {response.status_code} for {video_url}")
163
+
164
+ # --- MODIFIED STATUS CODE CHECK ---
165
+ if response.status_code in [200, 201]: # Accept 200 OK or 201 Created
166
+ # --- END MODIFIED STATUS CODE CHECK ---
167
  try:
168
  results = response.json()
169
  if isinstance(results, list) and len(results) > 0:
170
  item = results[0]
171
  content = item.get("text") or item.get("transcript") or item.get("captions_concatenated")
172
+ if not content and item.get("captions") and isinstance(item["captions"], list):
173
  logger.info("[Apify] Processing 'captions' format.")
174
  content = " ".join(cap.get("text", "") for cap in item["captions"] if cap.get("text"))
175
  if content and isinstance(content, str):
176
+ logger.info(f"[Apify] Successfully fetched transcript for {video_url} (Status: {response.status_code}). Length: {len(content)}")
177
  return content.strip()
178
  else:
179
+ logger.warning(f"[Apify] Actor run successful ({response.status_code}) but content not found/empty for {video_url}. Item: {item}")
180
  return None
181
  else:
182
+ logger.warning(f"[Apify] Actor run successful ({response.status_code}) but dataset empty for {video_url}. Response: {results}")
183
  return None
184
  except json.JSONDecodeError:
185
+ logger.error(f"[Apify] Failed to decode JSON response for {video_url}. Status: {response.status_code}. Resp: {response.text[:200]}...")
186
  return None
187
  except Exception as e:
188
+ logger.error(f"[Apify] Error processing successful response ({response.status_code}) for {video_url}: {e}", exc_info=True)
189
  return None
190
  elif response.status_code == 400:
191
+ logger.error(f"[Apify] Bad Request (400) for {video_url}. Check payload. Response: {response.text[:200]}...")
192
  return None
193
  elif response.status_code == 401:
194
  logger.error("[Apify] Authentication error (401). Check API token.")
195
+ return None
196
+ else: # Catch other non-200/201 codes here
197
  logger.error(f"[Apify] Unexpected status code {response.status_code} for {video_url}. Response: {response.text[:200]}...")
198
  return None
199
  except requests.exceptions.Timeout:
 
206
  logger.error(f"[Apify] Unexpected error during Apify call for {video_url}: {e}", exc_info=True)
207
  return None
208
 
209
+ # (The rest of the functions: get_youtube_transcript, get_website_content_via_requests,
210
+ # get_website_content_via_urltotext_api, generate_summary, start, help_command,
211
+ # handle_potential_url, handle_summary_type_callback, error_handler, setup_bot,
212
+ # webhook, index, and the main execution block remain EXACTLY THE SAME as in the
213
+ # previous complete code block. Ensure they are included below this point.)
214
+
215
  # Combined YouTube Transcript Function (with Fallbacks)
216
  async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: str | None, apify_token: str | None):
217
  """Fetches YouTube transcript using library, then Supadata, then Apify."""
 
222
  # 1. Primary Method: youtube-transcript-api
223
  logger.info("[Primary YT] Attempting youtube-transcript-api...")
224
  try:
 
225
  transcript_list = await asyncio.to_thread(
226
  YouTubeTranscriptApi.get_transcript,
227
  video_id,
228
+ languages=['en', 'en-GB', 'en-US']
229
  )
230
  if transcript_list:
231
  transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
232
  if transcript_text:
233
  logger.info(f"[Primary YT] Successfully fetched transcript via library for {video_id} (length: {len(transcript_text)})")
234
+ return transcript_text
235
  else:
236
  logger.warning(f"[Primary YT] Joined transcript text is empty for {video_id}")
237
+ transcript_text = None
238
  else:
239
  logger.warning(f"[Primary YT] Transcript list empty for {video_id}")
240
  transcript_text = None
241
  except Exception as e:
242
  logger.warning(f"[Primary YT] Error getting transcript via library for {video_id}: {e}")
243
+ if "YouTube is blocking requests" in str(e): logger.warning("[Primary YT] IP likely blocked by YouTube.")
244
+ elif "No transcript found" in str(e): logger.warning(f"[Primary YT] No transcript found for {video_id}.")
245
  elif "disabled" in str(e): logger.warning(f"[Primary YT] Transcripts disabled for {video_id}.")
246
+ transcript_text = None
247
 
248
  # 2. Fallback 1: Supadata API
249
  if transcript_text is None:
 
252
  transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
253
  if transcript_text:
254
  logger.info(f"[Fallback YT 1] Successfully fetched transcript via Supadata for {video_id}")
255
+ return transcript_text
256
  else:
257
  logger.warning(f"[Fallback YT 1] Supadata API failed or returned no content for {video_id}.")
258
  else:
 
265
  transcript_text = await get_transcript_via_apify(video_url, apify_token)
266
  if transcript_text:
267
  logger.info(f"[Fallback YT 2] Successfully fetched transcript via Apify for {video_url}")
268
+ return transcript_text
269
  else:
270
  logger.warning(f"[Fallback YT 2] Apify API failed or returned no content for {video_url}.")
271
  else:
 
276
  logger.error(f"All methods failed to fetch transcript for video ID: {video_id}")
277
  return None
278
 
 
279
  return transcript_text
280
 
281
  # Website Content via Requests/BS4 (Primary Method for Simplified Bot)
 
285
  logger.info(f"[Web Scraper - Requests/BS4] Fetching website content for: {url}")
286
  try:
287
  headers = {
288
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
289
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
290
  'Accept-Language': 'en-US,en;q=0.9',
291
+ 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
 
 
 
292
  logger.debug(f"[Web Scraper - Requests/BS4] Sending request to {url}")
 
293
  response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
294
+ response.raise_for_status()
295
  logger.debug(f"[Web Scraper - Requests/BS4] Received response {response.status_code} from {url}")
 
296
  content_type = response.headers.get('content-type', '').lower()
297
  if 'html' not in content_type:
298
  logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received from {url}: {content_type}")
299
+ return None
 
 
300
  soup = BeautifulSoup(response.text, 'html.parser')
301
+ for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]): element.extract()
302
+ main_content = soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(class_='content') or soup.find(id='main-content') or soup.find(class_='main-content') or soup.find(role='main')
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  target_element = main_content if main_content else soup.body
 
304
  if not target_element:
305
  logger.warning(f"[Web Scraper - Requests/BS4] Could not find body or main content container for parsing {url}")
306
+ return None
 
 
307
  lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
308
+ text = "\n".join(lines)
309
+ if not text or len(text) < 50:
 
 
310
  logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is very short or empty after cleaning for {url} (Length: {len(text)})")
 
 
 
311
  logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped content for {url} (final length: {len(text)})")
312
  return text
313
+ except requests.exceptions.Timeout: logger.error(f"[Web Scraper - Requests/BS4] Timeout error scraping website: {url}"); return None
314
+ except requests.exceptions.TooManyRedirects: logger.error(f"[Web Scraper - Requests/BS4] Too many redirects error scraping website: {url}"); return None
315
+ except requests.exceptions.RequestException as e: logger.error(f"[Web Scraper - Requests/BS4] Request error scraping website {url}: {e}"); return None
316
+ except Exception as e: logger.error(f"[Web Scraper - Requests/BS4] Error scraping or parsing website {url}: {e}", exc_info=True); return None
 
 
 
 
 
 
 
 
 
 
 
317
 
318
  # Website Content via URLToText API (Fallback Method)
319
  async def get_website_content_via_urltotext_api(url: str, api_key: str):
320
  """Fetches website content using the URLToText API (Fallback)."""
321
  if not url: logger.error("[Web Scraper - URLToText API] called with no URL"); return None
322
+ if not api_key: logger.error("[Web Scraper - URLToText API] API key is missing."); return None
323
  logger.info(f"[Web Scraper - URLToText API] Attempting to fetch content for: {url}")
324
  api_endpoint = "https://urltotext.com/api/v1/urltotext/"
325
+ payload = json.dumps({ "url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False })
326
+ headers = { "Authorization": f"Token {api_key}", "Content-Type": "application/json" }
 
 
 
 
 
 
 
 
 
327
  try:
328
  logger.debug(f"[Web Scraper - URLToText API] Sending request for {url}")
329
  response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=45)
 
335
  credits = data.get("credits_used", "N/A")
336
  warning = data.get("data", {}).get("warning")
337
  if warning: logger.warning(f"[Web Scraper - URLToText API] Warning for {url}: {warning}")
338
+ if content: logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API for {url}. Length: {len(content)}. Credits: {credits}"); return content.strip()
339
+ else: logger.warning(f"[Web Scraper - URLToText API] API returned success but content was empty for {url}. Response: {data}"); return None
340
+ except json.JSONDecodeError: logger.error(f"[Web Scraper - URLToText API] Failed to decode JSON response for {url}. Response: {response.text[:500]}..."); return None
341
+ except Exception as e: logger.error(f"[Web Scraper - URLToText API] Error processing successful API response for {url}: {e}", exc_info=True); return None
342
+ elif response.status_code in [400, 402, 422, 500]: logger.error(f"[Web Scraper - URLToText API] Error {response.status_code} from API for {url}. Response: {response.text[:200]}..."); return None
343
+ else: logger.error(f"[Web Scraper - URLToText API] Unexpected status {response.status_code} from API for {url}. Response: {response.text[:200]}..."); return None
344
+ except requests.exceptions.Timeout: logger.error(f"[Web Scraper - URLToText API] Timeout error connecting to API for {url}"); return None
345
+ except requests.exceptions.RequestException as e: logger.error(f"[Web Scraper - URLToText API] Request error connecting to API for {url}: {e}"); return None
346
+ except Exception as e: logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call for {url}: {e}", exc_info=True); return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
 
# DeepSeek Summary Function (via OpenRouter)
async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
    """Summarize ``text`` with the DeepSeek chat model through the OpenRouter API.

    Args:
        text: Content to summarize; input longer than 500000 characters is
            truncated before being sent.
        summary_type: ``"paragraph"`` selects the paragraph prompt; any other
            value selects the alternative prompt.
        api_key: OpenRouter API key, sent as a Bearer token.

    Returns:
        The stripped summary on success. Failures are reported through the
        return value as a user-facing message starting with ``"Error:"`` or
        ``"Sorry,"``.
    """
    if not text:
        # Guard before len(text) below so empty/None input cannot raise.
        logger.warning("generate_summary called with no text to summarize.")
        return "Sorry, there was no content to summarize."

    logger.info(f"Generating {summary_type} summary using DeepSeek/OpenRouter. Input text length: {len(text)}")
    if not api_key:
        logger.error("OpenRouter API key was not provided.")
        return "Error: AI model config key missing."

    openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
    model_name = "deepseek/deepseek-chat:free"

    # NOTE(review): both prompts below are elided placeholders in this
    # revision ("Keep prompt as before") — restore the real prompt text.
    if summary_type == "paragraph":
        prompt = "..."
    else:
        prompt = """..."""

    MAX_INPUT_LENGTH = 500000
    if len(text) > MAX_INPUT_LENGTH:
        logger.warning(f"Input text ({len(text)}) > limit ({MAX_INPUT_LENGTH}). Truncating.")
        text = text[:MAX_INPUT_LENGTH] + "... (Truncated)"

    full_prompt = f"{prompt}\n\n{text}"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://huggingface.co/spaces/",
        "X-Title": "Telegram Summary Bot (HF Space)",
    }
    payload = json.dumps({
        "model": model_name,
        "messages": [{"role": "user", "content": full_prompt}],
    })

    try:
        logger.debug(f"Sending request to OpenRouter ({model_name})...")
        # requests is blocking, so run it in a worker thread.
        response = await asyncio.to_thread(
            requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=60
        )
        logger.debug(f"Received status code {response.status_code} from OpenRouter.")

        if response.status_code == 200:
            try:
                data = response.json()
                if data.get("choices"):
                    message = data["choices"][0].get("message")
                    if message and message.get("content"):
                        summary = message["content"].strip()
                        logger.info(f"Success generating summary. Len: {len(summary)}")
                        return summary
                    logger.warning(f"OpenRouter success but empty content. Resp: {data}")
                    return "Sorry, AI model returned empty summary."
                if data.get("error"):
                    logger.error(f"OpenRouter API Error: {data['error']}")
                else:
                    logger.error(f"Unexpected choices structure: {data.get('choices')}. Resp: {data}")
                return "Sorry, could not parse AI response (choices/error)."
            except json.JSONDecodeError:
                logger.error(f"Failed JSON decode from OpenRouter. Status: {response.status_code}. Resp: {response.text[:500]}...")
                return "Sorry, failed to understand AI response."
            except Exception as e:
                logger.error(f"Error processing OpenRouter success resp: {e}", exc_info=True)
                return "Sorry, error processing AI response."
        elif response.status_code == 401:
            logger.error("OpenRouter API key invalid (401). Check HF Secrets.")
            return "Error: AI model config key invalid."
        elif response.status_code == 402:
            logger.error("OpenRouter Payment Required (402). Check OpenRouter account.")
            return "Sorry, issue with AI service limits/payment."
        elif response.status_code == 429:
            logger.warning("OpenRouter Rate Limit (429).")
            return "Sorry, AI model busy. Try again."
        elif response.status_code == 500:
            logger.error(f"OpenRouter Internal Error (500). Resp: {response.text[:500]}...")
            return "Sorry, AI model service error. Try again later."
        else:
            logger.error(f"Unexpected status {response.status_code} from OpenRouter. Resp: {response.text[:500]}...")
            try:
                error_data = response.json()
                error_msg = error_data.get("error", {}).get("message", response.text[:100])
                return f"Sorry, AI service error ({response.status_code}): {error_msg}"
            except (ValueError, AttributeError, TypeError):
                # Body was not JSON, or not shaped like {"error": {"message": ...}}.
                # A bare except here would also swallow KeyboardInterrupt/SystemExit.
                return f"Sorry, AI service returned status {response.status_code}."
    except requests.exceptions.Timeout:
        logger.error("Timeout connecting to OpenRouter.")
        return "Sorry, request to AI model timed out."
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error connecting to OpenRouter: {e}")
        return "Sorry, error connecting to AI model service."
    except Exception as e:
        logger.error(f"Unexpected error in generate_summary: {e}", exc_info=True)
        return "Sorry, unexpected error generating summary."
393
+
394
+ # --- Telegram Bot Handlers ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Greet the user in reply to the /start command.

    Args:
        update: Incoming update carrying the /start message.
        context: Handler context (unused).
    """
    import html  # local import keeps this block self-contained

    user = update.effective_user
    if not user or not update.message:
        # Nothing to greet or nowhere to reply (e.g. update without a sender).
        return
    logger.info(f"User {user.id} ({user.username or 'NoUsername'}) used /start.")
    # reply_html() parses the text as HTML, so a raw first name containing
    # '<', '>' or '&' must be escaped or Telegram rejects the message.
    mention = user.mention_html() if user.username else html.escape(user.first_name)
    await update.message.reply_html(f"👋 Hello {mention}! I can summarize YouTube links or website URLs.\n\nJust send me a link anytime!")
 
 
 
 
 
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Reply to the /help command with the usage text (sent as Markdown).

    Args:
        update: Incoming update carrying the /help message.
        context: Handler context (unused).
    """
    logger.info(f"User {update.effective_user.id} used /help.")
    # NOTE(review): "..." is an elided placeholder in this revision
    # ("Keep help text") — restore the real help text.
    help_text = "..."
    await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
 
 
 
 
 
 
 
 
 
 
 
 
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Detect a link in a text message and ask which summary type to produce.

    Text that is not an http(s) URL with a dotted host name is ignored.
    Otherwise the URL is stored in ``context.user_data['url_to_summarize']``
    and an inline keyboard with the "Paragraph" / "Points" choices is sent.

    Args:
        update: Incoming update; ignored when it has no message text.
        context: Handler context whose ``user_data`` receives the URL.
    """
    from urllib.parse import urlsplit  # local import keeps this block self-contained

    if not update.message or not update.message.text:
        return
    url = update.message.text.strip()
    user = update.effective_user
    logger.info(f"User {user.id} ({user.username or 'NoUsername'}) sent potential URL: {url}")

    # Check the dot in the host itself. The previous fixed offset (url[8:]) was
    # only right for 'https://' and also accepted dots found in the path or
    # query of a dotless host.
    looks_like_url = False
    if url.startswith(('http://', 'https://')):
        try:
            looks_like_url = '.' in (urlsplit(url).hostname or '')
        except ValueError:
            # urlsplit rejects malformed authorities (e.g. unbalanced brackets).
            looks_like_url = False
    if not looks_like_url:
        logger.debug(f"Ignoring non-URL: {url}")
        return

    context.user_data['url_to_summarize'] = url
    logger.debug(f"Stored URL '{url}' for user {user.id}")
    keyboard = [[
        InlineKeyboardButton("Paragraph", callback_data="paragraph"),
        InlineKeyboardButton("Points", callback_data="points"),
    ]]
    reply_markup = InlineKeyboardMarkup(keyboard)
    await update.message.reply_text(
        f"Link detected:\n{url}\n\nChoose summary type:",
        reply_markup=reply_markup,
        disable_web_page_preview=True,
    )
 
 
 
 
 
411
async def _fetch_content_for_url(bot, chat_id, url, supadata_key, apify_token, urltotext_key):
    """Fetch the text to summarize for *url*.

    YouTube links go through the transcript helper; other links are scraped
    directly, falling back to the URLToText API when a key is configured.

    Returns:
        A ``(content, failure_message)`` tuple. ``failure_message`` is None
        when usable content was found.
    """
    if is_youtube_url(url):
        video_id = extract_youtube_id(url)
        if not video_id:
            return None, "Sorry, couldn't parse YT URL."
        logger.info(f"Fetching YT transcript: {video_id}")
        content = await get_youtube_transcript(video_id, url, supadata_key, apify_token)
        logger.info(f"YT transcript fetch done. Found: {bool(content)}")
        return content, (None if content else "Sorry, couldn't get YT transcript.")

    logger.info(f"Scraping website (Requests/BS4): {url}")
    content = await get_website_content_via_requests(url)
    if content:
        logger.info("Website scrape (Requests/BS4) OK.")
        return content, None

    logger.warning(f"Website scrape failed for {url}. Trying URLToText API.")
    if not urltotext_key:
        return None, "Sorry, couldn't fetch web content (fallback not configured)."

    # Refresh the typing indicator before the second, slower attempt.
    await bot.send_chat_action(chat_id=chat_id, action='typing')
    content = await get_website_content_via_urltotext_api(url, urltotext_key)
    if content:
        logger.info("URLToText API scrape OK.")
        return content, None
    return None, "Sorry, couldn't fetch web content (both methods)."


async def _send_summary_message(bot, chat_id, summary):
    """Send *summary* as Markdown; resend as plain text if Telegram rejects it."""
    from telegram.error import BadRequest  # local import keeps this block self-contained

    try:
        await bot.send_message(
            chat_id=chat_id, text=summary,
            parse_mode=ParseMode.MARKDOWN, disable_web_page_preview=True,
        )
    except BadRequest as e:
        # Model output is free-form text and may contain unbalanced Markdown
        # entities; deliver it unformatted rather than failing the request.
        logger.warning(f"Markdown send rejected ({e}). Resending summary as plain text.")
        await bot.send_message(chat_id=chat_id, text=summary, disable_web_page_preview=True)


async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle a summary-type button press and deliver the summary.

    Reads the URL previously stored in ``context.user_data['url_to_summarize']``,
    fetches its content, asks ``generate_summary`` for a summary of the chosen
    type (``query.data``) and sends the result, or a failure message, to the
    chat. The status/button message is removed at the end in every case.
    """
    query = update.callback_query
    await query.answer()
    summary_type = query.data
    user = update.effective_user or query.from_user
    # Answer in the chat that holds the button. In a group this differs from
    # the user's id, and sending to user.id would DM the user instead.
    chat = update.effective_chat
    chat_id = chat.id if chat else user.id

    url = context.user_data.get('url_to_summarize')
    logger.info(f"User {user.id} chose '{summary_type}' for URL '{url}'.")
    if not url:
        logger.warning(f"User {user.id} pressed button, NO URL in context.")
        await query.edit_message_text(text="Context lost. Send link again.")
        return
    context.user_data.pop('url_to_summarize', None)
    logger.debug(f"Cleared URL {url} for user {user.id}")

    # Keys are re-read from the environment on every callback.
    current_openrouter_key = os.environ.get('OPENROUTER_API_KEY')
    current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
    current_supadata_key = os.environ.get('SUPADATA_API_KEY')
    current_apify_token = os.environ.get('APIFY_API_TOKEN')
    if not current_openrouter_key:
        logger.error("OpenRouter key missing.")
        await context.bot.send_message(chat_id=chat_id, text="Error: AI config missing.")
        await query.delete_message()
        return

    processing_message = f"Working on '{summary_type}' summary for:\n{url}\n..."
    message_to_delete_later = None
    try:
        await query.edit_message_text(processing_message)
        logger.debug(f"Edited message query {query.id}")
    except Exception as e:
        # Editing can fail; fall back to a separate status message that is
        # cleaned up in the finally block below.
        logger.warning(f"Could not edit message {query.id}: {e}. Sending new.")
        message_to_delete_later = await context.bot.send_message(chat_id=chat_id, text=processing_message)

    user_feedback_message = None
    success = False
    try:
        logger.debug(f"Sending 'typing' action for chat {chat_id}")
        await context.bot.send_chat_action(chat_id=chat_id, action='typing')

        content, user_feedback_message = await _fetch_content_for_url(
            context.bot, chat_id, url,
            current_supadata_key, current_apify_token, current_urltotext_key,
        )

        if content:
            logger.info("Content found, generating summary.")
            await context.bot.send_chat_action(chat_id=chat_id, action='typing')
            summary = await generate_summary(content, summary_type, current_openrouter_key)
            # generate_summary reports failures as strings with these prefixes.
            if summary.startswith(("Error:", "Sorry,")):
                user_feedback_message = summary
                logger.warning(f"Summary generation failed: {summary}")
            else:
                logger.info("Summary generated OK. Sending.")
                await _send_summary_message(context.bot, chat_id, summary)
                success = True
                user_feedback_message = None
        elif not user_feedback_message:
            user_feedback_message = "Sorry, couldn't retrieve content."

        if user_feedback_message and not success:
            logger.warning(f"Sending failure feedback: {user_feedback_message}")
            await context.bot.send_message(chat_id=chat_id, text=user_feedback_message)
    except Exception as e:
        logger.error(f"Unexpected error in callback processing: {e}", exc_info=True)
        await context.bot.send_message(chat_id=chat_id, text="Oops! Internal error.")
    finally:
        logger.debug("Cleaning up status message...")
        try:
            if message_to_delete_later:
                await context.bot.delete_message(chat_id=chat_id, message_id=message_to_delete_later.message_id)
                logger.debug("Deleted separate status msg.")
            elif query:
                await query.delete_message()
                logger.debug(f"Deleted original message query {query.id}.")
        except Exception as del_e:
            # Cleanup is best-effort; a failed delete must not mask the result.
            logger.warning(f"Could not delete status/button message: {del_e}")
458
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Log the exception attached to the context, including its traceback."""
    failure = context.error
    logger.error(f"Exception while handling update: {failure}", exc_info=failure)
 
 
 
 
 
 
 
 
 
 
 
 
 
459
 
460
  # --- Bot Application Setup Function ---
461
async def setup_bot():
    """Build, initialize and wire up the Telegram Application.

    Returns:
        The initialized Application with all handlers registered, or None
        when TELEGRAM_TOKEN is missing or empty.
    """
    logger.info("Setting up Telegram Application...")
    if not TELEGRAM_TOKEN:
        logger.critical("Cannot initialize: TELEGRAM_TOKEN missing.")
        return None

    application = Application.builder().token(TELEGRAM_TOKEN).build()

    logger.info("Running application.initialize()...")
    await application.initialize()
    logger.info("Finished application.initialize().")

    # Registration order is preserved: commands, free text, button callbacks.
    update_handlers = (
        CommandHandler("start", start),
        CommandHandler("help", help_command),
        MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url),
        CallbackQueryHandler(handle_summary_type_callback),
    )
    for update_handler in update_handlers:
        application.add_handler(update_handler)

    application.add_error_handler(error_handler)
    logger.info("Telegram handlers registered.")
    return application
470
# Run the async bot setup once at import time so the web server has an
# application instance to dispatch webhook updates to.
logger.info("Running bot setup...")
try:
    ptb_app = asyncio.run(setup_bot())
except Exception as e:
    # The routes and the __main__ guard in this file already treat a falsy
    # ptb_app as "bot not configured", so record the failure instead of
    # aborting the import with an unhandled exception.
    logger.critical(f"Bot setup raised an exception: {e}", exc_info=True)
    ptb_app = None
logger.info(f"Bot setup finished. App instance: {'OK' if ptb_app else 'Failed'}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
471
 
472
  # --- Flask App Setup ---
473
# WSGI application object; the route decorators in this file attach to it.
app = Flask(__name__)
logger.info("Flask app created.")
 
 
474
 
475
  # --- Webhook Endpoint ---
476
@app.route('/webhook', methods=['POST'])
async def webhook() -> Response:
    """Receive a Telegram update via POST and hand it to the bot application.

    Returns:
        200 'ok' once the update has been processed, 400 for a non-JSON or
        malformed body, and 500 when the bot is not initialized or processing
        raises.
    """
    logger.info("Webhook request received...")
    if not ptb_app:
        logger.error("PTB App not initialized.")
        return Response('Bot not configured', status=500)

    if not request.is_json:
        logger.warning("Received non-JSON request to webhook.")
        return Response('Bad Request', status=400)

    # silent=True makes a malformed body come back as None instead of raising
    # an HTTP error, which the generic handler below would report as a 500.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        logger.error("Failed JSON decode from Telegram.")
        return Response('Bad Request', status=400)

    try:
        update = Update.de_json(payload, ptb_app.bot)
        logger.debug(f"Processing update ID: {update.update_id}")
        logger.debug("Directly awaiting process_update...")
        await ptb_app.process_update(update)
        logger.debug("Finished awaiting process_update.")
        return Response('ok', status=200)
    except Exception as e:
        logger.error(f"Error processing update in webhook: {e}", exc_info=True)
        return Response('Internal Server Error', status=500)
 
 
 
 
 
 
 
 
488
@app.route('/')
def index():
    """Health-check endpoint reporting whether the bot application exists."""
    logger.debug("Health check '/' accessed.")
    if ptb_app:
        bot_status = "Initialized"
    else:
        bot_status = "FAILED Init"
    return f"TG Bot Webhook Listener ({bot_status}) running."
 
 
 
 
 
490
 
491
  # --- Main Execution Block ---
 
492
if __name__ == '__main__':
    # Direct execution path (local testing); refuses to start without a bot.
    if not ptb_app:
        logger.critical("Aborting local Flask start: PTB App failed init.")
    else:
        logger.info("Starting Flask server directly (local testing?)...")
        port = int(os.environ.get('PORT', 5000))
        # Debug mode enables the interactive debugger, which allows arbitrary
        # code execution; since the server binds to all interfaces, keep it
        # off unless explicitly requested via FLASK_DEBUG.
        debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
        app.run(host='0.0.0.0', port=port, debug=debug_enabled)