fmab777 commited on
Commit
6a18fe1
·
verified ·
1 Parent(s): 3753286

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +270 -637
main.py CHANGED
@@ -1,12 +1,19 @@
1
- # main.py (Revised: Apify 201 fix + Supadata verify=False + Gunicorn/Uvicorn compatible)
2
  import os
3
  import re
4
  import logging
5
  import asyncio
6
  import json
7
- import html
8
- from flask import Flask, request, Response # For web server
9
 
 
 
 
 
 
 
 
10
  from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
11
  from telegram.ext import (
12
  Application,
@@ -15,39 +22,41 @@ from telegram.ext import (
15
  filters,
16
  ContextTypes,
17
  CallbackQueryHandler,
18
- # ApplicationBuilder is implicitly used by Application.builder()
19
  )
20
- from telegram.constants import ParseMode # Import ParseMode explicitly
 
21
 
22
- # Import specific libraries (Ensure these are covered in requirements.txt)
23
  from youtube_transcript_api import YouTubeTranscriptApi
24
  import requests
25
  from bs4 import BeautifulSoup
26
- # Only import ApifyClient if you might use it (i.e., have the secret)
27
  _apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
28
  if _apify_token_exists:
29
  from apify_client import ApifyClient
30
  else:
31
- ApifyClient = None # Explicitly set to None if not imported
32
 
33
  # --- Logging Setup ---
34
  logging.basicConfig(
35
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
36
- level=logging.DEBUG # Keep DEBUG for detailed info
37
  )
38
- # Reduce noise from libraries
39
  logging.getLogger("httpx").setLevel(logging.WARNING)
40
  if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
41
- logging.getLogger("telegram.ext").setLevel(logging.INFO) # INFO is usually enough for PTB ext
42
- logging.getLogger('telegram.bot').setLevel(logging.INFO) # INFO is usually enough for PTB bot API calls
43
  logging.getLogger("urllib3").setLevel(logging.INFO)
44
- logging.getLogger('gunicorn.error').setLevel(logging.INFO) # Gunicorn logs
45
- logging.getLogger('uvicorn.error').setLevel(logging.INFO) # Uvicorn logs
 
46
  logger = logging.getLogger(__name__)
47
- logger.info("Logging configured (Main logger: DEBUG).")
 
 
 
48
 
49
  # --- Environment Variable Loading ---
50
- logger.info("Attempting to load secrets from environment variables...")
51
  def get_secret(secret_name):
52
  logger.debug(f"Attempting to read secret: {secret_name}")
53
  value = os.environ.get(secret_name)
@@ -68,14 +77,12 @@ logger.info("Secret loading attempt finished.")
68
  def is_youtube_url(url):
69
  """Checks if the URL is a valid YouTube video or shorts URL."""
70
  youtube_regex = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
71
- # Added check for common query params like si= or feature=
72
  match = re.search(youtube_regex, url)
73
  logger.debug(f"is_youtube_url check for '{url}': {'Match found' if match else 'No match'}")
74
  return bool(match)
75
 
76
  def extract_youtube_id(url):
77
  """Extracts the YouTube video ID from a URL."""
78
- # Updated regex to better handle query parameters after the ID
79
  youtube_id_regex = r'(?:youtube\.com/(?:watch\?v=|shorts/)|youtu\.be/)([\w-]{11})(?:\?|&|\s|$)'
80
  match = re.search(youtube_id_regex, url)
81
  if match:
@@ -96,16 +103,12 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
96
  params = {"videoId": video_id, "format": "text"}
97
  headers = {"X-API-Key": api_key}
98
  try:
99
- # --- Keep verify=False for testing, but log clearly ---
100
  logger.warning("[Supadata] Making request with verify=False (Attempting to bypass SSL verification)")
101
  response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30, verify=False)
102
- # --- End verify=False section ---
103
-
104
  logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
105
  if response.status_code == 200:
106
  try:
107
  data = response.json()
108
- # Handle both direct string response and JSON object response
109
  content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
110
  if content and isinstance(content, str):
111
  logger.info(f"[Supadata] Successfully fetched transcript for {video_id}. Length: {len(content)}")
@@ -114,7 +117,6 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
114
  logger.warning(f"[Supadata] API success but content empty/invalid for {video_id}. Response: {data}")
115
  return None
116
  except json.JSONDecodeError:
117
- # If JSON fails, maybe it's plain text?
118
  if response.text:
119
  logger.info(f"[Supadata] Successfully fetched transcript (plain text) for {video_id}. Length: {len(response.text)}")
120
  return response.text.strip()
@@ -138,7 +140,6 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
138
  return None
139
  except requests.exceptions.RequestException as e:
140
  logger.error(f"[Supadata] Request error connecting to API for {video_id}: {e}")
141
- # Log specific SSL Error details even with verify=False
142
  if isinstance(e, requests.exceptions.SSLError):
143
  logger.error(f"[Supadata] SSL Error occurred despite using verify=False. Details: {e}")
144
  return None
@@ -146,7 +147,7 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
146
  logger.error(f"[Supadata] Unexpected error during API call for {video_id}: {e}", exc_info=True)
147
  return None
148
 
149
- # Apify Transcript Fetching
150
  async def get_transcript_via_apify(video_url: str, api_token: str):
151
  """Fetches YouTube transcript via Apify API."""
152
  if not video_url: logger.error("[Apify] get_transcript_via_apify called with no video_url"); return None
@@ -182,35 +183,25 @@ async def get_transcript_via_apify(video_url: str, api_token: str):
182
  # --- !!! FIXED FALLBACK LOGIC !!! ---
183
  if not content and item.get("captions"):
184
  captions_data = item["captions"]
185
- # Check if 'captions' is a STRING (like in the log) or a LIST of dicts
186
  if isinstance(captions_data, str):
187
  logger.info("[Apify] Processing 'captions' string format as fallback.")
188
- # Use the string directly if it seems to be the transcript
189
  content = captions_data.strip()
190
- # Basic check if it looks like a transcript vs an error message
191
  if len(content) < 50 and "error" in content.lower():
192
  logger.warning(f"[Apify] 'captions' string looks like an error: {content}")
193
- content = None # Discard if it looks like an error
194
  elif isinstance(captions_data, list):
195
  logger.info("[Apify] Processing 'captions' list format as fallback.")
196
- # Join text from list of dicts (handle potential missing 'text' keys)
197
  texts = [cap.get("text", "") for cap in captions_data if isinstance(cap, dict) and cap.get("text")]
198
  content = " ".join(texts).strip()
199
  else:
200
  logger.warning(f"[Apify] 'captions' field found but is neither string nor list: {type(captions_data)}")
201
- content = None # Cannot parse
202
 
203
- # Clean up potential HTML entities like '
204
  if content:
205
  try:
206
- import html
207
- content = html.unescape(content)
208
- except ImportError:
209
- logger.warning("[Apify] 'html' module not found, cannot unescape entities.")
210
  except Exception as unescape_err:
211
  logger.warning(f"[Apify] Error during html unescaping: {unescape_err}")
212
-
213
-
214
  # --- !!! END FIXED FALLBACK LOGIC !!! ---
215
 
216
  if content and isinstance(content, str):
@@ -223,9 +214,8 @@ async def get_transcript_via_apify(video_url: str, api_token: str):
223
  elif not item.get("captions"):
224
  logger.warning(f"[Apify] Actor run successful ({response.status_code}) but no text/transcript/captions_concatenated/captions field found for {video_url}. Item: {item}")
225
  else:
226
- # This case means the fallback parsing above failed
227
  logger.warning(f"[Apify] Actor run successful ({response.status_code}), 'captions' field found but fallback parsing failed to extract content for {video_url}.")
228
- return None # Return None if no content extracted
229
  else:
230
  logger.warning(f"[Apify] Actor run successful ({response.status_code}) but dataset result list empty for {video_url}. Response: {results}")
231
  return None
@@ -235,7 +225,6 @@ async def get_transcript_via_apify(video_url: str, api_token: str):
235
  except Exception as e:
236
  logger.error(f"[Apify] Error processing successful response ({response.status_code}) for {video_url}: {e}", exc_info=True)
237
  return None
238
- # ... (rest of the error handling remains the same) ...
239
  elif response.status_code == 400:
240
  logger.error(f"[Apify] Bad Request (400) for {video_url}. Check payload. Response: {response.text[:200]}...")
241
  return None
@@ -262,366 +251,179 @@ async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: st
262
  if not video_id: logger.error("get_youtube_transcript called with no video_id"); return None
263
  logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
264
  transcript_text = None
265
-
266
- # 1. Primary Method: youtube-transcript-api
267
- logger.info("[Primary YT] Attempting youtube-transcript-api...")
268
- try:
269
- # Run the blocking IO call in a separate thread
270
  transcript_list = await asyncio.to_thread(
271
  YouTubeTranscriptApi.get_transcript,
272
  video_id,
273
- languages=['en', 'en-GB', 'en-US'] # Prioritize English variants
274
  )
275
  if transcript_list:
276
  transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
277
- # Clean up excessive whitespace that might result from joining
278
  transcript_text = re.sub(r'\s+', ' ', transcript_text).strip()
279
  if transcript_text:
280
  logger.info(f"[Primary YT] Successfully fetched transcript via library for {video_id} (length: {len(transcript_text)})")
281
  return transcript_text
282
  else:
283
  logger.warning(f"[Primary YT] Joined transcript text is empty after cleaning for {video_id}")
284
- transcript_text = None # Explicitly set to None
285
  else:
286
  logger.warning(f"[Primary YT] Transcript list was empty for {video_id}")
287
  transcript_text = None
288
  except Exception as e:
289
- # Log the specific error type for better debugging
290
  logger.warning(f"[Primary YT] Error getting transcript via library for {video_id}: {type(e).__name__} - {e}")
291
- # Be more specific about common errors
292
- if "YouTube is blocking requests" in str(e) or "HTTP Error 429" in str(e):
293
- logger.warning("[Primary YT] IP likely blocked by YouTube (Rate Limit / Cloud IP).")
294
- elif "No transcript found" in str(e):
295
- logger.warning(f"[Primary YT] No transcript available in specified languages for {video_id}.")
296
- elif "TranscriptsDisabled" in str(e) or "disabled" in str(e):
297
- logger.warning(f"[Primary YT] Transcripts are disabled for {video_id}.")
298
- # Ensure transcript_text is None if any exception occurred
299
  transcript_text = None
300
 
301
- # 2. Fallback 1: Supadata API
302
- if transcript_text is None:
303
  logger.info("[Fallback YT 1] Primary method failed or yielded no text. Trying Supadata API...")
304
  if supadata_key:
305
  transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
306
  if transcript_text:
307
  logger.info(f"[Fallback YT 1] Successfully fetched transcript via Supadata for {video_id} (length: {len(transcript_text)})")
308
- return transcript_text # Return immediately on success
309
- else:
310
- logger.warning(f"[Fallback YT 1] Supadata API failed or returned no content for {video_id}.")
311
- # transcript_text remains None
312
- else:
313
- logger.warning("[Fallback YT 1] Supadata API key not available. Skipping.")
314
 
315
- # 3. Fallback 2: Apify API
316
- if transcript_text is None:
317
  logger.info("[Fallback YT 2] Primary & Supadata failed or yielded no text. Trying Apify API...")
318
  if apify_token:
319
  transcript_text = await get_transcript_via_apify(video_url, apify_token)
320
  if transcript_text:
321
  logger.info(f"[Fallback YT 2] Successfully fetched transcript via Apify for {video_url} (length: {len(transcript_text)})")
322
- return transcript_text # Return immediately on success
323
- else:
324
- logger.warning(f"[Fallback YT 2] Apify API failed or returned no content for {video_url}.")
325
- # transcript_text remains None
326
- else:
327
- logger.warning("[Fallback YT 2] Apify API token not available. Skipping.")
328
 
329
- # If all methods failed
330
  if transcript_text is None:
331
  logger.error(f"All methods failed to fetch transcript for video ID: {video_id}")
332
  return None
 
333
 
334
- # This line should technically not be reached if logic is correct, but added for safety
335
- return transcript_text
336
-
337
- # Website Content via Requests/BS4 (Primary Method for Simplified Bot)
338
  async def get_website_content_via_requests(url):
339
- """Attempts to scrape website content using requests/BeautifulSoup (Primary Method)."""
340
  if not url: logger.error("[Web Scraper - Requests/BS4] called with no URL"); return None
341
  logger.info(f"[Web Scraper - Requests/BS4] Fetching website content for: {url}")
342
  try:
343
- # Standard headers to mimic a browser
344
- headers = {
345
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
346
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
347
- 'Accept-Language': 'en-US,en;q=0.9',
348
- 'Connection': 'keep-alive',
349
- 'DNT': '1', # Do Not Track
350
- 'Upgrade-Insecure-Requests': '1'
351
- }
352
  logger.debug(f"[Web Scraper - Requests/BS4] Sending GET request to {url}")
353
- # Run blocking requests call in thread
354
  response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
355
- response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
356
  logger.debug(f"[Web Scraper - Requests/BS4] Received response {response.status_code} from {url}")
357
-
358
  content_type = response.headers.get('content-type', '').lower()
359
  if 'html' not in content_type:
360
- logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received from {url}: {content_type}. Skipping parsing.")
361
- # Maybe return raw text if it's plain text? Or just None.
362
- if 'text/plain' in content_type and response.text:
363
- logger.info(f"[Web Scraper - Requests/BS4] Returning plain text content for {url}")
364
- return response.text.strip()
365
  return None
366
-
367
- # Parse with BeautifulSoup
368
  soup = BeautifulSoup(response.text, 'html.parser')
369
-
370
- # Remove common unwanted elements
371
- for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]):
372
- element.extract() # Remove the tag from the tree
373
-
374
- # Try to find the main content area (add more selectors if needed)
375
- main_content = (
376
- soup.find('main') or
377
- soup.find('article') or
378
- soup.find(id='content') or
379
- soup.find(class_='content') or
380
- soup.find(id='main-content') or
381
- soup.find(class_='main-content') or
382
- soup.find(role='main')
383
- )
384
-
385
- # Fallback to body if no specific main area found
386
  target_element = main_content if main_content else soup.body
387
-
388
  if not target_element:
389
- logger.warning(f"[Web Scraper - Requests/BS4] Could not find body or main content container for parsing {url}")
390
- # Try getting text from the whole soup as a last resort? Could be messy.
391
- # raw_text = soup.get_text(separator='\n', strip=True)
392
- # if raw_text: return "\n".join(line.strip() for line in raw_text.splitlines() if line.strip())
393
- return None # Return None if body itself is missing
394
-
395
- # Extract text, clean up lines, and join
396
  lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
397
  text = "\n".join(lines)
398
-
399
- # Check if extracted text is reasonably long
400
  MIN_TEXT_LENGTH = 50
401
  if not text or len(text) < MIN_TEXT_LENGTH:
402
- logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is very short (<{MIN_TEXT_LENGTH} chars) or empty after cleaning for {url} (Length: {len(text)})")
403
- # Consider returning None if text is too short, might indicate failed extraction
404
- # return None
405
-
406
  logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped content for {url} (final length: {len(text)})")
407
  return text
 
 
 
 
408
 
409
- except requests.exceptions.Timeout:
410
- logger.error(f"[Web Scraper - Requests/BS4] Timeout error scraping website: {url}"); return None
411
- except requests.exceptions.TooManyRedirects:
412
- logger.error(f"[Web Scraper - Requests/BS4] Too many redirects error scraping website: {url}"); return None
413
- except requests.exceptions.RequestException as e: # Catches HTTPError, ConnectionError etc.
414
- logger.error(f"[Web Scraper - Requests/BS4] Request error scraping website {url}: {e}"); return None
415
- except Exception as e: # Catch potential BS4 errors or others
416
- logger.error(f"[Web Scraper - Requests/BS4] Error scraping or parsing website {url}: {e}", exc_info=True); return None
417
-
418
-
419
- # Website Content via URLToText API (Fallback Method)
420
  async def get_website_content_via_urltotext_api(url: str, api_key: str):
421
- """Fetches website content using the URLToText API (Fallback)."""
422
  if not url: logger.error("[Web Scraper - URLToText API] called with no URL"); return None
423
  if not api_key: logger.error("[Web Scraper - URLToText API] API key is missing."); return None
424
-
425
- logger.info(f"[Web Scraper - URLToText API] Attempting to fetch content for: {url}")
426
  api_endpoint = "https://urltotext.com/api/v1/urltotext/"
427
- # Ensure payload is correctly formatted JSON string
428
- payload = json.dumps({
429
- "url": url,
430
- "output_format": "text",
431
- "extract_main_content": True, # Use their main content extraction
432
- "render_javascript": True, # Important for dynamic sites
433
- "residential_proxy": False # Usually not needed unless specifically blocked
434
- })
435
- headers = {
436
- "Authorization": f"Token {api_key}",
437
- "Content-Type": "application/json"
438
- }
439
-
440
  try:
441
  logger.debug(f"[Web Scraper - URLToText API] Sending POST request for {url}")
442
- # Run blocking requests call in thread
443
- response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=45) # Slightly longer timeout
444
  logger.debug(f"[Web Scraper - URLToText API] Received status code {response.status_code} for {url}")
445
-
446
  if response.status_code == 200:
447
  try:
448
  data = response.json()
449
- content = data.get("data", {}).get("content") # Navigate nested structure
450
  credits = data.get("credits_used", "N/A")
451
- warning = data.get("data", {}).get("warning") # Check for warnings
452
-
453
- if warning:
454
- logger.warning(f"[Web Scraper - URLToText API] Warning received for {url}: {warning}")
455
-
456
  if content:
457
  logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API for {url}. Length: {len(content)}. Credits: {credits}")
458
  return content.strip()
459
  else:
460
- logger.warning(f"[Web Scraper - URLToText API] API returned success (200) but content was empty for {url}. Response: {data}")
461
- return None # Return None if content is missing despite 200 OK
462
-
463
- except json.JSONDecodeError:
464
- logger.error(f"[Web Scraper - URLToText API] Failed to decode JSON response for {url}. Status: {response.status_code}. Response text: {response.text[:500]}...")
465
- return None
466
- except Exception as e:
467
- logger.error(f"[Web Scraper - URLToText API] Error processing successful API response for {url}: {e}", exc_info=True)
468
- return None
469
-
470
- # Handle specific error codes from URLToText API docs
471
- elif response.status_code == 400: # Bad Request (e.g., invalid URL format)
472
- logger.error(f"[Web Scraper - URLToText API] Bad Request (400) from API for {url}. Check URL/payload. Response: {response.text[:200]}...")
473
- elif response.status_code == 401: # Unauthorized (Invalid API Key)
474
- logger.error(f"[Web Scraper - URLToText API] Unauthorized (401) from API for {url}. Check API Key. Response: {response.text[:200]}...")
475
- elif response.status_code == 402: # Payment Required (Credits exhausted)
476
- logger.error(f"[Web Scraper - URLToText API] Payment Required (402) from API for {url}. Check credits. Response: {response.text[:200]}...")
477
- elif response.status_code == 422: # Unprocessable Entity (e.g., URL cannot be reached)
478
- logger.warning(f"[Web Scraper - URLToText API] Unprocessable URL (422) reported by API for {url}. Response: {response.text[:200]}...")
479
- elif response.status_code == 500: # Internal Server Error on their end
480
- logger.error(f"[Web Scraper - URLToText API] Internal Server Error (500) from API for {url}. Response: {response.text[:200]}...")
481
- else: # Catch-all for other unexpected codes
482
- logger.error(f"[Web Scraper - URLToText API] Unexpected status {response.status_code} from API for {url}. Response: {response.text[:200]}...")
483
- return None # Return None for all error cases
484
-
485
- except requests.exceptions.Timeout:
486
- logger.error(f"[Web Scraper - URLToText API] Timeout error connecting to API for {url}")
487
- return None
488
- except requests.exceptions.RequestException as e:
489
- logger.error(f"[Web Scraper - URLToText API] Request error connecting to API for {url}: {e}")
490
- return None
491
- except Exception as e:
492
- logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call for {url}: {e}", exc_info=True)
493
  return None
 
 
 
494
 
495
- # DeepSeek Summary Function (via OpenRouter)
496
  async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
497
  """Generates summary using DeepSeek via OpenRouter API."""
498
- logger.info(f"Generating '{summary_type}' summary using DeepSeek/OpenRouter. Input text length: {len(text)}")
499
- if not api_key:
500
- logger.error("OpenRouter API key was not provided.")
501
- return "Error: AI model configuration key is missing."
502
- if not text:
503
- logger.warning("generate_summary called with empty text.")
504
- return "Error: No content provided to summarize."
505
-
506
  openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
507
- # Using the free DeepSeek model - be mindful of potential rate limits or changes
508
  model_name = "deepseek/deepseek-chat:free"
509
-
510
- # Define prompts based on summary type
511
- if summary_type == "paragraph":
512
- prompt = "Please provide a concise, well-written paragraph summarizing the key information and main points of the following text. Focus on capturing the essence of the content accurately."
513
- elif summary_type == "points":
514
- prompt = "Please summarize the following text into clear, distinct bullet points. Each point should highlight a key piece of information, finding, or main topic discussed. Aim for clarity and conciseness."
515
- else:
516
- logger.error(f"Invalid summary_type '{summary_type}' passed to generate_summary.")
517
- return f"Error: Invalid summary type requested ('{summary_type}')."
518
-
519
- # Check and truncate input text if necessary
520
- # Max context varies per model, DeepSeek Chat's is large, but setting a practical limit is wise.
521
- # Let's use a large but reasonable limit to avoid huge API calls.
522
- MAX_INPUT_LENGTH = 500000 # Approx 500k characters
523
- if len(text) > MAX_INPUT_LENGTH:
524
- logger.warning(f"Input text length ({len(text)}) exceeds maximum limit ({MAX_INPUT_LENGTH}). Truncating.")
525
- text = text[:MAX_INPUT_LENGTH] + "... (Content truncated due to length)"
526
-
527
  full_prompt = f"{prompt}\n\n--- Start of Text ---\n\n{text}\n\n--- End of Text ---"
528
-
529
- headers = {
530
- "Authorization": f"Bearer {api_key}",
531
- "Content-Type": "application/json",
532
- # Recommended headers for OpenRouter when calling from identifiable services
533
- "HTTP-Referer": "https://huggingface.co/spaces/YOUR_SPACE_NAME", # TODO: Replace with your actual space name if possible
534
- "X-Title": "Telegram URL Summarizer Bot (HF Space)" # Or your bot's name
535
- }
536
-
537
- payload = json.dumps({
538
- "model": model_name,
539
- "messages": [
540
- {"role": "user", "content": full_prompt}
541
- ],
542
- # Optional parameters (adjust if needed):
543
- # "temperature": 0.7, # Controls randomness (optional)
544
- # "max_tokens": 1024, # Limit response length (optional)
545
- })
546
-
547
  try:
548
  logger.debug(f"Sending request to OpenRouter (Model: {model_name})...")
549
- # Run blocking requests call in thread
550
- response = await asyncio.to_thread(requests.post,
551
- openrouter_api_endpoint,
552
- headers=headers,
553
- data=payload,
554
- timeout=90 # Increased timeout for potentially long generation
555
- )
556
  logger.debug(f"Received status code {response.status_code} from OpenRouter.")
557
-
558
  if response.status_code == 200:
559
  try:
560
  data = response.json()
561
- # Check the response structure carefully
562
  if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0:
563
  message = data["choices"][0].get("message")
564
  if message and message.get("content"):
565
  summary = message["content"].strip()
566
- # Basic check for empty or placeholder summary
567
- if summary:
568
- logger.info(f"Successfully generated summary via OpenRouter. Length: {len(summary)}")
569
- return summary
570
- else:
571
- logger.warning(f"OpenRouter returned success (200) but summary content is empty. Response: {data}")
572
- return "Sorry, the AI model returned an empty summary."
573
- else:
574
- logger.warning(f"OpenRouter success (200) but response structure missing expected content. Response: {data}")
575
- return "Sorry, could not parse the AI model's response (missing content)."
576
- # Handle cases where 'choices' might be missing or empty, or if there's an error object
577
- elif data.get("error"):
578
- error_details = data["error"]
579
- logger.error(f"OpenRouter API Error ({response.status_code}): {error_details}")
580
- return f"Sorry, the AI service reported an error: {error_details.get('message', 'Unknown error')}"
581
- else:
582
- logger.error(f"OpenRouter success (200) but unexpected response structure (no choices/error). Response: {data}")
583
- return "Sorry, could not parse the AI model's response (unexpected structure)."
584
-
585
- except json.JSONDecodeError:
586
- logger.error(f"Failed to decode JSON response from OpenRouter. Status: {response.status_code}. Response: {response.text[:500]}...")
587
- return "Sorry, failed to understand the AI model's response format."
588
- except Exception as e:
589
- logger.error(f"Error processing successful OpenRouter response: {e}", exc_info=True)
590
- return "Sorry, an error occurred while processing the AI model's response."
591
-
592
- # Handle specific HTTP error codes
593
- elif response.status_code == 401: # Unauthorized
594
- logger.error("OpenRouter API key is invalid or unauthorized (401). Check the key in HF Secrets.")
595
- return "Error: AI model configuration key is invalid."
596
- elif response.status_code == 402: # Payment Required / Quota Exceeded
597
- logger.error("OpenRouter Payment Required / Quota Exceeded (402). Check OpenRouter account status and limits.")
598
- return "Sorry, there's an issue with the AI service account (limits or payment)."
599
- elif response.status_code == 429: # Rate Limit Exceeded
600
- logger.warning("OpenRouter Rate Limit Exceeded (429). Need to wait or slow down requests.")
601
- return "Sorry, the AI model service is busy right now. Please try again in a moment."
602
- elif response.status_code >= 500: # Server Error on OpenRouter's side
603
- logger.error(f"OpenRouter Internal Server Error ({response.status_code}). Response: {response.text[:500]}...")
604
- return "Sorry, the AI model service encountered an internal error. Please try again later."
605
- else: # Catch-all for other client-side or unexpected errors
606
- logger.error(f"Unexpected status code {response.status_code} received from OpenRouter. Response: {response.text[:500]}...")
607
- try:
608
- # Try to parse error message from response if possible
609
- error_data = response.json()
610
- error_msg = error_data.get("error", {}).get("message", response.text[:100])
611
- return f"Sorry, the AI service returned an error ({response.status_code}): {error_msg}"
612
- except:
613
- # Fallback if response is not JSON or doesn't have expected error structure
614
- return f"Sorry, the AI service returned an unexpected status code ({response.status_code})."
615
-
616
- except requests.exceptions.Timeout:
617
- logger.error("Timeout occurred while connecting to OpenRouter.")
618
- return "Sorry, the request to the AI model timed out. Please try again."
619
- except requests.exceptions.RequestException as e:
620
- logger.error(f"Network error connecting to OpenRouter: {e}")
621
- return "Sorry, there was a network error connecting to the AI model service."
622
- except Exception as e:
623
- logger.error(f"Unexpected error occurred in generate_summary function: {e}", exc_info=True)
624
- return "Sorry, an unexpected internal error occurred while generating the summary."
625
 
626
 
627
  # --- Telegram Bot Handlers ---
@@ -629,361 +431,196 @@ async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
629
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
630
  """Handles the /start command."""
631
  user = update.effective_user
632
- if not user: return # Should not happen but good practice
633
  logger.info(f"User {user.id} ({user.username or 'NoUsername'}) triggered /start.")
634
- # Use mention_html for a clickable link if username exists, otherwise just first name
635
  mention = user.mention_html() if user.username else user.first_name
636
- await update.message.reply_html(
637
- f"👋 Hello {mention}! I can summarize YouTube links or website URLs.\n\n"
638
- "Just send me a valid link (starting with http:// or https://) and I'll ask you how you want it summarized."
639
- )
640
 
641
  async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
642
  """Handles the /help command."""
643
  user = update.effective_user
644
  logger.info(f"User {user.id if user else 'Unknown'} triggered /help.")
645
- help_text = (
646
- "**How I Work:**\n\n"
647
- "1. Send me a full URL (starting with `http://` or `https://`).\n"
648
- "2. I'll detect if it's a YouTube video link or a general website URL.\n"
649
- "3. I'll ask if you want a **Paragraph** summary or **Points** summary.\n"
650
- "4. Choose your preferred format by clicking the button.\n"
651
- "5. I'll fetch the content (transcript for YouTube, text for websites) and use an AI model (via OpenRouter) to generate the summary.\n\n"
652
- "**Troubleshooting:**\n"
653
- "- **YouTube:** Sometimes transcripts aren't available (private video, no captions, disabled). I use multiple methods (library, Supadata, Apify) to try and get them.\n"
654
- "- **Websites:** Complex websites with lots of JavaScript might be difficult to scrape accurately. I use a primary scraping method and a fallback API (URLToText) if needed.\n"
655
- "- **AI Errors:** Occasionally, the AI model might be busy or encounter an error. You can try again later.\n\n"
656
- "Just send a link to get started!"
657
- )
658
  await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
659
 
660
  async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
661
  """Handles messages containing potential URLs."""
662
- if not update.message or not update.message.text:
663
- return # Ignore empty messages
664
-
665
- message_text = update.message.text.strip()
666
- user = update.effective_user
667
  if not user: return
668
-
669
- # Simple check for potential URLs - refine if needed
670
- # Looks for http:// or https:// followed by some characters and a dot.
671
- url_pattern = r'https?://[^\s/$.?#].[^\s]*'
672
- match = re.search(url_pattern, message_text)
673
-
674
  if match:
675
- url = match.group(0) # Extract the first matched URL
676
- logger.info(f"User {user.id} ({user.username or 'NoUsername'}) sent potential URL: {url}")
677
-
678
- # Store the URL in user_data, associated with the user ID
679
- context.user_data['url_to_summarize'] = url
680
- logger.debug(f"Stored URL '{url}' in user_data for user {user.id}")
681
-
682
- # Define the inline keyboard buttons
683
- keyboard = [
684
- [
685
- InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"),
686
- InlineKeyboardButton("Points Summary", callback_data="points")
687
- ]
688
- ]
689
  reply_markup = InlineKeyboardMarkup(keyboard)
690
-
691
- # Reply to the user, asking for the summary type
692
- await update.message.reply_text(
693
- f"Okay, I see this link:\n{url}\n\nHow would you like it summarized?",
694
- reply_markup=reply_markup,
695
- # disable_web_page_preview=True # Good practice to avoid double previews
696
- link_preview_options={'is_disabled': True} # Updated way for PTB v20+
697
- )
698
- else:
699
- # If the message doesn't look like a URL, optionally reply or just ignore
700
- logger.debug(f"Ignoring non-URL message from user {user.id}: {message_text[:100]}")
701
- # Optional: Reply if you want to guide the user
702
- # await update.message.reply_text("Please send me a valid URL starting with http:// or https://")
703
-
704
 
705
  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
706
  """Handles button presses for choosing the summary type."""
707
- query = update.callback_query
708
- if not query: return
709
- user = query.from_user # Get user from the callback query
710
-
711
- # --- 1. Answer the callback query ---
712
- # It's crucial to answer quickly to remove the "loading" state on the button.
713
- try:
714
- await query.answer()
715
- logger.debug(f"Answered callback query {query.id}")
716
- except Exception as e:
717
- # This can happen if the query is too old, but log it.
718
- logger.error(f"Failed to answer callback query {query.id}: {e}")
719
- # Don't stop processing, but be aware the button might stay loading for the user.
720
-
721
- # --- 2. Get data and context ---
722
- summary_type = query.data # e.g., "paragraph" or "points"
723
- # Retrieve the URL stored earlier for this user
724
- url = context.user_data.get('url_to_summarize')
725
-
726
- logger.info(f"User {user.id} ({user.username or 'NoUsername'}) chose '{summary_type}' summary. Checking for URL '{url}' in context.")
727
-
728
  if not url:
729
- logger.warning(f"User {user.id} pressed button, but NO URL was found in user_data context. Query data: {summary_type}")
730
- # Edit the original message to inform the user
731
- try:
732
- await query.edit_message_text(text="Sorry, I seem to have lost the context. 🤔 Please send the link again.")
733
- except Exception as edit_err:
734
- logger.error(f"Failed to edit message after lost context error: {edit_err}")
735
- return # Stop processing
736
-
737
- # Clear the URL from context once we start processing it
738
- context.user_data.pop('url_to_summarize', None)
739
- logger.debug(f"Retrieved and cleared URL {url} from user_data for user {user.id}")
740
-
741
- # --- 3. Check for required API keys ---
742
- # Re-fetch keys inside handler in case they were updated (less critical here, but good practice)
743
- current_openrouter_key = os.environ.get('OPENROUTER_API_KEY')
744
- current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
745
- current_supadata_key = os.environ.get('SUPADATA_API_KEY')
746
- current_apify_token = os.environ.get('APIFY_API_TOKEN')
747
-
748
- logger.debug(f"Reading API keys from environment variables within handler...")
749
- logger.debug(f"Keys read: OpenRouter={'Yes' if current_openrouter_key else 'No'}, "
750
- f"URLToText={'Yes' if current_urltotext_key else 'No'}, "
751
- f"Supadata={'Yes' if current_supadata_key else 'No'}, "
752
- f"Apify={'Yes' if current_apify_token else 'No'}")
753
-
754
-
755
  if not current_openrouter_key:
756
- logger.error("OpenRouter API key is missing in environment variables. Cannot generate summary.")
757
- try:
758
- await query.edit_message_text(text="⚠️ Error: The AI summarization service is not configured correctly (missing API key). Please contact the bot admin.")
759
- except Exception as edit_err:
760
- logger.error(f"Failed to edit message about missing OpenRouter key: {edit_err}")
761
  return
762
-
763
- # --- 4. Edit message to show "Processing..." ---
764
- processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n{url}\n\nThis might take a moment..."
765
- message_to_delete_later_id = None
766
- try:
767
- # Edit the original message where the buttons were
768
- await query.edit_message_text(text=processing_message_text)
769
- logger.debug(f"Edited message for query {query.id} to show processing status.")
770
  except Exception as e:
771
- # If editing fails (e.g., message too old, deleted), send a new message
772
- logger.warning(f"Could not edit original message {query.message.message_id if query.message else 'N/A'} (query {query.id}): {e}. Sending a new status message.")
773
- try:
774
- status_message = await context.bot.send_message(chat_id=user.id, text=processing_message_text)
775
- message_to_delete_later_id = status_message.message_id
776
- logger.debug(f"Sent new status message {message_to_delete_later_id}")
777
- except Exception as send_err:
778
- logger.error(f"Failed even to send a new status message: {send_err}")
779
- # Can't easily inform the user now, just log.
780
-
781
- # --- 5. Fetch Content (YouTube or Website) ---
782
- content = None
783
- user_feedback_message = None # Store potential error messages for the user
784
- success = False # Track overall success
785
-
786
  try:
787
- # Send "typing..." action to indicate activity
788
- logger.debug(f"Sending 'typing' action for chat {user.id}")
789
- await context.bot.send_chat_action(chat_id=user.id, action='typing')
790
-
791
- is_yt = is_youtube_url(url)
792
- logger.debug(f"URL determined to be YouTube: {is_yt}")
793
-
794
  if is_yt:
795
  video_id = extract_youtube_id(url)
796
  if video_id:
797
- logger.info(f"Fetching YouTube transcript for video_id: {video_id}")
798
- content = await get_youtube_transcript(video_id, url, current_supadata_key, current_apify_token)
799
- if not content:
800
- # Provide a more informative error if all YT methods fail
801
- user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
802
- logger.info(f"YouTube transcript fetch completed. Content found: {bool(content)}")
803
- else:
804
- logger.warning(f"Failed to extract video ID from supposedly YouTube URL: {url}")
805
- user_feedback_message = "Sorry, I couldn't properly identify the YouTube video ID from the link."
806
  else:
807
- # Try fetching website content
808
- logger.info(f"Attempting website scrape (Requests/BS4) for: {url}")
809
- content = await get_website_content_via_requests(url)
810
-
811
- if content:
812
- logger.info("Website scrape (Requests/BS4) successful.")
813
- user_feedback_message = None # Clear any previous potential message
814
  else:
815
- logger.warning(f"Primary website scrape (Requests/BS4) failed for {url}. Trying fallback API (URLToText)...")
816
  if current_urltotext_key:
817
- # Send typing action again before potentially long API call
818
- await context.bot.send_chat_action(chat_id=user.id, action='typing')
819
- content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
820
- if content:
821
- logger.info("Website scrape fallback (URLToText API) successful.")
822
- user_feedback_message = None
823
- else:
824
- logger.warning(f"Fallback website scrape (URLToText API) also failed for {url}.")
825
- user_feedback_message = "Sorry, I couldn't fetch the content from that website using either the standard method or the fallback API."
826
- else:
827
- logger.warning("URLToText API key is not configured. Cannot use fallback.")
828
- user_feedback_message = "Sorry, I couldn't fetch the content from that website (primary method failed, and fallback is not configured)."
829
-
830
- # --- 6. Generate Summary if Content Exists ---
831
  if content:
832
- logger.info("Content retrieved successfully. Proceeding to generate summary.")
833
- # Send typing action again before AI call
834
- await context.bot.send_chat_action(chat_id=user.id, action='typing')
835
-
836
  summary = await generate_summary(content, summary_type, current_openrouter_key)
837
-
838
- # Check if the summary generation itself returned an error message
839
- if summary.startswith("Error:") or summary.startswith("Sorry,"):
840
- user_feedback_message = summary # Use the error from generate_summary
841
- logger.warning(f"Summary generation failed or returned an error message: {summary}")
842
- else:
843
- # Success! Send the summary.
844
- logger.info("Summary generated successfully. Sending to user.")
845
- await context.bot.send_message(
846
- chat_id=user.id,
847
- text=summary,
848
- parse_mode=ParseMode.MARKDOWN, # Assume summary might contain markdown
849
- # disable_web_page_preview=True
850
- link_preview_options={'is_disabled': True}
851
- )
852
- success = True # Mark as successful
853
- user_feedback_message = None # Clear any potential previous error
854
- elif not user_feedback_message:
855
- # If content is None and no specific error message was set, provide a generic one
856
- logger.error(f"Content fetching resulted in None, but no specific user feedback message was set for URL: {url}")
857
- user_feedback_message = "Sorry, I was unable to retrieve any content from the provided link."
858
-
859
- # --- 7. Send Final Feedback (if error occurred) ---
860
- if user_feedback_message and not success:
861
- logger.warning(f"Sending failure feedback message to user {user.id}: {user_feedback_message}")
862
- await context.bot.send_message(
863
- chat_id=user.id,
864
- text=user_feedback_message
865
- )
866
-
867
- except Exception as e:
868
- # Catch any unexpected errors during the whole process
869
- logger.error(f"An unexpected error occurred in handle_summary_type_callback for user {user.id}, URL {url}: {e}", exc_info=True)
870
- try:
871
- # Try to inform the user about the internal error
872
- await context.bot.send_message(chat_id=user.id, text=" माफी माग्छु ! An unexpected internal error occurred while processing your request. The developers have been notified.") # माफी माग्छु = Oops!
873
- except Exception as final_err:
874
- logger.error(f"Failed to even send the final error message to user {user.id}: {final_err}")
875
-
876
  finally:
877
- # --- 8. Clean up the "Processing..." message ---
878
- # This runs whether success or failure, unless an error prevented it
879
- logger.debug("Callback handler finished. Cleaning up status message (if possible)...")
880
  try:
881
- if message_to_delete_later_id:
882
- # If we sent a separate status message, delete it
883
- await context.bot.delete_message(chat_id=user.id, message_id=message_to_delete_later_id)
884
- logger.debug(f"Deleted separate status message {message_to_delete_later_id}.")
885
- elif query.message:
886
- # If we successfully edited the original message, delete it now that we're done
887
- # (or you could edit it to "Summary complete!" or the error message)
888
- # Deleting is cleaner usually.
889
- await query.delete_message()
890
- logger.debug(f"Deleted original message (via query {query.id}, message_id {query.message.message_id}).")
891
- except Exception as del_err:
892
- # Log if deletion fails, but don't crash
893
- logger.warning(f"Could not delete status/button message: {del_err}")
894
-
895
 
896
  async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
897
  """Log Errors caused by Updates."""
898
  logger.error(f"Exception while handling an update: {context.error}", exc_info=context.error)
899
 
900
- # Optional: Inform user about generic errors (be cautious with this)
901
- # try:
902
- # if isinstance(update, Update) and update.effective_chat:
903
- # await context.bot.send_message(
904
- # chat_id=update.effective_chat.id,
905
- # text="Sorry, something went wrong processing your request."
906
- # )
907
- # except Exception as e:
908
- # logger.error(f"Failed to send error message to chat: {e}")
909
-
910
-
911
- # --- Bot Application Setup Function ---
912
- # Make setup_bot an async function as Application.initialize() is async
913
- async def setup_bot():
914
- """Initializes and configures the Telegram Bot Application."""
915
- logger.info("Setting up Telegram Application...")
916
  if not TELEGRAM_TOKEN:
917
- logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable not found. Bot cannot start.")
918
- return None
919
-
920
- # Use Application.builder() for setup
921
  application = Application.builder().token(TELEGRAM_TOKEN).build()
922
-
923
- # --- Register Handlers ---
924
- # Command Handlers
925
  application.add_handler(CommandHandler("start", start))
926
  application.add_handler(CommandHandler("help", help_command))
927
-
928
- # Message Handler for URLs (ensure it doesn't catch commands)
929
- # Using filters.TEXT & ~filters.COMMAND & filters.Entity("url") | filters.Entity("text_link") might be more precise
930
- # But a simple text check + regex inside the handler is often robust enough.
931
  application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
932
-
933
- # Callback Query Handler for button presses
934
  application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
935
-
936
- # Error Handler (registers the function defined above)
937
  application.add_error_handler(error_handler)
938
-
939
- logger.info("Telegram handlers registered.")
940
-
941
- # Initialize the application (fetches bot info, etc.)
942
- # This is now done automatically when application runs, but explicit call ensures it happens early
943
- try:
944
- logger.info("Running application.initialize()...")
945
- await application.initialize()
946
- logger.info("Finished application.initialize(). Bot details: %s", application.bot.username)
947
- except Exception as e:
948
- logger.critical(f"Failed to initialize Telegram application: {e}", exc_info=True)
949
- return None # Indicate failure
950
-
951
  return application
952
 
953
- # --- Global Application Instance ---
954
- # Run setup_bot once at startup
955
- logger.info("Running bot setup at startup...")
956
- # Use asyncio.get_event_loop().run_until_complete() if running outside an existing async context
957
- # In newer Python versions, asyncio.run() is preferred but might cause issues if nested.
958
- # Since this is top-level before Flask/Gunicorn starts its loop, it should be okay.
959
- ptb_app: Application | None = asyncio.run(setup_bot())
960
- logger.info(f"Bot setup finished. Application instance: {'OK' if ptb_app else 'FAILED'}")
961
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
962
 
963
- # --- Flask App Setup (for Webhook) ---
964
- # Import the WSGI-to-ASGI bridge
965
- from asgiref.wsgi import WsgiToAsgi
966
-
967
- logger.info("Flask app setup initiating...")
968
 
969
- # Create the core Flask app instance (give it a distinct name)
970
- flask_app = Flask(__name__)
971
- logger.info("Core Flask app instance created.")
972
 
973
- # --- Define Flask Routes on flask_app ---
974
- @flask_app.route('/')
975
  def index():
976
  """Basic health check endpoint."""
977
  logger.debug("Health check '/' accessed.")
978
- bot_status = "Initialized" if ptb_app and ptb_app.bot else "Initialization FAILED"
979
- return f"Telegram Bot Webhook Listener ({bot_status}) is running."
 
 
980
 
981
- @flask_app.route('/webhook', methods=['POST'])
982
  async def webhook() -> Response:
983
  """Webhook endpoint for Telegram updates."""
984
- if not ptb_app:
985
- logger.error("Webhook triggered, but Telegram Application (ptb_app) is not initialized.")
986
- return Response('Bot not configured correctly.', status=500)
 
987
 
988
  logger.debug("Webhook request received (POST)...")
989
  if request.is_json:
@@ -991,35 +628,31 @@ async def webhook() -> Response:
991
  update_data = request.get_json()
992
  update = Update.de_json(update_data, ptb_app.bot)
993
  logger.debug(f"Processing update ID: {update.update_id} via webhook")
994
- await ptb_app.process_update(update)
995
  logger.debug(f"Finished processing update ID: {update.update_id}")
996
- return Response('ok', status=200) # Acknowledge receipt to Telegram
997
-
998
- except json.JSONDecodeError:
999
- logger.error("Failed to decode JSON from Telegram webhook request.")
1000
- return Response('Bad Request: Invalid JSON', status=400)
1001
- except Exception as e:
1002
- logger.error(f"Error processing update in webhook: {e}", exc_info=True)
1003
- return Response('Internal Server Error processing update.', status=500)
1004
- else:
1005
- logger.warning("Received non-JSON request to webhook endpoint.")
1006
- return Response('Bad Request: Expected JSON', status=400)
1007
-
1008
- # --- Wrap the Flask WSGI app into an ASGI app ---
1009
- # Gunicorn/Uvicorn will look for this 'app' object by default (main:app)
1010
- app = WsgiToAsgi(flask_app)
1011
- logger.info("Flask WSGI app wrapped with WsgiToAsgi for ASGI compatibility.")
1012
 
1013
 
1014
  # --- Main Execution Block (for local testing ONLY) ---
1015
  if __name__ == '__main__':
1016
- # This block remains the same, but note it runs the core flask_app
1017
- # using Flask's development server, NOT the wrapped ASGI app.
1018
- logger.warning("Running Flask development server directly (for local testing only).")
1019
- if not ptb_app:
1020
- logger.critical("Aborting local Flask start: Telegram App (ptb_app) failed initialization.")
1021
  else:
1022
  local_port = int(os.environ.get('PORT', 8080))
1023
  logger.info(f"Flask dev server starting on http://0.0.0.0:{local_port}")
1024
- # Run the original Flask app for local dev server
1025
- flask_app.run(host='0.0.0.0', port=local_port, debug=True)
 
1
+ # main.py (Revised: Starlette Lifespan for PTB Initialization)
2
  import os
3
  import re
4
  import logging
5
  import asyncio
6
  import json
7
+ import html # For unescaping HTML entities
8
+ import contextlib # For async context manager (lifespan)
9
 
10
+ # --- Frameworks ---
11
+ from flask import Flask, request, Response # Core web routes
12
+ from starlette.applications import Starlette # ASGI App & Lifespan
13
+ from starlette.routing import Mount # Mount Flask within Starlette
14
+ from starlette.middleware.wsgi import WSGIMiddleware # Wrap Flask for Starlette
15
+
16
+ # --- Telegram Bot ---
17
  from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
18
  from telegram.ext import (
19
  Application,
 
22
  filters,
23
  ContextTypes,
24
  CallbackQueryHandler,
 
25
  )
26
+ from telegram.constants import ParseMode
27
+ from telegram.error import NetworkError # For specific error handling if needed
28
 
29
+ # --- Other Libraries ---
30
  from youtube_transcript_api import YouTubeTranscriptApi
31
  import requests
32
  from bs4 import BeautifulSoup
 
33
  _apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
34
  if _apify_token_exists:
35
  from apify_client import ApifyClient
36
  else:
37
+ ApifyClient = None
38
 
39
  # --- Logging Setup ---
40
  logging.basicConfig(
41
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
42
+ level=logging.DEBUG
43
  )
 
44
  logging.getLogger("httpx").setLevel(logging.WARNING)
45
  if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
46
+ logging.getLogger("telegram.ext").setLevel(logging.INFO)
47
+ logging.getLogger('telegram.bot').setLevel(logging.INFO)
48
  logging.getLogger("urllib3").setLevel(logging.INFO)
49
+ logging.getLogger('gunicorn.error').setLevel(logging.INFO)
50
+ logging.getLogger('uvicorn').setLevel(logging.INFO) # Uvicorn logs (incl. access)
51
+ logging.getLogger('starlette').setLevel(logging.INFO) # Starlette logs
52
  logger = logging.getLogger(__name__)
53
+ logger.info("Logging configured.")
54
+
55
+ # --- Global variable for PTB app (initialized during lifespan) ---
56
+ ptb_app: Application | None = None
57
 
58
  # --- Environment Variable Loading ---
59
+ logger.info("Attempting to load secrets...")
60
  def get_secret(secret_name):
61
  logger.debug(f"Attempting to read secret: {secret_name}")
62
  value = os.environ.get(secret_name)
 
77
  def is_youtube_url(url):
78
  """Checks if the URL is a valid YouTube video or shorts URL."""
79
  youtube_regex = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
 
80
  match = re.search(youtube_regex, url)
81
  logger.debug(f"is_youtube_url check for '{url}': {'Match found' if match else 'No match'}")
82
  return bool(match)
83
 
84
  def extract_youtube_id(url):
85
  """Extracts the YouTube video ID from a URL."""
 
86
  youtube_id_regex = r'(?:youtube\.com/(?:watch\?v=|shorts/)|youtu\.be/)([\w-]{11})(?:\?|&|\s|$)'
87
  match = re.search(youtube_id_regex, url)
88
  if match:
 
103
  params = {"videoId": video_id, "format": "text"}
104
  headers = {"X-API-Key": api_key}
105
  try:
 
106
  logger.warning("[Supadata] Making request with verify=False (Attempting to bypass SSL verification)")
107
  response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30, verify=False)
 
 
108
  logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
109
  if response.status_code == 200:
110
  try:
111
  data = response.json()
 
112
  content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
113
  if content and isinstance(content, str):
114
  logger.info(f"[Supadata] Successfully fetched transcript for {video_id}. Length: {len(content)}")
 
117
  logger.warning(f"[Supadata] API success but content empty/invalid for {video_id}. Response: {data}")
118
  return None
119
  except json.JSONDecodeError:
 
120
  if response.text:
121
  logger.info(f"[Supadata] Successfully fetched transcript (plain text) for {video_id}. Length: {len(response.text)}")
122
  return response.text.strip()
 
140
  return None
141
  except requests.exceptions.RequestException as e:
142
  logger.error(f"[Supadata] Request error connecting to API for {video_id}: {e}")
 
143
  if isinstance(e, requests.exceptions.SSLError):
144
  logger.error(f"[Supadata] SSL Error occurred despite using verify=False. Details: {e}")
145
  return None
 
147
  logger.error(f"[Supadata] Unexpected error during API call for {video_id}: {e}", exc_info=True)
148
  return None
149
 
150
+ # Apify Transcript Fetching (with fixed fallback parsing)
151
  async def get_transcript_via_apify(video_url: str, api_token: str):
152
  """Fetches YouTube transcript via Apify API."""
153
  if not video_url: logger.error("[Apify] get_transcript_via_apify called with no video_url"); return None
 
183
  # --- !!! FIXED FALLBACK LOGIC !!! ---
184
  if not content and item.get("captions"):
185
  captions_data = item["captions"]
 
186
  if isinstance(captions_data, str):
187
  logger.info("[Apify] Processing 'captions' string format as fallback.")
 
188
  content = captions_data.strip()
 
189
  if len(content) < 50 and "error" in content.lower():
190
  logger.warning(f"[Apify] 'captions' string looks like an error: {content}")
191
+ content = None
192
  elif isinstance(captions_data, list):
193
  logger.info("[Apify] Processing 'captions' list format as fallback.")
 
194
  texts = [cap.get("text", "") for cap in captions_data if isinstance(cap, dict) and cap.get("text")]
195
  content = " ".join(texts).strip()
196
  else:
197
  logger.warning(f"[Apify] 'captions' field found but is neither string nor list: {type(captions_data)}")
198
+ content = None
199
 
 
200
  if content:
201
  try:
202
+ content = html.unescape(content) # Use imported html module
 
 
 
203
  except Exception as unescape_err:
204
  logger.warning(f"[Apify] Error during html unescaping: {unescape_err}")
 
 
205
  # --- !!! END FIXED FALLBACK LOGIC !!! ---
206
 
207
  if content and isinstance(content, str):
 
214
  elif not item.get("captions"):
215
  logger.warning(f"[Apify] Actor run successful ({response.status_code}) but no text/transcript/captions_concatenated/captions field found for {video_url}. Item: {item}")
216
  else:
 
217
  logger.warning(f"[Apify] Actor run successful ({response.status_code}), 'captions' field found but fallback parsing failed to extract content for {video_url}.")
218
+ return None
219
  else:
220
  logger.warning(f"[Apify] Actor run successful ({response.status_code}) but dataset result list empty for {video_url}. Response: {results}")
221
  return None
 
225
  except Exception as e:
226
  logger.error(f"[Apify] Error processing successful response ({response.status_code}) for {video_url}: {e}", exc_info=True)
227
  return None
 
228
  elif response.status_code == 400:
229
  logger.error(f"[Apify] Bad Request (400) for {video_url}. Check payload. Response: {response.text[:200]}...")
230
  return None
 
251
  if not video_id: logger.error("get_youtube_transcript called with no video_id"); return None
252
  logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
253
  transcript_text = None
254
+ try: # Wrap primary method in try/except
255
+ logger.info("[Primary YT] Attempting youtube-transcript-api...")
 
 
 
256
  transcript_list = await asyncio.to_thread(
257
  YouTubeTranscriptApi.get_transcript,
258
  video_id,
259
+ languages=['en', 'en-GB', 'en-US']
260
  )
261
  if transcript_list:
262
  transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
 
263
  transcript_text = re.sub(r'\s+', ' ', transcript_text).strip()
264
  if transcript_text:
265
  logger.info(f"[Primary YT] Successfully fetched transcript via library for {video_id} (length: {len(transcript_text)})")
266
  return transcript_text
267
  else:
268
  logger.warning(f"[Primary YT] Joined transcript text is empty after cleaning for {video_id}")
269
+ transcript_text = None
270
  else:
271
  logger.warning(f"[Primary YT] Transcript list was empty for {video_id}")
272
  transcript_text = None
273
  except Exception as e:
 
274
  logger.warning(f"[Primary YT] Error getting transcript via library for {video_id}: {type(e).__name__} - {e}")
275
+ if "YouTube is blocking requests" in str(e) or "HTTP Error 429" in str(e): logger.warning("[Primary YT] IP likely blocked by YouTube (Rate Limit / Cloud IP).")
276
+ elif "No transcript found" in str(e): logger.warning(f"[Primary YT] No transcript available in specified languages for {video_id}.")
277
+ elif "TranscriptsDisabled" in str(e) or "disabled" in str(e): logger.warning(f"[Primary YT] Transcripts are disabled for {video_id}.")
 
 
 
 
 
278
  transcript_text = None
279
 
280
+ if transcript_text is None: # Fallback 1: Supadata
 
281
  logger.info("[Fallback YT 1] Primary method failed or yielded no text. Trying Supadata API...")
282
  if supadata_key:
283
  transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
284
  if transcript_text:
285
  logger.info(f"[Fallback YT 1] Successfully fetched transcript via Supadata for {video_id} (length: {len(transcript_text)})")
286
+ return transcript_text
287
+ else: logger.warning(f"[Fallback YT 1] Supadata API failed or returned no content for {video_id}.")
288
+ else: logger.warning("[Fallback YT 1] Supadata API key not available. Skipping.")
 
 
 
289
 
290
+ if transcript_text is None: # Fallback 2: Apify
 
291
  logger.info("[Fallback YT 2] Primary & Supadata failed or yielded no text. Trying Apify API...")
292
  if apify_token:
293
  transcript_text = await get_transcript_via_apify(video_url, apify_token)
294
  if transcript_text:
295
  logger.info(f"[Fallback YT 2] Successfully fetched transcript via Apify for {video_url} (length: {len(transcript_text)})")
296
+ return transcript_text
297
+ else: logger.warning(f"[Fallback YT 2] Apify API failed or returned no content for {video_url}.")
298
+ else: logger.warning("[Fallback YT 2] Apify API token not available. Skipping.")
 
 
 
299
 
 
300
  if transcript_text is None:
301
  logger.error(f"All methods failed to fetch transcript for video ID: {video_id}")
302
  return None
303
+ return transcript_text # Should technically be unreachable if logic is correct
304
 
305
+ # Website Content via Requests/BS4
 
 
 
306
async def get_website_content_via_requests(url):
    """Scrape a web page with requests + BeautifulSoup and return its main text.

    Returns the extracted text, a stripped plain-text body for text/plain
    responses, or None on any failure (bad URL, network error, no content).
    """
    if not url:
        logger.error("[Web Scraper - Requests/BS4] called with no URL")
        return None
    logger.info(f"[Web Scraper - Requests/BS4] Fetching website content for: {url}")
    try:
        # Browser-like headers reduce the chance of being served a bot page.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
            'DNT': '1',
            'Upgrade-Insecure-Requests': '1',
        }
        logger.debug(f"[Web Scraper - Requests/BS4] Sending GET request to {url}")
        # requests is blocking; run it in a worker thread to keep the event loop free.
        response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
        response.raise_for_status()
        logger.debug(f"[Web Scraper - Requests/BS4] Received response {response.status_code} from {url}")

        content_type = response.headers.get('content-type', '').lower()
        if 'html' not in content_type:
            logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received: {content_type}. Trying plain text.")
            if 'text/plain' in content_type and response.text:
                return response.text.strip()
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        # Strip boilerplate / non-content elements before extracting text.
        noise_tags = ["script", "style", "header", "footer", "nav", "aside", "form", "button",
                      "input", "textarea", "select", "option", "label", "iframe", "img", "svg",
                      "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]
        for element in soup(noise_tags):
            element.extract()

        # Prefer an explicit main-content container; fall back to <body>.
        main_content = (soup.find('main') or soup.find('article') or soup.find(id='content')
                        or soup.find(class_='content') or soup.find(id='main-content')
                        or soup.find(class_='main-content') or soup.find(role='main'))
        target_element = main_content if main_content else soup.body
        if not target_element:
            logger.warning(f"[Web Scraper - Requests/BS4] Could not find body or main content for {url}")
            return None

        lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
        text = "\n".join(lines)

        # Very short extractions are suspicious but still returned; the caller decides.
        MIN_TEXT_LENGTH = 50
        if not text or len(text) < MIN_TEXT_LENGTH:
            logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is very short (<{MIN_TEXT_LENGTH} chars) for {url} (Length: {len(text)})")
        logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped content for {url} (final length: {len(text)})")
        return text
    except requests.exceptions.Timeout:
        logger.error(f"[Web Scraper - Requests/BS4] Timeout error scraping {url}")
        return None
    except requests.exceptions.TooManyRedirects:
        logger.error(f"[Web Scraper - Requests/BS4] Too many redirects for {url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Web Scraper - Requests/BS4] Request error scraping {url}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Web Scraper - Requests/BS4] Error scraping or parsing {url}: {e}", exc_info=True)
        return None
339
 
340
+ # Website Content via URLToText API
 
 
 
 
 
 
 
 
 
 
341
async def get_website_content_via_urltotext_api(url: str, api_key: str):
    """Fetch the main text of a web page through the URLToText API.

    Returns the stripped content string on success, or None on any failure
    (missing url/key, non-200 status, empty content, network error).
    """
    if not url:
        logger.error("[Web Scraper - URLToText API] called with no URL")
        return None
    if not api_key:
        logger.error("[Web Scraper - URLToText API] API key is missing.")
        return None
    logger.info(f"[Web Scraper - URLToText API] Attempting fetch for: {url}")
    api_endpoint = "https://urltotext.com/api/v1/urltotext/"
    request_body = {
        "url": url,
        "output_format": "text",
        "extract_main_content": True,
        "render_javascript": True,
        "residential_proxy": False,
    }
    payload = json.dumps(request_body)
    headers = {"Authorization": f"Token {api_key}", "Content-Type": "application/json"}
    try:
        logger.debug(f"[Web Scraper - URLToText API] Sending POST request for {url}")
        # Blocking HTTP call runs in a worker thread.
        response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=45)
        logger.debug(f"[Web Scraper - URLToText API] Received status code {response.status_code} for {url}")
        status = response.status_code
        if status == 200:
            try:
                data = response.json()
                payload_data = data.get("data", {})
                content = payload_data.get("content")
                credits = data.get("credits_used", "N/A")
                warning = payload_data.get("warning")
                if warning:
                    logger.warning(f"[Web Scraper - URLToText API] Warning for {url}: {warning}")
                if content:
                    logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API for {url}. Length: {len(content)}. Credits: {credits}")
                    return content.strip()
                logger.warning(f"[Web Scraper - URLToText API] API success (200) but content empty for {url}. Response: {data}")
                return None
            except json.JSONDecodeError:
                logger.error(f"[Web Scraper - URLToText API] Failed JSON decode for {url}. Status: {response.status_code}. Resp: {response.text[:500]}...")
                return None
            except Exception as e:
                logger.error(f"[Web Scraper - URLToText API] Error processing successful API response for {url}: {e}", exc_info=True)
                return None
        # Non-200 statuses: log the specific condition, then fall through to None.
        elif status == 400:
            logger.error(f"[Web Scraper - URLToText API] Bad Request (400) for {url}. Resp: {response.text[:200]}...")
        elif status == 401:
            logger.error(f"[Web Scraper - URLToText API] Unauthorized (401) for {url}. Check Key. Resp: {response.text[:200]}...")
        elif status == 402:
            logger.error(f"[Web Scraper - URLToText API] Payment Required (402) for {url}. Check credits. Resp: {response.text[:200]}...")
        elif status == 422:
            logger.warning(f"[Web Scraper - URLToText API] Unprocessable URL (422) for {url}. Resp: {response.text[:200]}...")
        elif status >= 500:
            logger.error(f"[Web Scraper - URLToText API] Server Error ({response.status_code}) from API for {url}. Resp: {response.text[:200]}...")
        else:
            logger.error(f"[Web Scraper - URLToText API] Unexpected status {response.status_code} from API for {url}. Resp: {response.text[:200]}...")
        return None
    except requests.exceptions.Timeout:
        logger.error(f"[Web Scraper - URLToText API] Timeout error connecting for {url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Web Scraper - URLToText API] Request error connecting for {url}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call for {url}: {e}", exc_info=True)
        return None
378
 
379
+ # DeepSeek Summary Function
380
async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
    """Generates summary using DeepSeek via OpenRouter API.

    Args:
        text: Content to summarize; truncated if it exceeds MAX_INPUT_LENGTH.
        summary_type: Either "paragraph" or "points".
        api_key: OpenRouter API key.

    Returns:
        The summary text on success, otherwise a user-facing string starting
        with "Error:" or "Sorry," describing the failure.
    """
    logger.info(f"Generating '{summary_type}' summary. Input length: {len(text)}")
    if not api_key:
        logger.error("OpenRouter API key missing.")
        return "Error: AI config key missing."
    if not text:
        logger.warning("generate_summary called with empty text.")
        return "Error: No content to summarize."

    openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
    model_name = "deepseek/deepseek-chat:free"
    if summary_type == "paragraph":
        prompt = "Please provide a concise, well-written paragraph summarizing the key information and main points of the following text. Focus on capturing the essence of the content accurately."
    elif summary_type == "points":
        prompt = "Please summarize the following text into clear, distinct bullet points. Each point should highlight a key piece of information, finding, or main topic discussed. Aim for clarity and conciseness."
    else:
        logger.error(f"Invalid summary_type '{summary_type}'.")
        return f"Error: Invalid summary type ('{summary_type}')."

    # Cap the request size before building the prompt.
    MAX_INPUT_LENGTH = 500000
    if len(text) > MAX_INPUT_LENGTH:
        logger.warning(f"Input text ({len(text)}) > limit ({MAX_INPUT_LENGTH}). Truncating.")
        text = text[:MAX_INPUT_LENGTH] + "... (Truncated)"
    full_prompt = f"{prompt}\n\n--- Start of Text ---\n\n{text}\n\n--- End of Text ---"

    # OpenRouter uses the HTTP-Referer header to identify the calling app.
    space_host = os.environ.get("SPACE_HOST", "huggingface.co/spaces/YOUR_SPACE_NAME")  # Default if env var not set
    referer_url = f"https://{space_host}" if not space_host.startswith("http") else space_host
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json", "HTTP-Referer": referer_url, "X-Title": "Telegram URL Summarizer Bot"}
    payload = json.dumps({"model": model_name, "messages": [{"role": "user", "content": full_prompt}]})

    try:
        logger.debug(f"Sending request to OpenRouter (Model: {model_name})...")
        # Blocking HTTP call runs in a worker thread to keep the event loop free.
        response = await asyncio.to_thread(requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=90)
        logger.debug(f"Received status code {response.status_code} from OpenRouter.")
        if response.status_code == 200:
            try:
                data = response.json()
                if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0:
                    message = data["choices"][0].get("message")
                    if message and message.get("content"):
                        summary = message["content"].strip()
                        if summary:
                            logger.info(f"Success generating summary. Len: {len(summary)}")
                            return summary
                        logger.warning(f"OpenRouter success but empty content. Resp: {data}")
                        return "Sorry, AI model returned empty summary."
                    logger.warning(f"OpenRouter success but missing content field. Resp: {data}")
                    return "Sorry, could not parse AI response (content)."
                elif data.get("error"):
                    logger.error(f"OpenRouter API Error: {data['error']}")
                    return f"Sorry, AI service error: {data['error'].get('message', 'Unknown')}"
                else:
                    logger.error(f"Unexpected OpenRouter choices structure. Resp: {data}")
                    return "Sorry, could not parse AI response (choices)."
            except json.JSONDecodeError:
                logger.error(f"Failed JSON decode from OpenRouter. Status: {response.status_code}. Resp: {response.text[:500]}...")
                return "Sorry, failed to understand AI response format."
            except Exception as e:
                logger.error(f"Error processing OpenRouter success resp: {e}", exc_info=True)
                return "Sorry, error processing AI response."
        elif response.status_code == 401:
            logger.error("OpenRouter API key invalid (401).")
            return "Error: AI model config key invalid."
        elif response.status_code == 402:
            logger.error("OpenRouter Payment Required (402).")
            return "Sorry, issue with AI service limits/payment."
        elif response.status_code == 429:
            logger.warning("OpenRouter Rate Limit (429).")
            return "Sorry, AI model busy. Try again."
        elif response.status_code >= 500:
            logger.error(f"OpenRouter Internal Error ({response.status_code}). Resp: {response.text[:500]}...")
            return "Sorry, AI model service error. Try again later."
        else:
            logger.error(f"Unexpected status {response.status_code} from OpenRouter. Resp: {response.text[:500]}...")
            try:
                error_data = response.json()
                error_msg = error_data.get("error", {}).get("message", response.text[:100])
                return f"Sorry, AI service error ({response.status_code}): {error_msg}"
            except Exception:
                # Fixed: was a bare `except:` which would also swallow
                # SystemExit/KeyboardInterrupt; only catch real errors here.
                return f"Sorry, AI service returned status {response.status_code}."
    except requests.exceptions.Timeout:
        logger.error("Timeout connecting to OpenRouter.")
        return "Sorry, request to AI model timed out."
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error connecting to OpenRouter: {e}")
        return "Sorry, error connecting to AI model service."
    except Exception as e:
        logger.error(f"Unexpected error in generate_summary: {e}", exc_info=True)
        return "Sorry, unexpected error generating summary."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
 
428
 
429
  # --- Telegram Bot Handlers ---
 
431
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Greet the user in response to the /start command."""
    user = update.effective_user
    if not user:
        return
    logger.info(f"User {user.id} ({user.username or 'NoUsername'}) triggered /start.")
    # Mention by username when available; otherwise fall back to the first name.
    if user.username:
        mention = user.mention_html()
    else:
        mention = user.first_name
    await update.message.reply_html(f"👋 Hello {mention}! Send a YouTube or website URL to summarize.")
 
 
 
438
 
439
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Send usage instructions in response to the /help command."""
    user = update.effective_user
    logger.info(f"User {user.id if user else 'Unknown'} triggered /help.")
    help_text = (
        "**How I Work:**\n\n"
        "1. Send URL.\n2. Choose format (Paragraph/Points).\n3. Get summary!\n\n"
        "**Troubleshooting:**\n- YT transcripts might fail.\n- Complex websites hard to scrape.\n- AI errors possible.\n\nSend link to start!"
    )
    await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
447
 
448
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Detect a URL in an incoming text message and offer summary-type buttons."""
    if not update.message or not update.message.text:
        return
    message_text = update.message.text.strip()
    user = update.effective_user
    if not user:
        return
    match = re.search(r'https?://[^\s/$.?#].[^\s]*', message_text)
    if not match:
        logger.debug(f"Ignoring non-URL message from {user.id}: {message_text[:100]}")
        return
    url = match.group(0)
    logger.info(f"User {user.id} sent potential URL: {url}")
    # Remember the URL so the button callback can retrieve it later.
    context.user_data['url_to_summarize'] = url
    logger.debug(f"Stored URL '{url}' for user {user.id}")
    buttons = [[
        InlineKeyboardButton("Paragraph", callback_data="paragraph"),
        InlineKeyboardButton("Points", callback_data="points"),
    ]]
    await update.message.reply_text(
        f"Link detected:\n{url}\n\nChoose summary type:",
        reply_markup=InlineKeyboardMarkup(buttons),
        link_preview_options={'is_disabled': True},
    )
 
 
 
 
 
 
 
 
 
 
 
 
461
 
462
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handles button presses for choosing the summary type.

    Retrieves the URL stored by handle_potential_url from user_data, fetches
    content (YouTube transcript or scraped web page, with fallbacks), generates
    the summary, and sends either the summary or a failure message to the user.
    The status/button message is deleted in the finally block.
    """
    query = update.callback_query; user = query.from_user
    # Acknowledge the callback; failures here are only logged, not fatal.
    try: await query.answer(); logger.debug(f"Answered callback query {query.id}")
    except Exception as e: logger.error(f"Failed to answer callback query {query.id}: {e}")
    summary_type = query.data; url = context.user_data.get('url_to_summarize')
    logger.info(f"User {user.id} chose '{summary_type}'. URL in context: '{url}'.")
    # No stored URL (e.g. bot restarted since the button was shown) -> ask again.
    if not url:
        logger.warning(f"User {user.id} pressed button, NO URL in context.");
        try: await query.edit_message_text(text="Context lost. Send link again.")
        except Exception as edit_err: logger.error(f"Failed edit on lost context: {edit_err}")
        return
    # Consume the stored URL so a second button press cannot reuse it.
    context.user_data.pop('url_to_summarize', None); logger.debug(f"Cleared URL {url} for user {user.id}")
    # Read API keys fresh from the environment on every request.
    current_openrouter_key = os.environ.get('OPENROUTER_API_KEY'); current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
    current_supadata_key = os.environ.get('SUPADATA_API_KEY'); current_apify_token = os.environ.get('APIFY_API_TOKEN')
    logger.debug(f"Keys read: OR={'Y' if current_openrouter_key else 'N'}, UTT={'Y' if current_urltotext_key else 'N'}, SD={'Y' if current_supadata_key else 'N'}, AP={'Y' if current_apify_token else 'N'}")
    # The OpenRouter key is mandatory — without it no summary can be produced.
    if not current_openrouter_key:
        logger.error("OpenRouter key missing.");
        try: await query.edit_message_text(text="⚠️ AI service config error (key missing). Contact admin.")
        except Exception as edit_err: logger.error(f"Failed edit on missing OR key: {edit_err}")
        return
    # Show a "working..." status; if editing fails, send a new message and
    # remember its id so it can be deleted later.
    processing_message_text = f"Working on '{summary_type}' summary for:\n{url}\n..."; message_to_delete_later_id = None
    try: await query.edit_message_text(text=processing_message_text); logger.debug(f"Edited message query {query.id}")
    except Exception as e:
        logger.warning(f"Could not edit message {query.message.message_id if query.message else 'N/A'}: {e}. Sending new.");
        try: status_message = await context.bot.send_message(chat_id=user.id, text=processing_message_text); message_to_delete_later_id = status_message.message_id; logger.debug(f"Sent new status message {message_to_delete_later_id}")
        except Exception as send_err: logger.error(f"Failed sending new status message: {send_err}")
    content = None; user_feedback_message = None; success = False
    try:
        logger.debug(f"Sending 'typing' action for chat {user.id}"); await context.bot.send_chat_action(chat_id=user.id, action='typing')
        is_yt = is_youtube_url(url); logger.debug(f"URL is YouTube: {is_yt}")
        if is_yt:
            # YouTube path: extract the video id, then fetch the transcript
            # (get_youtube_transcript tries primary + Supadata + Apify internally).
            video_id = extract_youtube_id(url)
            if video_id:
                logger.info(f"Fetching YT transcript: {video_id}"); content = await get_youtube_transcript(video_id, url, current_supadata_key, current_apify_token)
                if not content: user_feedback_message = "Sorry, couldn't get YT transcript (unavailable/private/no captions?)."
                logger.info(f"YT transcript fetch done. Found: {bool(content)}")
            else: logger.warning(f"Failed YT ID extraction: {url}"); user_feedback_message = "Sorry, couldn't parse YT video ID."
        else:
            # Website path: try the local requests/BS4 scraper first,
            # then fall back to the URLToText API if a key is configured.
            logger.info(f"Scraping website (Requests/BS4): {url}"); content = await get_website_content_via_requests(url)
            if content: logger.info("Website scrape (Requests/BS4) OK."); user_feedback_message = None
            else:
                logger.warning(f"Website scrape failed for {url}. Trying URLToText API.");
                if current_urltotext_key:
                    await context.bot.send_chat_action(chat_id=user.id, action='typing'); content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
                    if content: logger.info("URLToText API scrape OK."); user_feedback_message = None
                    else: logger.warning(f"URLToText scrape failed for {url}."); user_feedback_message = "Sorry, couldn't fetch web content (both methods)."
                else: logger.warning("URLToText key not configured."); user_feedback_message = "Sorry, couldn't fetch web content (fallback not configured)."
        if content:
            # generate_summary signals failure by returning a string starting
            # with "Error:" or "Sorry," — treat those as user feedback, not output.
            logger.info("Content found, generating summary."); await context.bot.send_chat_action(chat_id=user.id, action='typing')
            summary = await generate_summary(content, summary_type, current_openrouter_key)
            if summary.startswith("Error:") or summary.startswith("Sorry,"): user_feedback_message = summary; logger.warning(f"Summary generation failed: {summary}")
            else: logger.info("Summary generated OK. Sending."); await context.bot.send_message(chat_id=user.id, text=summary, parse_mode=ParseMode.MARKDOWN, link_preview_options={'is_disabled': True}); success = True; user_feedback_message = None
        elif not user_feedback_message: user_feedback_message = "Sorry, couldn't retrieve content from link."
        if user_feedback_message and not success: logger.warning(f"Sending failure feedback: {user_feedback_message}"); await context.bot.send_message(chat_id=user.id, text=user_feedback_message)
    except Exception as e:
        logger.error(f"Unexpected error in callback processing: {e}", exc_info=True);
        try: await context.bot.send_message(chat_id=user.id, text="Oops! Internal error processing request.")
        except Exception as final_err: logger.error(f"Failed sending final error message: {final_err}")
    finally:
        # Always remove the status message: either the separately-sent one
        # (tracked by id) or the original edited button message.
        logger.debug("Cleaning up status message...");
        try:
            if message_to_delete_later_id: await context.bot.delete_message(chat_id=user.id, message_id=message_to_delete_later_id); logger.debug("Deleted separate status msg.")
            elif query.message: await query.delete_message(); logger.debug(f"Deleted original message query {query.id}.")
        except Exception as del_e: logger.warning(f"Could not delete status/button message: {del_e}")
 
 
 
 
 
 
 
 
 
 
 
526
 
527
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Log any exception raised while handling an update."""
    err = context.error
    logger.error(f"Exception while handling an update: {err}", exc_info=err)
530
 
531
+ # --- Bot Setup Function (Configure Only) ---
532
async def setup_bot_config() -> Application:
    """Configures the PTB Application but does NOT initialize or start it.

    Raises:
        ValueError: if the TELEGRAM_TOKEN environment variable is not set.
    """
    logger.info("Configuring Telegram Application...")
    if not TELEGRAM_TOKEN:
        logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable not found.")
        raise ValueError("TELEGRAM_TOKEN environment variable not set.")
    application = Application.builder().token(TELEGRAM_TOKEN).build()
    # Register handlers in order: commands, URL messages, button callbacks.
    handlers = [
        CommandHandler("start", start),
        CommandHandler("help", help_command),
        MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url),
        CallbackQueryHandler(handle_summary_type_callback),
    ]
    for handler in handlers:
        application.add_handler(handler)
    application.add_error_handler(error_handler)
    logger.info("Telegram handlers configured.")
    return application
546
 
547
+ # --- ASGI Lifespan Context Manager ---
548
@contextlib.asynccontextmanager
async def lifespan(app: Starlette): # app argument is the Starlette instance
    """Handles PTB startup and shutdown during ASGI lifespan.

    Startup: configure, initialize and start the PTB Application (stored in the
    module-level `ptb_app` global), then register the Telegram webhook using
    SPACE_HOST. Shutdown (finally): stop and shut down PTB if it is running.
    """
    global ptb_app
    logger.info("ASGI Lifespan: Startup commencing...")
    # NOTE(review): `loop` is not used below — candidate for removal.
    loop = asyncio.get_running_loop()

    try:
        ptb_app = await setup_bot_config()
        logger.info("PTB App configured. Initializing...")
        await ptb_app.initialize()
        logger.info("PTB App initialized. Starting background tasks...")
        await ptb_app.start()
        logger.info(f"PTB App started. Bot details: {ptb_app.bot.username}")

        # Set webhook after start
        WEBHOOK_URL_BASE = os.environ.get("SPACE_HOST")
        if WEBHOOK_URL_BASE:
            # SPACE_HOST may or may not include a scheme; normalize to https.
            if not WEBHOOK_URL_BASE.startswith("https://"): WEBHOOK_URL_BASE = f"https://{WEBHOOK_URL_BASE}"
            webhook_path = "/webhook" # Matches Flask route
            full_webhook_url = f"{WEBHOOK_URL_BASE.rstrip('/')}{webhook_path}"
            logger.info(f"Attempting to set webhook to: {full_webhook_url}")
            try:
                await ptb_app.bot.set_webhook(url=full_webhook_url, allowed_updates=Update.ALL_TYPES)
                webhook_info = await ptb_app.bot.get_webhook_info()
                logger.info(f"Webhook set successfully! Info: {webhook_info}")
            except Exception as e: logger.error(f"Failed to set webhook: {e}", exc_info=True)
        else: logger.warning("SPACE_HOST env variable not found. Skipping webhook setup.")

        logger.info("ASGI Lifespan: Startup complete. Application ready.")
        yield # Application runs here

    except Exception as startup_err:
        logger.critical(f"CRITICAL ERROR during ASGI startup: {startup_err}", exc_info=True)
        # Optionally re-raise or handle to prevent server from potentially running in bad state
        raise
    finally:
        # --- Shutdown ---
        logger.info("ASGI Lifespan: Shutdown commencing...")
        if ptb_app and ptb_app.is_running:
            try:
                logger.info("Stopping PTB App...")
                await ptb_app.stop()
                logger.info("Shutting down PTB App...")
                await ptb_app.shutdown()
                logger.info("PTB App shut down successfully.")
            except Exception as shutdown_err:
                logger.error(f"Error during PTB shutdown: {shutdown_err}", exc_info=True)
        elif ptb_app:
            logger.warning("PTB App instance exists but was not running at shutdown.")
        else:
            logger.warning("No PTB App instance found at shutdown.")
        logger.info("ASGI Lifespan: Shutdown complete.")
601
 
 
 
 
 
 
602
 
603
+ # --- Flask App Setup (for Routes) ---
604
# Flask serves the HTTP routes; it is mounted into Starlette below via WSGIMiddleware.
flask_core_app = Flask(__name__)
logger.info("Core Flask app instance created (for routing via Starlette).")
606
 
607
+ # --- Define Flask Routes on flask_core_app ---
608
@flask_core_app.route('/')
def index():
    """Basic health check endpoint."""
    logger.debug("Health check '/' accessed.")
    # Report the PTB application's current lifecycle state.
    if not ptb_app:
        bot_status = "Not Initialized"
    elif ptb_app.is_running:
        bot_status = "Running"
    else:
        bot_status = "Initialized/Stopped/Starting/Error"
    return f"Telegram Bot Webhook Listener ({bot_status}) running via Starlette."
616
 
617
@flask_core_app.route('/webhook', methods=['POST'])
async def webhook() -> Response:
    """Webhook endpoint for Telegram updates.

    Returns 200 after dispatching the update to PTB, 400 for non-JSON or
    malformed payloads, 500 on processing errors, and 503 when the bot is
    not initialized/running yet.
    """
    if not ptb_app or not ptb_app.is_running:
        status = "Not Initialized" if not ptb_app else "Not Running"
        logger.error(f"Webhook triggered, but PTB Application is {status}.")
        return Response('Bot service not ready.', status=503)

    logger.debug("Webhook request received (POST)...")
    if request.is_json:
        # Fixed: the JSON parse/dispatch block below had lost its `try:` opener,
        # leaving the except clauses dangling.
        try:
            update_data = request.get_json()
            update = Update.de_json(update_data, ptb_app.bot)
            logger.debug(f"Processing update ID: {update.update_id} via webhook")
            await ptb_app.process_update(update)  # Queue/process the update
            logger.debug(f"Finished processing update ID: {update.update_id}")
            return Response('ok', status=200)
        except json.JSONDecodeError:
            logger.error("Failed JSON decode from Telegram.")
            return Response('Bad Request: Invalid JSON', status=400)
        except Exception as e:
            logger.error(f"Error processing update in webhook handler: {e}", exc_info=True)
            return Response('Internal Server Error processing update.', status=500)
    else:
        logger.warning("Received non-JSON request to webhook.")
        return Response('Bad Request: Expected JSON', status=400)
637
+
638
+
639
+ # --- Create Starlette App with Lifespan & Mount Flask ---
640
# Starlette owns the ASGI lifespan (which starts/stops PTB); the Flask WSGI app
# mounted at "/" serves all HTTP routes.
app = Starlette(
    lifespan=lifespan,
    routes=[
        Mount("/", app=WSGIMiddleware(flask_core_app))
    ]
)
logger.info("Starlette application created with lifespan and Flask app mounted at '/'.")
 
 
647
 
648
 
649
  # --- Main Execution Block (for local testing ONLY) ---
650
if __name__ == '__main__':
    # Direct execution only runs Flask's dev server; the ASGI lifespan
    # (and therefore PTB) is NOT started in this mode.
    logger.warning("Running Flask development server directly (LOCAL TESTING ONLY).")
    logger.warning("NOTE: This mode does NOT initialize PTB via ASGI lifespan.")
    logger.warning("Use 'uvicorn main:app --reload --port 8080' for proper local ASGI testing.")
    if TELEGRAM_TOKEN:
        local_port = int(os.environ.get('PORT', 8080))
        logger.info(f"Flask dev server starting on http://0.0.0.0:{local_port}")
        flask_core_app.run(host='0.0.0.0', port=local_port, debug=True)
    else:
        logger.critical("Aborting local Flask start: TELEGRAM_TOKEN missing.")