fmab777 commited on
Commit
073dd22
·
verified ·
1 Parent(s): 1aef3b0

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +682 -183
main.py CHANGED
@@ -1,4 +1,4 @@
1
- # main.py (Revised: Apify 201 fix + Supadata verify=False TEST)
2
  import os
3
  import re
4
  import logging
@@ -14,7 +14,7 @@ from telegram.ext import (
14
  filters,
15
  ContextTypes,
16
  CallbackQueryHandler,
17
- ApplicationBuilder # Import ApplicationBuilder
18
  )
19
  from telegram.constants import ParseMode # Import ParseMode explicitly
20
 
@@ -27,21 +27,23 @@ _apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
27
  if _apify_token_exists:
28
  from apify_client import ApifyClient
29
  else:
30
- ApifyClient = None
31
 
32
  # --- Logging Setup ---
33
  logging.basicConfig(
34
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
35
- level=logging.DEBUG # Keep DEBUG
36
  )
 
37
  logging.getLogger("httpx").setLevel(logging.WARNING)
38
  if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
39
- logging.getLogger("telegram.ext").setLevel(logging.DEBUG)
40
- logging.getLogger('telegram.bot').setLevel(logging.DEBUG)
41
- logging.getLogger("urllib3").setLevel(logging.INFO) # Reduce requests noise slightly
42
- logging.getLogger('gunicorn.error').setLevel(logging.INFO)
 
43
  logger = logging.getLogger(__name__)
44
- logger.info("Logging configured (DEBUG level).")
45
 
46
  # --- Environment Variable Loading ---
47
  logger.info("Attempting to load secrets from environment variables...")
@@ -65,16 +67,23 @@ logger.info("Secret loading attempt finished.")
65
  def is_youtube_url(url):
66
  """Checks if the URL is a valid YouTube video or shorts URL."""
67
  youtube_regex = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
68
- return bool(re.search(youtube_regex, url))
 
 
 
69
 
70
  def extract_youtube_id(url):
71
  """Extracts the YouTube video ID from a URL."""
72
- youtube_id_regex = r'(?:youtube\.com/(?:watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
 
73
  match = re.search(youtube_id_regex, url)
74
  if match:
75
- return match.group(1)
76
- logger.warning(f"Could not extract YouTube ID from URL: {url}")
77
- return None
 
 
 
78
 
79
  # Supadata Transcript Fetching
80
  async def get_transcript_via_supadata(video_id: str, api_key: str):
@@ -86,16 +95,16 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
86
  params = {"videoId": video_id, "format": "text"}
87
  headers = {"X-API-Key": api_key}
88
  try:
89
- # ---!!! INSECURE TEST - DISABLES SSL VERIFICATION !!!---
90
- logger.warning("[Supadata] Making request with verify=False (INSECURE TEST)")
91
  response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30, verify=False)
92
- # ---!!! END INSECURE TEST --- (Remember to remove verify=False later) ---
93
 
94
  logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
95
  if response.status_code == 200:
96
- # (Rest of the success handling code remains the same)
97
  try:
98
  data = response.json()
 
99
  content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
100
  if content and isinstance(content, str):
101
  logger.info(f"[Supadata] Successfully fetched transcript for {video_id}. Length: {len(content)}")
@@ -104,6 +113,7 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
104
  logger.warning(f"[Supadata] API success but content empty/invalid for {video_id}. Response: {data}")
105
  return None
106
  except json.JSONDecodeError:
 
107
  if response.text:
108
  logger.info(f"[Supadata] Successfully fetched transcript (plain text) for {video_id}. Length: {len(response.text)}")
109
  return response.text.strip()
@@ -127,9 +137,9 @@ async def get_transcript_via_supadata(video_id: str, api_key: str):
127
  return None
128
  except requests.exceptions.RequestException as e:
129
  logger.error(f"[Supadata] Request error connecting to API for {video_id}: {e}")
130
- # Log the specific SSLError if verify=False wasn't the only issue
131
  if isinstance(e, requests.exceptions.SSLError):
132
- logger.error(f"[Supadata] SSL Error details: {e}")
133
  return None
134
  except Exception as e:
135
  logger.error(f"[Supadata] Unexpected error during API call for {video_id}: {e}", exc_info=True)
@@ -144,12 +154,13 @@ async def get_transcript_via_apify(video_url: str, api_token: str):
144
 
145
  logger.info(f"[Apify] Attempting fetch for URL: {video_url}")
146
  actor_id = "karamelo~youtube-transcripts"
 
147
  api_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
148
  params = {"token": api_token}
149
  payload = json.dumps({
150
  "urls": [video_url],
151
- "outputFormat": "singleStringText",
152
- "maxRetries": 5,
153
  "channelHandleBoolean": False,
154
  "channelNameBoolean": False,
155
  "datePublishedBoolean": False,
@@ -158,28 +169,36 @@ async def get_transcript_via_apify(video_url: str, api_token: str):
158
  headers = {"Content-Type": "application/json"}
159
  try:
160
  logger.debug(f"[Apify] Sending request to run actor {actor_id} synchronously for {video_url}")
161
- response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, params=params, data=payload, timeout=90)
 
162
  logger.debug(f"[Apify] Received status code {response.status_code} for {video_url}")
163
 
164
- # --- MODIFIED STATUS CODE CHECK ---
165
- if response.status_code in [200, 201]: # Accept 200 OK or 201 Created
 
166
  # --- END MODIFIED STATUS CODE CHECK ---
167
  try:
168
  results = response.json()
 
169
  if isinstance(results, list) and len(results) > 0:
170
- item = results[0]
 
171
  content = item.get("text") or item.get("transcript") or item.get("captions_concatenated")
 
 
 
172
  if not content and item.get("captions") and isinstance(item["captions"], list):
173
- logger.info("[Apify] Processing 'captions' format.")
174
  content = " ".join(cap.get("text", "") for cap in item["captions"] if cap.get("text"))
 
175
  if content and isinstance(content, str):
176
- logger.info(f"[Apify] Successfully fetched transcript for {video_url} (Status: {response.status_code}). Length: {len(content)}")
177
  return content.strip()
178
  else:
179
- logger.warning(f"[Apify] Actor run successful ({response.status_code}) but content not found/empty for {video_url}. Item: {item}")
180
  return None
181
  else:
182
- logger.warning(f"[Apify] Actor run successful ({response.status_code}) but dataset empty for {video_url}. Response: {results}")
183
  return None
184
  except json.JSONDecodeError:
185
  logger.error(f"[Apify] Failed to decode JSON response for {video_url}. Status: {response.status_code}. Resp: {response.text[:200]}...")
@@ -187,15 +206,18 @@ async def get_transcript_via_apify(video_url: str, api_token: str):
187
  except Exception as e:
188
  logger.error(f"[Apify] Error processing successful response ({response.status_code}) for {video_url}: {e}", exc_info=True)
189
  return None
 
190
  elif response.status_code == 400:
191
  logger.error(f"[Apify] Bad Request (400) for {video_url}. Check payload. Response: {response.text[:200]}...")
192
  return None
193
  elif response.status_code == 401:
194
  logger.error("[Apify] Authentication error (401). Check API token.")
195
  return None
196
- else: # Catch other non-200/201 codes here
197
- logger.error(f"[Apify] Unexpected status code {response.status_code} for {video_url}. Response: {response.text[:200]}...")
 
198
  return None
 
199
  except requests.exceptions.Timeout:
200
  logger.error(f"[Apify] Timeout error running actor for {video_url}")
201
  return None
@@ -206,12 +228,6 @@ async def get_transcript_via_apify(video_url: str, api_token: str):
206
  logger.error(f"[Apify] Unexpected error during Apify call for {video_url}: {e}", exc_info=True)
207
  return None
208
 
209
- # (The rest of the functions: get_youtube_transcript, get_website_content_via_requests,
210
- # get_website_content_via_urltotext_api, generate_summary, start, help_command,
211
- # handle_potential_url, handle_summary_type_callback, error_handler, setup_bot,
212
- # webhook, index, and the main execution block remain EXACTLY THE SAME as in the
213
- # previous complete code block. Ensure they are included below this point.)
214
-
215
  # Combined YouTube Transcript Function (with Fallbacks)
216
  async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: str | None, apify_token: str | None):
217
  """Fetches YouTube transcript using library, then Supadata, then Apify."""
@@ -222,52 +238,63 @@ async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: st
222
  # 1. Primary Method: youtube-transcript-api
223
  logger.info("[Primary YT] Attempting youtube-transcript-api...")
224
  try:
 
225
  transcript_list = await asyncio.to_thread(
226
  YouTubeTranscriptApi.get_transcript,
227
  video_id,
228
- languages=['en', 'en-GB', 'en-US']
229
  )
230
  if transcript_list:
231
  transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
 
 
232
  if transcript_text:
233
  logger.info(f"[Primary YT] Successfully fetched transcript via library for {video_id} (length: {len(transcript_text)})")
234
  return transcript_text
235
  else:
236
- logger.warning(f"[Primary YT] Joined transcript text is empty for {video_id}")
237
- transcript_text = None
238
  else:
239
- logger.warning(f"[Primary YT] Transcript list empty for {video_id}")
240
  transcript_text = None
241
  except Exception as e:
242
- logger.warning(f"[Primary YT] Error getting transcript via library for {video_id}: {e}")
243
- if "YouTube is blocking requests" in str(e): logger.warning("[Primary YT] IP likely blocked by YouTube.")
244
- elif "No transcript found" in str(e): logger.warning(f"[Primary YT] No transcript found for {video_id}.")
245
- elif "disabled" in str(e): logger.warning(f"[Primary YT] Transcripts disabled for {video_id}.")
 
 
 
 
 
 
246
  transcript_text = None
247
 
248
  # 2. Fallback 1: Supadata API
249
  if transcript_text is None:
250
- logger.info("[Fallback YT 1] Primary method failed. Trying Supadata API...")
251
  if supadata_key:
252
  transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
253
  if transcript_text:
254
- logger.info(f"[Fallback YT 1] Successfully fetched transcript via Supadata for {video_id}")
255
- return transcript_text
256
  else:
257
  logger.warning(f"[Fallback YT 1] Supadata API failed or returned no content for {video_id}.")
 
258
  else:
259
  logger.warning("[Fallback YT 1] Supadata API key not available. Skipping.")
260
 
261
  # 3. Fallback 2: Apify API
262
  if transcript_text is None:
263
- logger.info("[Fallback YT 2] Primary & Supadata failed. Trying Apify API...")
264
  if apify_token:
265
  transcript_text = await get_transcript_via_apify(video_url, apify_token)
266
  if transcript_text:
267
- logger.info(f"[Fallback YT 2] Successfully fetched transcript via Apify for {video_url}")
268
- return transcript_text
269
  else:
270
  logger.warning(f"[Fallback YT 2] Apify API failed or returned no content for {video_url}.")
 
271
  else:
272
  logger.warning("[Fallback YT 2] Apify API token not available. Skipping.")
273
 
@@ -276,6 +303,7 @@ async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: st
276
  logger.error(f"All methods failed to fetch transcript for video ID: {video_id}")
277
  return None
278
 
 
279
  return transcript_text
280
 
281
  # Website Content via Requests/BS4 (Primary Method for Simplified Bot)
@@ -284,211 +312,682 @@ async def get_website_content_via_requests(url):
284
  if not url: logger.error("[Web Scraper - Requests/BS4] called with no URL"); return None
285
  logger.info(f"[Web Scraper - Requests/BS4] Fetching website content for: {url}")
286
  try:
 
287
  headers = {
288
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
289
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
290
  'Accept-Language': 'en-US,en;q=0.9',
291
- 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
292
- logger.debug(f"[Web Scraper - Requests/BS4] Sending request to {url}")
 
 
 
 
293
  response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
294
- response.raise_for_status()
295
  logger.debug(f"[Web Scraper - Requests/BS4] Received response {response.status_code} from {url}")
 
296
  content_type = response.headers.get('content-type', '').lower()
297
  if 'html' not in content_type:
298
- logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received from {url}: {content_type}")
 
 
 
 
299
  return None
 
 
300
  soup = BeautifulSoup(response.text, 'html.parser')
301
- for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]): element.extract()
302
- main_content = soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(class_='content') or soup.find(id='main-content') or soup.find(class_='main-content') or soup.find(role='main')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  target_element = main_content if main_content else soup.body
 
304
  if not target_element:
305
  logger.warning(f"[Web Scraper - Requests/BS4] Could not find body or main content container for parsing {url}")
306
- return None
 
 
 
 
 
307
  lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
308
  text = "\n".join(lines)
309
- if not text or len(text) < 50:
310
- logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is very short or empty after cleaning for {url} (Length: {len(text)})")
 
 
 
 
 
 
311
  logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped content for {url} (final length: {len(text)})")
312
  return text
313
- except requests.exceptions.Timeout: logger.error(f"[Web Scraper - Requests/BS4] Timeout error scraping website: {url}"); return None
314
- except requests.exceptions.TooManyRedirects: logger.error(f"[Web Scraper - Requests/BS4] Too many redirects error scraping website: {url}"); return None
315
- except requests.exceptions.RequestException as e: logger.error(f"[Web Scraper - Requests/BS4] Request error scraping website {url}: {e}"); return None
316
- except Exception as e: logger.error(f"[Web Scraper - Requests/BS4] Error scraping or parsing website {url}: {e}", exc_info=True); return None
 
 
 
 
 
 
317
 
318
  # Website Content via URLToText API (Fallback Method)
319
  async def get_website_content_via_urltotext_api(url: str, api_key: str):
320
  """Fetches website content using the URLToText API (Fallback)."""
321
  if not url: logger.error("[Web Scraper - URLToText API] called with no URL"); return None
322
  if not api_key: logger.error("[Web Scraper - URLToText API] API key is missing."); return None
 
323
  logger.info(f"[Web Scraper - URLToText API] Attempting to fetch content for: {url}")
324
  api_endpoint = "https://urltotext.com/api/v1/urltotext/"
325
- payload = json.dumps({ "url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False })
326
- headers = { "Authorization": f"Token {api_key}", "Content-Type": "application/json" }
 
 
 
 
 
 
 
 
 
 
 
327
  try:
328
- logger.debug(f"[Web Scraper - URLToText API] Sending request for {url}")
329
- response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=45)
 
330
  logger.debug(f"[Web Scraper - URLToText API] Received status code {response.status_code} for {url}")
 
331
  if response.status_code == 200:
332
  try:
333
  data = response.json()
334
- content = data.get("data", {}).get("content")
335
  credits = data.get("credits_used", "N/A")
336
- warning = data.get("data", {}).get("warning")
337
- if warning: logger.warning(f"[Web Scraper - URLToText API] Warning for {url}: {warning}")
338
- if content: logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API for {url}. Length: {len(content)}. Credits: {credits}"); return content.strip()
339
- else: logger.warning(f"[Web Scraper - URLToText API] API returned success but content was empty for {url}. Response: {data}"); return None
340
- except json.JSONDecodeError: logger.error(f"[Web Scraper - URLToText API] Failed to decode JSON response for {url}. Response: {response.text[:500]}..."); return None
341
- except Exception as e: logger.error(f"[Web Scraper - URLToText API] Error processing successful API response for {url}: {e}", exc_info=True); return None
342
- elif response.status_code in [400, 402, 422, 500]: logger.error(f"[Web Scraper - URLToText API] Error {response.status_code} from API for {url}. Response: {response.text[:200]}..."); return None
343
- else: logger.error(f"[Web Scraper - URLToText API] Unexpected status {response.status_code} from API for {url}. Response: {response.text[:200]}..."); return None
344
- except requests.exceptions.Timeout: logger.error(f"[Web Scraper - URLToText API] Timeout error connecting to API for {url}"); return None
345
- except requests.exceptions.RequestException as e: logger.error(f"[Web Scraper - URLToText API] Request error connecting to API for {url}: {e}"); return None
346
- except Exception as e: logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call for {url}: {e}", exc_info=True); return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  # DeepSeek Summary Function (via OpenRouter)
349
  async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
350
  """Generates summary using DeepSeek via OpenRouter API."""
351
- logger.info(f"Generating {summary_type} summary using DeepSeek/OpenRouter. Input text length: {len(text)}")
352
- if not api_key: logger.error("OpenRouter API key was not provided."); return "Error: AI model config key missing."
 
 
 
 
 
 
353
  openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
 
354
  model_name = "deepseek/deepseek-chat:free"
355
- if summary_type == "paragraph": prompt = "..." # Keep prompt as before
356
- else: prompt = """...""" # Keep prompt as before
357
- MAX_INPUT_LENGTH = 500000
358
- if len(text) > MAX_INPUT_LENGTH: logger.warning(f"Input text ({len(text)}) > limit ({MAX_INPUT_LENGTH}). Truncating."); text = text[:MAX_INPUT_LENGTH] + "... (Truncated)"
359
- full_prompt = f"{prompt}\n\n{text}"
360
- headers = { "Authorization": f"Bearer {api_key}", "Content-Type": "application/json", "HTTP-Referer": "https://huggingface.co/spaces/", "X-Title": "Telegram Summary Bot (HF Space)"}
361
- payload = json.dumps({ "model": model_name, "messages": [{"role": "user", "content": full_prompt}]})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  try:
363
- logger.debug(f"Sending request to OpenRouter ({model_name})...")
364
- response = await asyncio.to_thread(requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=60)
 
 
 
 
 
 
365
  logger.debug(f"Received status code {response.status_code} from OpenRouter.")
 
366
  if response.status_code == 200:
367
  try:
368
  data = response.json()
369
- if data.get("choices") and data["choices"]:
 
370
  message = data["choices"][0].get("message")
371
  if message and message.get("content"):
372
  summary = message["content"].strip()
373
- logger.info(f"Success generating summary. Len: {len(summary)}")
374
- return summary
375
- else: logger.warning(f"OpenRouter success but empty content. Resp: {data}"); return "Sorry, AI model returned empty summary."
 
 
 
 
 
 
 
 
 
 
 
 
376
  else:
377
- if data.get("error"): logger.error(f"OpenRouter API Error: {data['error']}")
378
- else: logger.error(f"Unexpected choices structure: {data.get('choices')}. Resp: {data}")
379
- return "Sorry, could not parse AI response (choices/error)."
380
- except json.JSONDecodeError: logger.error(f"Failed JSON decode from OpenRouter. Status: {response.status_code}. Resp: {response.text[:500]}..."); return "Sorry, failed to understand AI response."
381
- except Exception as e: logger.error(f"Error processing OpenRouter success resp: {e}", exc_info=True); return "Sorry, error processing AI response."
382
- elif response.status_code == 401: logger.error("OpenRouter API key invalid (401). Check HF Secrets."); return "Error: AI model config key invalid."
383
- elif response.status_code == 402: logger.error("OpenRouter Payment Required (402). Check OpenRouter account."); return "Sorry, issue with AI service limits/payment."
384
- elif response.status_code == 429: logger.warning("OpenRouter Rate Limit (429)."); return "Sorry, AI model busy. Try again."
385
- elif response.status_code == 500: logger.error(f"OpenRouter Internal Error (500). Resp: {response.text[:500]}..."); return "Sorry, AI model service error. Try again later."
386
- else:
387
- logger.error(f"Unexpected status {response.status_code} from OpenRouter. Resp: {response.text[:500]}...")
388
- try: error_data = response.json(); error_msg = error_data.get("error", {}).get("message", response.text[:100]); return f"Sorry, AI service error ({response.status_code}): {error_msg}"
389
- except: return f"Sorry, AI service returned status {response.status_code}."
390
- except requests.exceptions.Timeout: logger.error("Timeout connecting to OpenRouter."); return "Sorry, request to AI model timed out."
391
- except requests.exceptions.RequestException as e: logger.error(f"Request error connecting to OpenRouter: {e}"); return "Sorry, error connecting to AI model service."
392
- except Exception as e: logger.error(f"Unexpected error in generate_summary: {e}", exc_info=True); return "Sorry, unexpected error generating summary."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
 
394
  # --- Telegram Bot Handlers ---
 
395
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
396
- user = update.effective_user; logger.info(f"User {user.id} ({user.username or 'NoUsername'}) used /start.")
 
 
 
 
397
  mention = user.mention_html() if user.username else user.first_name
398
- await update.message.reply_html(f"👋 Hello {mention}! I can summarize YouTube links or website URLs.\n\nJust send me a link anytime!")
 
 
 
 
399
  async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
400
- logger.info(f"User {update.effective_user.id} used /help.")
401
- await update.message.reply_text("...", parse_mode=ParseMode.MARKDOWN) # Keep help text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
403
- if not update.message or not update.message.text: return
404
- url = update.message.text.strip(); user = update.effective_user
405
- logger.info(f"User {user.id} ({user.username or 'NoUsername'}) sent potential URL: {url}")
406
- if not (url.startswith('http://') or url.startswith('https://')) or '.' not in url[8:]: logger.debug(f"Ignoring non-URL: {url}"); return
407
- context.user_data['url_to_summarize'] = url; logger.debug(f"Stored URL '{url}' for user {user.id}")
408
- keyboard = [[InlineKeyboardButton("Paragraph", callback_data="paragraph"), InlineKeyboardButton("Points", callback_data="points")]]
409
- reply_markup = InlineKeyboardMarkup(keyboard)
410
- await update.message.reply_text(f"Link detected:\n{url}\n\nChoose summary type:", reply_markup=reply_markup, disable_web_page_preview=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
412
- query = update.callback_query; await query.answer()
413
- summary_type = query.data; user = update.effective_user or query.from_user
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
  url = context.user_data.get('url_to_summarize')
415
- logger.info(f"User {user.id} chose '{summary_type}' for URL '{url}'.")
416
- if not url: logger.warning(f"User {user.id} pressed button, NO URL in context."); await query.edit_message_text(text="Context lost. Send link again."); return
417
- context.user_data.pop('url_to_summarize', None); logger.debug(f"Cleared URL {url} for user {user.id}")
418
- current_openrouter_key = os.environ.get('OPENROUTER_API_KEY'); current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
419
- current_supadata_key = os.environ.get('SUPADATA_API_KEY'); current_apify_token = os.environ.get('APIFY_API_TOKEN')
420
- if not current_openrouter_key: logger.error("OpenRouter key missing."); await context.bot.send_message(chat_id=user.id, text="Error: AI config missing."); await query.delete_message(); return
421
- processing_message = f"Working on '{summary_type}' summary for:\n{url}\n..."; message_to_delete_later = None
422
- try: await query.edit_message_text(processing_message); logger.debug(f"Edited message query {query.id}")
423
- except Exception as e: logger.warning(f"Could not edit message {query.id}: {e}. Sending new."); message_to_delete_later = await context.bot.send_message(chat_id=user.id, text=processing_message)
424
- content = None; user_feedback_message = None; success = False; is_youtube = is_youtube_url(url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
425
  try:
426
- logger.debug(f"Sending 'typing' action for chat {user.id}"); await context.bot.send_chat_action(chat_id=user.id, action='typing')
427
- if is_youtube:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
  video_id = extract_youtube_id(url)
429
  if video_id:
430
- logger.info(f"Fetching YT transcript: {video_id}"); content = await get_youtube_transcript(video_id, url, current_supadata_key, current_apify_token)
431
- user_feedback_message = None if content else "Sorry, couldn't get YT transcript."
432
- logger.info(f"YT transcript fetch done. Found: {bool(content)}")
433
- else: user_feedback_message = "Sorry, couldn't parse YT URL."
 
 
 
 
 
434
  else:
435
- logger.info(f"Scraping website (Requests/BS4): {url}"); content = await get_website_content_via_requests(url)
436
- if content: logger.info("Website scrape (Requests/BS4) OK."); user_feedback_message = None
 
 
 
 
 
437
  else:
438
- logger.warning(f"Website scrape failed for {url}. Trying URLToText API.");
439
  if current_urltotext_key:
440
- await context.bot.send_chat_action(chat_id=user.id, action='typing'); content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
441
- if content: logger.info("URLToText API scrape OK."); user_feedback_message = None
442
- else: user_feedback_message = "Sorry, couldn't fetch web content (both methods)."
443
- else: user_feedback_message = "Sorry, couldn't fetch web content (fallback not configured)."
 
 
 
 
 
 
 
 
 
 
444
  if content:
445
- logger.info("Content found, generating summary."); await context.bot.send_chat_action(chat_id=user.id, action='typing')
 
 
 
446
  summary = await generate_summary(content, summary_type, current_openrouter_key)
447
- if summary.startswith("Error:") or summary.startswith("Sorry,"): user_feedback_message = summary; logger.warning(f"Summary generation failed: {summary}")
448
- else: logger.info("Summary generated OK. Sending."); await context.bot.send_message(chat_id=user.id, text=summary, parse_mode=ParseMode.MARKDOWN, disable_web_page_preview=True); success = True; user_feedback_message = None
449
- elif not user_feedback_message: user_feedback_message = "Sorry, couldn't retrieve content."
450
- if user_feedback_message and not success: logger.warning(f"Sending failure feedback: {user_feedback_message}"); await context.bot.send_message(chat_id=user.id, text=user_feedback_message)
451
- except Exception as e: logger.error(f"Unexpected error in callback processing: {e}", exc_info=True); await context.bot.send_message(chat_id=user.id, text="Oops! Internal error.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452
  finally:
453
- logger.debug("Cleaning up status message...");
 
 
454
  try:
455
- if message_to_delete_later: await context.bot.delete_message(chat_id=user.id, message_id=message_to_delete_later.message_id); logger.debug("Deleted separate status msg.")
456
- elif query: await query.delete_message(); logger.debug(f"Deleted original message query {query.id}.")
457
- except Exception as del_e: logger.warning(f"Could not delete status/button message: {del_e}")
458
- async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None: logger.error(f"Exception while handling update: {context.error}", exc_info=context.error)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
 
460
  # --- Bot Application Setup Function ---
 
461
  async def setup_bot():
462
- logger.info("Setting up Telegram Application...");
463
- if not TELEGRAM_TOKEN: logger.critical("Cannot initialize: TELEGRAM_TOKEN missing."); return None
 
 
 
 
 
464
  application = Application.builder().token(TELEGRAM_TOKEN).build()
465
- logger.info("Running application.initialize()..."); await application.initialize(); logger.info("Finished application.initialize().")
466
- application.add_handler(CommandHandler("start", start)); application.add_handler(CommandHandler("help", help_command))
467
- application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url)); application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
468
- application.add_error_handler(error_handler); logger.info("Telegram handlers registered.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  return application
470
- logger.info("Running bot setup..."); ptb_app = asyncio.run(setup_bot()); logger.info(f"Bot setup finished. App instance: {'OK' if ptb_app else 'Failed'}")
471
 
472
- # --- Flask App Setup ---
473
- app = Flask(__name__); logger.info("Flask app created.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
 
475
- # --- Webhook Endpoint ---
476
  @app.route('/webhook', methods=['POST'])
477
  async def webhook() -> Response:
478
- logger.info("Webhook request received...")
479
- if not ptb_app: logger.error("PTB App not initialized."); return Response('Bot not configured', status=500)
 
 
 
 
480
  if request.is_json:
481
  try:
482
- update = Update.de_json(request.get_json(), ptb_app.bot); logger.debug(f"Processing update ID: {update.update_id}")
483
- logger.debug("Directly awaiting process_update..."); await ptb_app.process_update(update); logger.debug("Finished awaiting process_update.")
484
- return Response('ok', status=200)
485
- except json.JSONDecodeError: logger.error("Failed JSON decode from Telegram."); return Response('Bad Request', status=400)
486
- except Exception as e: logger.error(f"Error processing update in webhook: {e}", exc_info=True); return Response('Internal Server Error', status=500)
487
- else: logger.warning("Received non-JSON request to webhook."); return Response('Bad Request', status=400)
488
- @app.route('/')
489
- def index(): logger.debug("Health check '/' accessed."); bot_status = "Initialized" if ptb_app else "FAILED Init"; return f"TG Bot Webhook Listener ({bot_status}) running."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
 
491
- # --- Main Execution Block ---
492
  if __name__ == '__main__':
493
- if not ptb_app: logger.critical("Aborting local Flask start: PTB App failed init.")
494
- else: logger.info("Starting Flask server directly (local testing?)..."); port = int(os.environ.get('PORT', 5000)); app.run(host='0.0.0.0', port=port, debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py (Revised: Apify 201 fix + Supadata verify=False + Gunicorn/Uvicorn compatible)
2
  import os
3
  import re
4
  import logging
 
14
  filters,
15
  ContextTypes,
16
  CallbackQueryHandler,
17
+ # ApplicationBuilder is implicitly used by Application.builder()
18
  )
19
  from telegram.constants import ParseMode # Import ParseMode explicitly
20
 
 
27
  if _apify_token_exists:
28
  from apify_client import ApifyClient
29
  else:
30
+ ApifyClient = None # Explicitly set to None if not imported
31
 
32
  # --- Logging Setup ---
33
  logging.basicConfig(
34
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
35
+ level=logging.DEBUG # Keep DEBUG for detailed info
36
  )
37
+ # Reduce noise from libraries
38
  logging.getLogger("httpx").setLevel(logging.WARNING)
39
  if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
40
+ logging.getLogger("telegram.ext").setLevel(logging.INFO) # INFO is usually enough for PTB ext
41
+ logging.getLogger('telegram.bot').setLevel(logging.INFO) # INFO is usually enough for PTB bot API calls
42
+ logging.getLogger("urllib3").setLevel(logging.INFO)
43
+ logging.getLogger('gunicorn.error').setLevel(logging.INFO) # Gunicorn logs
44
+ logging.getLogger('uvicorn.error').setLevel(logging.INFO) # Uvicorn logs
45
  logger = logging.getLogger(__name__)
46
+ logger.info("Logging configured (Main logger: DEBUG).")
47
 
48
  # --- Environment Variable Loading ---
49
  logger.info("Attempting to load secrets from environment variables...")
 
67
  def is_youtube_url(url):
68
  """Checks if the URL is a valid YouTube video or shorts URL."""
69
  youtube_regex = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
70
+ # Added check for common query params like si= or feature=
71
+ match = re.search(youtube_regex, url)
72
+ logger.debug(f"is_youtube_url check for '{url}': {'Match found' if match else 'No match'}")
73
+ return bool(match)
74
 
75
  def extract_youtube_id(url):
76
  """Extracts the YouTube video ID from a URL."""
77
+ # Updated regex to better handle query parameters after the ID
78
+ youtube_id_regex = r'(?:youtube\.com/(?:watch\?v=|shorts/)|youtu\.be/)([\w-]{11})(?:\?|&|\s|$)'
79
  match = re.search(youtube_id_regex, url)
80
  if match:
81
+ video_id = match.group(1)
82
+ logger.debug(f"Extracted YouTube ID '{video_id}' from URL: {url}")
83
+ return video_id
84
+ else:
85
+ logger.warning(f"Could not extract YouTube ID from URL: {url}")
86
+ return None
87
 
88
  # Supadata Transcript Fetching
89
  async def get_transcript_via_supadata(video_id: str, api_key: str):
 
95
  params = {"videoId": video_id, "format": "text"}
96
  headers = {"X-API-Key": api_key}
97
  try:
98
+ # --- Keep verify=False for testing, but log clearly ---
99
+ logger.warning("[Supadata] Making request with verify=False (Attempting to bypass SSL verification)")
100
  response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30, verify=False)
101
+ # --- End verify=False section ---
102
 
103
  logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
104
  if response.status_code == 200:
 
105
  try:
106
  data = response.json()
107
+ # Handle both direct string response and JSON object response
108
  content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
109
  if content and isinstance(content, str):
110
  logger.info(f"[Supadata] Successfully fetched transcript for {video_id}. Length: {len(content)}")
 
113
  logger.warning(f"[Supadata] API success but content empty/invalid for {video_id}. Response: {data}")
114
  return None
115
  except json.JSONDecodeError:
116
+ # If JSON fails, maybe it's plain text?
117
  if response.text:
118
  logger.info(f"[Supadata] Successfully fetched transcript (plain text) for {video_id}. Length: {len(response.text)}")
119
  return response.text.strip()
 
137
  return None
138
  except requests.exceptions.RequestException as e:
139
  logger.error(f"[Supadata] Request error connecting to API for {video_id}: {e}")
140
+ # Log specific SSL Error details even with verify=False
141
  if isinstance(e, requests.exceptions.SSLError):
142
+ logger.error(f"[Supadata] SSL Error occurred despite using verify=False. Details: {e}")
143
  return None
144
  except Exception as e:
145
  logger.error(f"[Supadata] Unexpected error during API call for {video_id}: {e}", exc_info=True)
 
154
 
155
  logger.info(f"[Apify] Attempting fetch for URL: {video_url}")
156
  actor_id = "karamelo~youtube-transcripts"
157
+ # Using the run-sync-get-dataset-items endpoint for direct results
158
  api_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
159
  params = {"token": api_token}
160
  payload = json.dumps({
161
  "urls": [video_url],
162
+ "outputFormat": "singleStringText", # Requesting single string directly
163
+ "maxRetries": 3, # Reduced retries slightly
164
  "channelHandleBoolean": False,
165
  "channelNameBoolean": False,
166
  "datePublishedBoolean": False,
 
169
  headers = {"Content-Type": "application/json"}
170
  try:
171
  logger.debug(f"[Apify] Sending request to run actor {actor_id} synchronously for {video_url}")
172
+ # Use asyncio.to_thread for the blocking requests call
173
+ response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, params=params, data=payload, timeout=90) # 90s timeout for potentially long runs
174
  logger.debug(f"[Apify] Received status code {response.status_code} for {video_url}")
175
 
176
+ # --- !!! MODIFIED STATUS CODE CHECK !!! ---
177
+ # Accept 200 OK or 201 Created as success for this endpoint
178
+ if response.status_code in [200, 201]:
179
  # --- END MODIFIED STATUS CODE CHECK ---
180
  try:
181
  results = response.json()
182
+ # The run-sync-get-dataset-items endpoint returns a list of dataset items directly
183
  if isinstance(results, list) and len(results) > 0:
184
+ item = results[0] # Get the first (and likely only) item
185
+ # Look for the transcript text under common keys
186
  content = item.get("text") or item.get("transcript") or item.get("captions_concatenated")
187
+
188
+ # Additional fallback: Check if 'captions' list exists and join text from it
189
+ # This handles cases where singleStringText might fail but raw captions are present
190
  if not content and item.get("captions") and isinstance(item["captions"], list):
191
+ logger.info("[Apify] Processing 'captions' list format as fallback.")
192
  content = " ".join(cap.get("text", "") for cap in item["captions"] if cap.get("text"))
193
+
194
  if content and isinstance(content, str):
195
+ logger.info(f"[Apify] Successfully fetched transcript via run-sync for {video_url} (Status: {response.status_code}). Length: {len(content)}")
196
  return content.strip()
197
  else:
198
+ logger.warning(f"[Apify] Actor run successful ({response.status_code}) but content not found/empty in result item for {video_url}. Item: {item}")
199
  return None
200
  else:
201
+ logger.warning(f"[Apify] Actor run successful ({response.status_code}) but dataset result list empty for {video_url}. Response: {results}")
202
  return None
203
  except json.JSONDecodeError:
204
  logger.error(f"[Apify] Failed to decode JSON response for {video_url}. Status: {response.status_code}. Resp: {response.text[:200]}...")
 
206
  except Exception as e:
207
  logger.error(f"[Apify] Error processing successful response ({response.status_code}) for {video_url}: {e}", exc_info=True)
208
  return None
209
+ # Handle specific error codes
210
  elif response.status_code == 400:
211
  logger.error(f"[Apify] Bad Request (400) for {video_url}. Check payload. Response: {response.text[:200]}...")
212
  return None
213
  elif response.status_code == 401:
214
  logger.error("[Apify] Authentication error (401). Check API token.")
215
  return None
216
+ # Catch other non-200/201 codes here
217
+ else:
218
+ logger.error(f"[Apify] Unexpected status code {response.status_code} from Apify API for {video_url}. Response: {response.text[:200]}...")
219
  return None
220
+
221
  except requests.exceptions.Timeout:
222
  logger.error(f"[Apify] Timeout error running actor for {video_url}")
223
  return None
 
228
  logger.error(f"[Apify] Unexpected error during Apify call for {video_url}: {e}", exc_info=True)
229
  return None
230
 
 
 
 
 
 
 
231
  # Combined YouTube Transcript Function (with Fallbacks)
232
  async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: str | None, apify_token: str | None):
233
  """Fetches YouTube transcript using library, then Supadata, then Apify."""
 
238
  # 1. Primary Method: youtube-transcript-api
239
  logger.info("[Primary YT] Attempting youtube-transcript-api...")
240
  try:
241
+ # Run the blocking IO call in a separate thread
242
  transcript_list = await asyncio.to_thread(
243
  YouTubeTranscriptApi.get_transcript,
244
  video_id,
245
+ languages=['en', 'en-GB', 'en-US'] # Prioritize English variants
246
  )
247
  if transcript_list:
248
  transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
249
+ # Clean up excessive whitespace that might result from joining
250
+ transcript_text = re.sub(r'\s+', ' ', transcript_text).strip()
251
  if transcript_text:
252
  logger.info(f"[Primary YT] Successfully fetched transcript via library for {video_id} (length: {len(transcript_text)})")
253
  return transcript_text
254
  else:
255
+ logger.warning(f"[Primary YT] Joined transcript text is empty after cleaning for {video_id}")
256
+ transcript_text = None # Explicitly set to None
257
  else:
258
+ logger.warning(f"[Primary YT] Transcript list was empty for {video_id}")
259
  transcript_text = None
260
  except Exception as e:
261
+ # Log the specific error type for better debugging
262
+ logger.warning(f"[Primary YT] Error getting transcript via library for {video_id}: {type(e).__name__} - {e}")
263
+ # Be more specific about common errors
264
+ if "YouTube is blocking requests" in str(e) or "HTTP Error 429" in str(e):
265
+ logger.warning("[Primary YT] IP likely blocked by YouTube (Rate Limit / Cloud IP).")
266
+ elif "No transcript found" in str(e):
267
+ logger.warning(f"[Primary YT] No transcript available in specified languages for {video_id}.")
268
+ elif "TranscriptsDisabled" in str(e) or "disabled" in str(e):
269
+ logger.warning(f"[Primary YT] Transcripts are disabled for {video_id}.")
270
+ # Ensure transcript_text is None if any exception occurred
271
  transcript_text = None
272
 
273
  # 2. Fallback 1: Supadata API
274
  if transcript_text is None:
275
+ logger.info("[Fallback YT 1] Primary method failed or yielded no text. Trying Supadata API...")
276
  if supadata_key:
277
  transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
278
  if transcript_text:
279
+ logger.info(f"[Fallback YT 1] Successfully fetched transcript via Supadata for {video_id} (length: {len(transcript_text)})")
280
+ return transcript_text # Return immediately on success
281
  else:
282
  logger.warning(f"[Fallback YT 1] Supadata API failed or returned no content for {video_id}.")
283
+ # transcript_text remains None
284
  else:
285
  logger.warning("[Fallback YT 1] Supadata API key not available. Skipping.")
286
 
287
  # 3. Fallback 2: Apify API
288
  if transcript_text is None:
289
+ logger.info("[Fallback YT 2] Primary & Supadata failed or yielded no text. Trying Apify API...")
290
  if apify_token:
291
  transcript_text = await get_transcript_via_apify(video_url, apify_token)
292
  if transcript_text:
293
+ logger.info(f"[Fallback YT 2] Successfully fetched transcript via Apify for {video_url} (length: {len(transcript_text)})")
294
+ return transcript_text # Return immediately on success
295
  else:
296
  logger.warning(f"[Fallback YT 2] Apify API failed or returned no content for {video_url}.")
297
+ # transcript_text remains None
298
  else:
299
  logger.warning("[Fallback YT 2] Apify API token not available. Skipping.")
300
 
 
303
  logger.error(f"All methods failed to fetch transcript for video ID: {video_id}")
304
  return None
305
 
306
+ # This line should technically not be reached if logic is correct, but added for safety
307
  return transcript_text
308
 
309
  # Website Content via Requests/BS4 (Primary Method for Simplified Bot)
 
312
  if not url: logger.error("[Web Scraper - Requests/BS4] called with no URL"); return None
313
  logger.info(f"[Web Scraper - Requests/BS4] Fetching website content for: {url}")
314
  try:
315
+ # Standard headers to mimic a browser
316
  headers = {
317
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
318
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
319
  'Accept-Language': 'en-US,en;q=0.9',
320
+ 'Connection': 'keep-alive',
321
+ 'DNT': '1', # Do Not Track
322
+ 'Upgrade-Insecure-Requests': '1'
323
+ }
324
+ logger.debug(f"[Web Scraper - Requests/BS4] Sending GET request to {url}")
325
+ # Run blocking requests call in thread
326
  response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
327
+ response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
328
  logger.debug(f"[Web Scraper - Requests/BS4] Received response {response.status_code} from {url}")
329
+
330
  content_type = response.headers.get('content-type', '').lower()
331
  if 'html' not in content_type:
332
+ logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received from {url}: {content_type}. Skipping parsing.")
333
+ # Maybe return raw text if it's plain text? Or just None.
334
+ if 'text/plain' in content_type and response.text:
335
+ logger.info(f"[Web Scraper - Requests/BS4] Returning plain text content for {url}")
336
+ return response.text.strip()
337
  return None
338
+
339
+ # Parse with BeautifulSoup
340
  soup = BeautifulSoup(response.text, 'html.parser')
341
+
342
+ # Remove common unwanted elements
343
+ for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]):
344
+ element.extract() # Remove the tag from the tree
345
+
346
+ # Try to find the main content area (add more selectors if needed)
347
+ main_content = (
348
+ soup.find('main') or
349
+ soup.find('article') or
350
+ soup.find(id='content') or
351
+ soup.find(class_='content') or
352
+ soup.find(id='main-content') or
353
+ soup.find(class_='main-content') or
354
+ soup.find(role='main')
355
+ )
356
+
357
+ # Fallback to body if no specific main area found
358
  target_element = main_content if main_content else soup.body
359
+
360
  if not target_element:
361
  logger.warning(f"[Web Scraper - Requests/BS4] Could not find body or main content container for parsing {url}")
362
+ # Try getting text from the whole soup as a last resort? Could be messy.
363
+ # raw_text = soup.get_text(separator='\n', strip=True)
364
+ # if raw_text: return "\n".join(line.strip() for line in raw_text.splitlines() if line.strip())
365
+ return None # Return None if body itself is missing
366
+
367
+ # Extract text, clean up lines, and join
368
  lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
369
  text = "\n".join(lines)
370
+
371
+ # Check if extracted text is reasonably long
372
+ MIN_TEXT_LENGTH = 50
373
+ if not text or len(text) < MIN_TEXT_LENGTH:
374
+ logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is very short (<{MIN_TEXT_LENGTH} chars) or empty after cleaning for {url} (Length: {len(text)})")
375
+ # Consider returning None if text is too short, might indicate failed extraction
376
+ # return None
377
+
378
  logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped content for {url} (final length: {len(text)})")
379
  return text
380
+
381
+ except requests.exceptions.Timeout:
382
+ logger.error(f"[Web Scraper - Requests/BS4] Timeout error scraping website: {url}"); return None
383
+ except requests.exceptions.TooManyRedirects:
384
+ logger.error(f"[Web Scraper - Requests/BS4] Too many redirects error scraping website: {url}"); return None
385
+ except requests.exceptions.RequestException as e: # Catches HTTPError, ConnectionError etc.
386
+ logger.error(f"[Web Scraper - Requests/BS4] Request error scraping website {url}: {e}"); return None
387
+ except Exception as e: # Catch potential BS4 errors or others
388
+ logger.error(f"[Web Scraper - Requests/BS4] Error scraping or parsing website {url}: {e}", exc_info=True); return None
389
+
390
 
391
  # Website Content via URLToText API (Fallback Method)
392
  async def get_website_content_via_urltotext_api(url: str, api_key: str):
393
  """Fetches website content using the URLToText API (Fallback)."""
394
  if not url: logger.error("[Web Scraper - URLToText API] called with no URL"); return None
395
  if not api_key: logger.error("[Web Scraper - URLToText API] API key is missing."); return None
396
+
397
  logger.info(f"[Web Scraper - URLToText API] Attempting to fetch content for: {url}")
398
  api_endpoint = "https://urltotext.com/api/v1/urltotext/"
399
+ # Ensure payload is correctly formatted JSON string
400
+ payload = json.dumps({
401
+ "url": url,
402
+ "output_format": "text",
403
+ "extract_main_content": True, # Use their main content extraction
404
+ "render_javascript": True, # Important for dynamic sites
405
+ "residential_proxy": False # Usually not needed unless specifically blocked
406
+ })
407
+ headers = {
408
+ "Authorization": f"Token {api_key}",
409
+ "Content-Type": "application/json"
410
+ }
411
+
412
  try:
413
+ logger.debug(f"[Web Scraper - URLToText API] Sending POST request for {url}")
414
+ # Run blocking requests call in thread
415
+ response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=45) # Slightly longer timeout
416
  logger.debug(f"[Web Scraper - URLToText API] Received status code {response.status_code} for {url}")
417
+
418
  if response.status_code == 200:
419
  try:
420
  data = response.json()
421
+ content = data.get("data", {}).get("content") # Navigate nested structure
422
  credits = data.get("credits_used", "N/A")
423
+ warning = data.get("data", {}).get("warning") # Check for warnings
424
+
425
+ if warning:
426
+ logger.warning(f"[Web Scraper - URLToText API] Warning received for {url}: {warning}")
427
+
428
+ if content:
429
+ logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API for {url}. Length: {len(content)}. Credits: {credits}")
430
+ return content.strip()
431
+ else:
432
+ logger.warning(f"[Web Scraper - URLToText API] API returned success (200) but content was empty for {url}. Response: {data}")
433
+ return None # Return None if content is missing despite 200 OK
434
+
435
+ except json.JSONDecodeError:
436
+ logger.error(f"[Web Scraper - URLToText API] Failed to decode JSON response for {url}. Status: {response.status_code}. Response text: {response.text[:500]}...")
437
+ return None
438
+ except Exception as e:
439
+ logger.error(f"[Web Scraper - URLToText API] Error processing successful API response for {url}: {e}", exc_info=True)
440
+ return None
441
+
442
+ # Handle specific error codes from URLToText API docs
443
+ elif response.status_code == 400: # Bad Request (e.g., invalid URL format)
444
+ logger.error(f"[Web Scraper - URLToText API] Bad Request (400) from API for {url}. Check URL/payload. Response: {response.text[:200]}...")
445
+ elif response.status_code == 401: # Unauthorized (Invalid API Key)
446
+ logger.error(f"[Web Scraper - URLToText API] Unauthorized (401) from API for {url}. Check API Key. Response: {response.text[:200]}...")
447
+ elif response.status_code == 402: # Payment Required (Credits exhausted)
448
+ logger.error(f"[Web Scraper - URLToText API] Payment Required (402) from API for {url}. Check credits. Response: {response.text[:200]}...")
449
+ elif response.status_code == 422: # Unprocessable Entity (e.g., URL cannot be reached)
450
+ logger.warning(f"[Web Scraper - URLToText API] Unprocessable URL (422) reported by API for {url}. Response: {response.text[:200]}...")
451
+ elif response.status_code == 500: # Internal Server Error on their end
452
+ logger.error(f"[Web Scraper - URLToText API] Internal Server Error (500) from API for {url}. Response: {response.text[:200]}...")
453
+ else: # Catch-all for other unexpected codes
454
+ logger.error(f"[Web Scraper - URLToText API] Unexpected status {response.status_code} from API for {url}. Response: {response.text[:200]}...")
455
+ return None # Return None for all error cases
456
+
457
+ except requests.exceptions.Timeout:
458
+ logger.error(f"[Web Scraper - URLToText API] Timeout error connecting to API for {url}")
459
+ return None
460
+ except requests.exceptions.RequestException as e:
461
+ logger.error(f"[Web Scraper - URLToText API] Request error connecting to API for {url}: {e}")
462
+ return None
463
+ except Exception as e:
464
+ logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call for {url}: {e}", exc_info=True)
465
+ return None
466
 
467
  # DeepSeek Summary Function (via OpenRouter)
468
  async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
469
  """Generates summary using DeepSeek via OpenRouter API."""
470
+ logger.info(f"Generating '{summary_type}' summary using DeepSeek/OpenRouter. Input text length: {len(text)}")
471
+ if not api_key:
472
+ logger.error("OpenRouter API key was not provided.")
473
+ return "Error: AI model configuration key is missing."
474
+ if not text:
475
+ logger.warning("generate_summary called with empty text.")
476
+ return "Error: No content provided to summarize."
477
+
478
  openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
479
+ # Using the free DeepSeek model - be mindful of potential rate limits or changes
480
  model_name = "deepseek/deepseek-chat:free"
481
+
482
+ # Define prompts based on summary type
483
+ if summary_type == "paragraph":
484
+ prompt = "Please provide a concise, well-written paragraph summarizing the key information and main points of the following text. Focus on capturing the essence of the content accurately."
485
+ elif summary_type == "points":
486
+ prompt = "Please summarize the following text into clear, distinct bullet points. Each point should highlight a key piece of information, finding, or main topic discussed. Aim for clarity and conciseness."
487
+ else:
488
+ logger.error(f"Invalid summary_type '{summary_type}' passed to generate_summary.")
489
+ return f"Error: Invalid summary type requested ('{summary_type}')."
490
+
491
+ # Check and truncate input text if necessary
492
+ # Max context varies per model, DeepSeek Chat's is large, but setting a practical limit is wise.
493
+ # Let's use a large but reasonable limit to avoid huge API calls.
494
+ MAX_INPUT_LENGTH = 500000 # Approx 500k characters
495
+ if len(text) > MAX_INPUT_LENGTH:
496
+ logger.warning(f"Input text length ({len(text)}) exceeds maximum limit ({MAX_INPUT_LENGTH}). Truncating.")
497
+ text = text[:MAX_INPUT_LENGTH] + "... (Content truncated due to length)"
498
+
499
+ full_prompt = f"{prompt}\n\n--- Start of Text ---\n\n{text}\n\n--- End of Text ---"
500
+
501
+ headers = {
502
+ "Authorization": f"Bearer {api_key}",
503
+ "Content-Type": "application/json",
504
+ # Recommended headers for OpenRouter when calling from identifiable services
505
+ "HTTP-Referer": "https://huggingface.co/spaces/YOUR_SPACE_NAME", # TODO: Replace with your actual space name if possible
506
+ "X-Title": "Telegram URL Summarizer Bot (HF Space)" # Or your bot's name
507
+ }
508
+
509
+ payload = json.dumps({
510
+ "model": model_name,
511
+ "messages": [
512
+ {"role": "user", "content": full_prompt}
513
+ ],
514
+ # Optional parameters (adjust if needed):
515
+ # "temperature": 0.7, # Controls randomness (optional)
516
+ # "max_tokens": 1024, # Limit response length (optional)
517
+ })
518
+
519
  try:
520
+ logger.debug(f"Sending request to OpenRouter (Model: {model_name})...")
521
+ # Run blocking requests call in thread
522
+ response = await asyncio.to_thread(requests.post,
523
+ openrouter_api_endpoint,
524
+ headers=headers,
525
+ data=payload,
526
+ timeout=90 # Increased timeout for potentially long generation
527
+ )
528
  logger.debug(f"Received status code {response.status_code} from OpenRouter.")
529
+
530
  if response.status_code == 200:
531
  try:
532
  data = response.json()
533
+ # Check the response structure carefully
534
+ if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0:
535
  message = data["choices"][0].get("message")
536
  if message and message.get("content"):
537
  summary = message["content"].strip()
538
+ # Basic check for empty or placeholder summary
539
+ if summary:
540
+ logger.info(f"Successfully generated summary via OpenRouter. Length: {len(summary)}")
541
+ return summary
542
+ else:
543
+ logger.warning(f"OpenRouter returned success (200) but summary content is empty. Response: {data}")
544
+ return "Sorry, the AI model returned an empty summary."
545
+ else:
546
+ logger.warning(f"OpenRouter success (200) but response structure missing expected content. Response: {data}")
547
+ return "Sorry, could not parse the AI model's response (missing content)."
548
+ # Handle cases where 'choices' might be missing or empty, or if there's an error object
549
+ elif data.get("error"):
550
+ error_details = data["error"]
551
+ logger.error(f"OpenRouter API Error ({response.status_code}): {error_details}")
552
+ return f"Sorry, the AI service reported an error: {error_details.get('message', 'Unknown error')}"
553
  else:
554
+ logger.error(f"OpenRouter success (200) but unexpected response structure (no choices/error). Response: {data}")
555
+ return "Sorry, could not parse the AI model's response (unexpected structure)."
556
+
557
+ except json.JSONDecodeError:
558
+ logger.error(f"Failed to decode JSON response from OpenRouter. Status: {response.status_code}. Response: {response.text[:500]}...")
559
+ return "Sorry, failed to understand the AI model's response format."
560
+ except Exception as e:
561
+ logger.error(f"Error processing successful OpenRouter response: {e}", exc_info=True)
562
+ return "Sorry, an error occurred while processing the AI model's response."
563
+
564
+ # Handle specific HTTP error codes
565
+ elif response.status_code == 401: # Unauthorized
566
+ logger.error("OpenRouter API key is invalid or unauthorized (401). Check the key in HF Secrets.")
567
+ return "Error: AI model configuration key is invalid."
568
+ elif response.status_code == 402: # Payment Required / Quota Exceeded
569
+ logger.error("OpenRouter Payment Required / Quota Exceeded (402). Check OpenRouter account status and limits.")
570
+ return "Sorry, there's an issue with the AI service account (limits or payment)."
571
+ elif response.status_code == 429: # Rate Limit Exceeded
572
+ logger.warning("OpenRouter Rate Limit Exceeded (429). Need to wait or slow down requests.")
573
+ return "Sorry, the AI model service is busy right now. Please try again in a moment."
574
+ elif response.status_code >= 500: # Server Error on OpenRouter's side
575
+ logger.error(f"OpenRouter Internal Server Error ({response.status_code}). Response: {response.text[:500]}...")
576
+ return "Sorry, the AI model service encountered an internal error. Please try again later."
577
+ else: # Catch-all for other client-side or unexpected errors
578
+ logger.error(f"Unexpected status code {response.status_code} received from OpenRouter. Response: {response.text[:500]}...")
579
+ try:
580
+ # Try to parse error message from response if possible
581
+ error_data = response.json()
582
+ error_msg = error_data.get("error", {}).get("message", response.text[:100])
583
+ return f"Sorry, the AI service returned an error ({response.status_code}): {error_msg}"
584
+ except:
585
+ # Fallback if response is not JSON or doesn't have expected error structure
586
+ return f"Sorry, the AI service returned an unexpected status code ({response.status_code})."
587
+
588
+ except requests.exceptions.Timeout:
589
+ logger.error("Timeout occurred while connecting to OpenRouter.")
590
+ return "Sorry, the request to the AI model timed out. Please try again."
591
+ except requests.exceptions.RequestException as e:
592
+ logger.error(f"Network error connecting to OpenRouter: {e}")
593
+ return "Sorry, there was a network error connecting to the AI model service."
594
+ except Exception as e:
595
+ logger.error(f"Unexpected error occurred in generate_summary function: {e}", exc_info=True)
596
+ return "Sorry, an unexpected internal error occurred while generating the summary."
597
+
598
 
599
  # --- Telegram Bot Handlers ---
600
+
601
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
602
+ """Handles the /start command."""
603
+ user = update.effective_user
604
+ if not user: return # Should not happen but good practice
605
+ logger.info(f"User {user.id} ({user.username or 'NoUsername'}) triggered /start.")
606
+ # Use mention_html for a clickable link if username exists, otherwise just first name
607
  mention = user.mention_html() if user.username else user.first_name
608
+ await update.message.reply_html(
609
+ f"👋 Hello {mention}! I can summarize YouTube links or website URLs.\n\n"
610
+ "Just send me a valid link (starting with http:// or https://) and I'll ask you how you want it summarized."
611
+ )
612
+
613
  async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
614
+ """Handles the /help command."""
615
+ user = update.effective_user
616
+ logger.info(f"User {user.id if user else 'Unknown'} triggered /help.")
617
+ help_text = (
618
+ "**How I Work:**\n\n"
619
+ "1. Send me a full URL (starting with `http://` or `https://`).\n"
620
+ "2. I'll detect if it's a YouTube video link or a general website URL.\n"
621
+ "3. I'll ask if you want a **Paragraph** summary or **Points** summary.\n"
622
+ "4. Choose your preferred format by clicking the button.\n"
623
+ "5. I'll fetch the content (transcript for YouTube, text for websites) and use an AI model (via OpenRouter) to generate the summary.\n\n"
624
+ "**Troubleshooting:**\n"
625
+ "- **YouTube:** Sometimes transcripts aren't available (private video, no captions, disabled). I use multiple methods (library, Supadata, Apify) to try and get them.\n"
626
+ "- **Websites:** Complex websites with lots of JavaScript might be difficult to scrape accurately. I use a primary scraping method and a fallback API (URLToText) if needed.\n"
627
+ "- **AI Errors:** Occasionally, the AI model might be busy or encounter an error. You can try again later.\n\n"
628
+ "Just send a link to get started!"
629
+ )
630
+ await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
631
+
632
# Same pattern the handler has always used: "http://" or "https://", one
# character that cannot start a host, then everything up to the next whitespace.
_URL_PATTERN = re.compile(r'https?://[^\s/$.?#].[^\s]*')

# Characters that normally belong to the surrounding sentence, not to the URL.
_TRAILING_URL_PUNCTUATION = '.,;:!?\'"'


def _extract_first_url(text: str) -> str | None:
    """Return the first http(s) URL found in *text*, or None when there is none.

    Because the pattern consumes everything up to the next whitespace, a link
    written inside a sentence ("see https://example.com/page.") would otherwise
    keep the sentence's final punctuation. Trailing punctuation and an
    unbalanced closing parenthesis are therefore trimmed. If trimming would
    leave something that no longer looks like a URL, the untrimmed match is
    returned unchanged.
    """
    match = _URL_PATTERN.search(text)
    if match is None:
        return None

    raw_url = match.group(0)
    candidate = raw_url
    while True:
        trimmed = candidate.rstrip(_TRAILING_URL_PUNCTUATION)
        # Drop a closing parenthesis only when it has no opening partner inside
        # the URL, so links such as ".../Foo_(bar)" stay intact.
        if trimmed.endswith(')') and trimmed.count(')') > trimmed.count('('):
            trimmed = trimmed[:-1]
        if trimmed == candidate:
            break
        candidate = trimmed

    return candidate if _URL_PATTERN.fullmatch(candidate) else raw_url


async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle a text message that may contain a URL to summarize.

    When the message contains an http(s) link, the first such link is stored in
    ``context.user_data['url_to_summarize']`` and the user is asked, via an
    inline keyboard, whether they want a "paragraph" or a "points" summary.
    Messages without a link are only logged at debug level.

    Args:
        update: The incoming Telegram update.
        context: The handler context; its ``user_data`` receives the URL.
    """
    message = update.message
    if not message or not message.text:
        return  # Ignore updates without message text

    user = update.effective_user
    if not user:
        return

    message_text = message.text.strip()
    url = _extract_first_url(message_text)

    if url is None:
        # Not a link: stay silent so ordinary chat messages are not answered.
        logger.debug(f"Ignoring non-URL message from user {user.id}: {message_text[:100]}")
        return

    logger.info(f"User {user.id} ({user.username or 'NoUsername'}) sent potential URL: {url}")

    # Remember the URL for the button callback; a newer link overwrites an older one.
    context.user_data['url_to_summarize'] = url
    logger.debug(f"Stored URL '{url}' in user_data for user {user.id}")

    # The callback_data values are what the callback handler receives as query.data.
    keyboard = [
        [
            InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"),
            InlineKeyboardButton("Points Summary", callback_data="points"),
        ]
    ]
    reply_markup = InlineKeyboardMarkup(keyboard)

    await message.reply_text(
        f"Okay, I see this link:\n{url}\n\nHow would you like it summarized?",
        reply_markup=reply_markup,
        # Disable the link preview so the question is not followed by a preview card.
        link_preview_options={'is_disabled': True},
    )
675
+
676
+
677
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle a press on one of the summary-type inline buttons.

    Steps:
      1. Answer the callback query so the button stops showing a spinner.
      2. Take the URL stored in ``context.user_data['url_to_summarize']``.
      3. Read the API keys from the environment; abort if the OpenRouter key is missing.
      4. Turn the button message into a "processing" notice (or send a new one).
      5. Fetch the content: a transcript for YouTube links, page text otherwise
         (primary scraper first, URLToText API as fallback when a key is set).
      6. Generate the summary and send it, or send a failure explanation.
      7. Delete the "processing" notice.

    All replies go to the chat the button was pressed in; the user's id is used
    only as a fallback when the update carries no chat.

    Args:
        update: The incoming update; expected to carry a callback query.
        context: The handler context providing ``bot`` and ``user_data``.
    """
    # Local import: this block needs BadRequest to detect Telegram rejecting the
    # Markdown of a generated summary.
    from telegram.error import BadRequest

    query = update.callback_query
    if not query:
        return
    user = query.from_user

    # Reply where the button lives (relevant in group chats), not in the user's DM.
    chat = update.effective_chat
    chat_id = chat.id if chat else user.id

    # --- 1. Answer the callback query ---
    try:
        await query.answer()
        logger.debug(f"Answered callback query {query.id}")
    except Exception as e:
        # Typically an expired query; keep going, the button may just keep spinning.
        logger.error(f"Failed to answer callback query {query.id}: {e}")

    # --- 2. Get data and context ---
    summary_type = query.data  # e.g., "paragraph" or "points"
    url = context.user_data.get('url_to_summarize')

    logger.info(f"User {user.id} ({user.username or 'NoUsername'}) chose '{summary_type}' summary. Checking for URL '{url}' in context.")

    if not url:
        logger.warning(f"User {user.id} pressed button, but NO URL was found in user_data context. Query data: {summary_type}")
        try:
            await query.edit_message_text(text="Sorry, I seem to have lost the context. 🤔 Please send the link again.")
        except Exception as edit_err:
            logger.error(f"Failed to edit message after lost context error: {edit_err}")
        return

    # Consume the URL so a second press on the same button cannot re-run the job.
    context.user_data.pop('url_to_summarize', None)
    logger.debug(f"Retrieved and cleared URL {url} from user_data for user {user.id}")

    # --- 3. Check for required API keys ---
    # Read at call time so changed environment values are picked up.
    current_openrouter_key = os.environ.get('OPENROUTER_API_KEY')
    current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
    current_supadata_key = os.environ.get('SUPADATA_API_KEY')
    current_apify_token = os.environ.get('APIFY_API_TOKEN')

    logger.debug("Reading API keys from environment variables within handler...")
    logger.debug(f"Keys read: OpenRouter={'Yes' if current_openrouter_key else 'No'}, "
                 f"URLToText={'Yes' if current_urltotext_key else 'No'}, "
                 f"Supadata={'Yes' if current_supadata_key else 'No'}, "
                 f"Apify={'Yes' if current_apify_token else 'No'}")

    if not current_openrouter_key:
        logger.error("OpenRouter API key is missing in environment variables. Cannot generate summary.")
        try:
            await query.edit_message_text(text="⚠️ Error: The AI summarization service is not configured correctly (missing API key). Please contact the bot admin.")
        except Exception as edit_err:
            logger.error(f"Failed to edit message about missing OpenRouter key: {edit_err}")
        return

    # --- 4. Edit message to show "Processing..." ---
    processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n{url}\n\nThis might take a moment..."
    message_to_delete_later_id = None
    try:
        await query.edit_message_text(text=processing_message_text)
        logger.debug(f"Edited message for query {query.id} to show processing status.")
    except Exception as e:
        # Editing can fail (message too old or deleted); fall back to a new message.
        logger.warning(f"Could not edit original message {query.message.message_id if query.message else 'N/A'} (query {query.id}): {e}. Sending a new status message.")
        try:
            status_message = await context.bot.send_message(chat_id=chat_id, text=processing_message_text)
            message_to_delete_later_id = status_message.message_id
            logger.debug(f"Sent new status message {message_to_delete_later_id}")
        except Exception as send_err:
            logger.error(f"Failed even to send a new status message: {send_err}")

    # --- 5. Fetch Content (YouTube or Website) ---
    content = None
    user_feedback_message = None  # Failure text for the user, if any
    success = False

    try:
        logger.debug(f"Sending 'typing' action for chat {chat_id}")
        await context.bot.send_chat_action(chat_id=chat_id, action='typing')

        is_yt = is_youtube_url(url)
        logger.debug(f"URL determined to be YouTube: {is_yt}")

        if is_yt:
            video_id = extract_youtube_id(url)
            if video_id:
                logger.info(f"Fetching YouTube transcript for video_id: {video_id}")
                content = await get_youtube_transcript(video_id, url, current_supadata_key, current_apify_token)
                if not content:
                    user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
                logger.info(f"YouTube transcript fetch completed. Content found: {bool(content)}")
            else:
                logger.warning(f"Failed to extract video ID from supposedly YouTube URL: {url}")
                user_feedback_message = "Sorry, I couldn't properly identify the YouTube video ID from the link."
        else:
            logger.info(f"Attempting website scrape (Requests/BS4) for: {url}")
            content = await get_website_content_via_requests(url)

            if content:
                logger.info("Website scrape (Requests/BS4) successful.")
                user_feedback_message = None
            else:
                logger.warning(f"Primary website scrape (Requests/BS4) failed for {url}. Trying fallback API (URLToText)...")
                if current_urltotext_key:
                    # Refresh the typing indicator before the next remote call.
                    await context.bot.send_chat_action(chat_id=chat_id, action='typing')
                    content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
                    if content:
                        logger.info("Website scrape fallback (URLToText API) successful.")
                        user_feedback_message = None
                    else:
                        logger.warning(f"Fallback website scrape (URLToText API) also failed for {url}.")
                        user_feedback_message = "Sorry, I couldn't fetch the content from that website using either the standard method or the fallback API."
                else:
                    logger.warning("URLToText API key is not configured. Cannot use fallback.")
                    user_feedback_message = "Sorry, I couldn't fetch the content from that website (primary method failed, and fallback is not configured)."

        # --- 6. Generate Summary if Content Exists ---
        if content:
            logger.info("Content retrieved successfully. Proceeding to generate summary.")
            await context.bot.send_chat_action(chat_id=chat_id, action='typing')

            summary = await generate_summary(content, summary_type, current_openrouter_key)

            if not summary:
                # Guard against a None/empty result instead of crashing on .startswith().
                user_feedback_message = "Sorry, the AI model returned an empty summary. Please try again later."
                logger.warning("Summary generation returned no text.")
            elif summary.startswith(("Error:", "Sorry,")):
                # Such a prefix is treated as an error message returned in place of a summary.
                user_feedback_message = summary
                logger.warning(f"Summary generation failed or returned an error message: {summary}")
            else:
                logger.info("Summary generated successfully. Sending to user.")
                try:
                    await context.bot.send_message(
                        chat_id=chat_id,
                        text=summary,
                        parse_mode=ParseMode.MARKDOWN,  # The summary may contain Markdown
                        link_preview_options={'is_disabled': True},
                    )
                except BadRequest as md_err:
                    # Generated text can contain unbalanced Markdown that Telegram
                    # refuses to parse; deliver it unformatted rather than fail.
                    logger.warning(f"Sending summary with Markdown failed ({md_err}); retrying as plain text.")
                    await context.bot.send_message(
                        chat_id=chat_id,
                        text=summary,
                        link_preview_options={'is_disabled': True},
                    )
                success = True
                user_feedback_message = None
        elif not user_feedback_message:
            # No content and no specific reason recorded: fall back to a generic message.
            logger.error(f"Content fetching resulted in None, but no specific user feedback message was set for URL: {url}")
            user_feedback_message = "Sorry, I was unable to retrieve any content from the provided link."

        # --- 7. Send Final Feedback (if error occurred) ---
        if user_feedback_message and not success:
            logger.warning(f"Sending failure feedback message to user {user.id}: {user_feedback_message}")
            await context.bot.send_message(
                chat_id=chat_id,
                text=user_feedback_message
            )

    except Exception as e:
        # Last-resort boundary for anything unexpected in steps 5-7.
        logger.error(f"An unexpected error occurred in handle_summary_type_callback for user {user.id}, URL {url}: {e}", exc_info=True)
        try:
            # NOTE(review): the message starts with a non-English apology phrase
            # and says developers "have been notified", although this handler
            # only logs the error - confirm both are intended.
            await context.bot.send_message(chat_id=chat_id, text=" माफी माग्छु ! An unexpected internal error occurred while processing your request. The developers have been notified.")
        except Exception as final_err:
            logger.error(f"Failed to even send the final error message to user {user.id}: {final_err}")

    finally:
        # --- 8. Clean up the "Processing..." message ---
        logger.debug("Callback handler finished. Cleaning up status message (if possible)...")
        try:
            if message_to_delete_later_id:
                # A separate status message was sent in step 4; remove it.
                await context.bot.delete_message(chat_id=chat_id, message_id=message_to_delete_later_id)
                logger.debug(f"Deleted separate status message {message_to_delete_later_id}.")
            elif query.message:
                # Otherwise the button message itself carried the status text.
                await query.delete_message()
                logger.debug(f"Deleted original message (via query {query.id}, message_id {query.message.message_id}).")
        except Exception as del_err:
            # Cleanup is best effort; never let it mask the real outcome.
            logger.warning(f"Could not delete status/button message: {del_err}")
866
+
867
+
868
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Log the exception raised while an update was being handled.

    Nothing is sent back to the chat; the error is only written to the log,
    together with its traceback.
    """
    failure = context.error
    logger.error(f"Exception while handling an update: {failure}", exc_info=failure)
    # Deliberately no user-facing reply here: the individual handlers report
    # their own failures to the user.
881
+
882
 
883
  # --- Bot Application Setup Function ---
884
+ # Make setup_bot an async function as Application.initialize() is async
885
async def setup_bot():
    """Build, wire up and initialize the Telegram ``Application``.

    Returns:
        The initialized application, or ``None`` when ``TELEGRAM_TOKEN`` is
        not set or ``Application.initialize()`` raises.
    """
    logger.info("Setting up Telegram Application...")
    if not TELEGRAM_TOKEN:
        logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable not found. Bot cannot start.")
        return None

    builder = Application.builder()
    application = builder.token(TELEGRAM_TOKEN).build()

    # Registered in this order: commands, plain-text messages (commands are
    # excluded by the filter), then inline-button presses.
    update_handlers = (
        CommandHandler("start", start),
        CommandHandler("help", help_command),
        MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url),
        CallbackQueryHandler(handle_summary_type_callback),
    )
    for handler in update_handlers:
        application.add_handler(handler)

    application.add_error_handler(error_handler)
    logger.info("Telegram handlers registered.")

    # Initialize explicitly so a bad token or network problem shows up at startup.
    try:
        logger.info("Running application.initialize()...")
        await application.initialize()
        logger.info("Finished application.initialize(). Bot details: %s", application.bot.username)
    except Exception as e:
        logger.critical(f"Failed to initialize Telegram application: {e}", exc_info=True)
        return None  # Signal failure to the caller

    return application
 
924
 
925
# --- Global Application Instance ---
# Build and initialize the bot once, at import time, so the Flask routes below
# can refer to `ptb_app`. It is None when setup_bot() reported a failure.
logger.info("Running bot setup at startup...")
# asyncio.run() creates a fresh event loop, runs setup_bot() to completion and
# closes that loop again.
# NOTE(review): the application is later awaited from the webhook view, i.e.
# presumably on a different event loop than the one used here - confirm that
# the bot's HTTP client tolerates this in the deployed server setup.
ptb_app: Application | None = asyncio.run(setup_bot())
logger.info(f"Bot setup finished. Application instance: {'OK' if ptb_app else 'FAILED'}")


# --- Flask App Setup (for Webhook) ---
# WSGI/ASGI entry point; the routes defined below are attached to this object.
app = Flask(__name__)
logger.info("Flask app created.")
938
+
939
@app.route('/')
def index():
    """Health-check endpoint reporting whether the bot was initialized."""
    logger.debug("Health check '/' accessed.")
    if ptb_app and ptb_app.bot:
        bot_status = "Initialized"
    else:
        bot_status = "Initialization FAILED"
    return f"Telegram Bot Webhook Listener ({bot_status}) is running."
945
 
 
946
@app.route('/webhook', methods=['POST'])
async def webhook() -> Response:
    """Receive one Telegram update via webhook and hand it to the bot.

    Returns:
        200 ``ok`` once the update has been processed;
        400 when the request is not JSON, the body cannot be parsed, or the
        payload is not a non-empty JSON object;
        500 when the bot is not initialized or processing the update raises.
    """
    if not ptb_app:
        logger.error("Webhook triggered, but Telegram Application (ptb_app) is not initialized.")
        return Response('Bot not configured correctly.', status=500)

    logger.debug("Webhook request received (POST)...")
    if not request.is_json:
        logger.warning("Received non-JSON request to webhook endpoint.")
        return Response('Bad Request: Expected JSON', status=400)

    # silent=True makes Flask return None for a malformed body instead of
    # raising its own BadRequest, which would otherwise end up in the generic
    # handler below and be reported as a 500.
    update_data = request.get_json(silent=True)
    if not isinstance(update_data, dict) or not update_data:
        logger.error("Failed to decode JSON from Telegram webhook request.")
        return Response('Bad Request: Invalid JSON', status=400)

    try:
        update = Update.de_json(update_data, ptb_app.bot)
        logger.debug(f"Processing update ID: {update.update_id} via webhook")

        # Run the registered handlers for this update before acknowledging it.
        await ptb_app.process_update(update)
        logger.debug(f"Finished processing update ID: {update.update_id}")

        return Response('ok', status=200)  # Acknowledge receipt to Telegram
    except Exception as e:
        logger.error(f"Error processing update in webhook: {e}", exc_info=True)
        # Keep error details out of the HTTP response.
        return Response('Internal Server Error processing update.', status=500)
978
 
979
+ # --- Main Execution Block (for local testing ONLY) ---
980
if __name__ == '__main__':
    # Only reached when this file is executed directly (local testing); a
    # production server imports the module and serves `app` itself.
    logger.warning("Running Flask development server directly (for local testing only).")
    if ptb_app:
        # PORT may be provided by the environment; 8080 is the local default.
        local_port = int(os.environ.get('PORT', 8080))
        logger.info(f"Flask dev server starting on http://0.0.0.0:{local_port}")
        # NOTE(review): debug=True together with host 0.0.0.0 exposes Flask's
        # debugger and reloader on every interface - keep this to trusted networks.
        app.run(host='0.0.0.0', port=local_port, debug=True)
    else:
        logger.critical("Aborting local Flask start: Telegram App (ptb_app) failed initialization.")