fmab777 commited on
Commit
e9e9e3a
·
verified ·
1 Parent(s): 3effada
Files changed (1) hide show
  1. main.py +879 -858
main.py CHANGED
@@ -1,858 +1,879 @@
1
# main.py
import os
import re
import logging
import asyncio
import json
from flask import Flask, request, Response  # For web server

from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
from telegram.ext import (
    Application,
    CommandHandler,
    MessageHandler,
    filters,
    ContextTypes,
    CallbackQueryHandler
)
from telegram.constants import ParseMode  # Import ParseMode explicitly

# Third-party scraping/transcript helpers (all pinned in requirements.txt).
from youtube_transcript_api import YouTubeTranscriptApi
import requests
from bs4 import BeautifulSoup

# Import ApifyClient only when its secret is configured; otherwise bind the
# name to None so later feature checks don't raise NameError.
if os.environ.get('APIFY_API_TOKEN'):
    from apify_client import ApifyClient
else:
    ApifyClient = None

# Applying nest_asyncio early helps avoid event-loop conflicts when the bot
# runs inside a web framework.
import nest_asyncio
nest_asyncio.apply()

# --- Environment variables (Replit Secrets) ---
TELEGRAM_TOKEN = os.environ.get('TELEGRAM_TOKEN')
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY')
URLTOTEXT_API_KEY = os.environ.get('URLTOTEXT_API_KEY')  # Optional; None if unset
SUPADATA_API_KEY = os.environ.get('SUPADATA_API_KEY')    # Optional; None if unset
APIFY_API_TOKEN = os.environ.get('APIFY_API_TOKEN')      # Optional; None if unset

# --- Logging setup ---
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO  # INFO shows processing steps; use DEBUG for more detail
)
# Quieten chatty third-party loggers.
logging.getLogger("httpx").setLevel(logging.WARNING)
if ApifyClient:  # Only configure when the client was actually imported
    logging.getLogger("apify_client").setLevel(logging.WARNING)
logging.getLogger("telegram.ext").setLevel(logging.INFO)
logging.getLogger('telegram.bot').setLevel(logging.INFO)
logging.getLogger("urllib3").setLevel(logging.INFO)  # Used by requests
logger = logging.getLogger(__name__)

# --- Input validation ---
if not TELEGRAM_TOKEN:
    # In a deployed environment exiting is unhelpful; log the critical error instead.
    logger.critical("FATAL: TELEGRAM_TOKEN environment variable not set in Replit Secrets!")
else:
    logger.info("Telegram Token loaded.")

if not OPENROUTER_API_KEY:
    # Keep running; summary requests will fail gracefully later.
    logger.error("OpenRouter API Key not set in Replit Secrets! Summarization will fail.")
else:
    logger.info("OpenRouter API Key loaded.")

# Report which optional keys are present; each one enables a fallback path.
if URLTOTEXT_API_KEY:
    logger.info("URLToText API Key found.")
else:
    logger.warning("URLToText API Key not found. Website fallback 2 unavailable.")
if SUPADATA_API_KEY:
    logger.info("Supadata API Key found.")
else:
    logger.warning("Supadata API Key not found. YT Transcript fallback 1 unavailable.")
if APIFY_API_TOKEN:
    logger.info("Apify API Token found.")
else:
    logger.warning("Apify API Token not found. YT Transcript fallback 2 unavailable.")


# --- Bot Logic Functions (Simplified Version - No Crawl4AI) ---

# Helper Functions
81
def is_youtube_url(url):
    """Return True when *url* looks like a YouTube watch/shorts/youtu.be video link."""
    # Optional scheme and www, then one of the known video path forms and an 11-char ID.
    pattern = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
    return re.search(pattern, url) is not None
85
-
86
def extract_youtube_id(url):
    """Return the 11-character YouTube video ID embedded in *url*, or None.

    Logs a warning when no ID can be found.
    """
    id_pattern = r'(?:youtube\.com/(?:watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
    if (found := re.search(id_pattern, url)) is not None:
        return found.group(1)
    logger.warning(f"Could not extract YouTube ID from URL: {url}")
    return None
94
-
95
- # Supadata Transcript Fetching
96
async def get_transcript_via_supadata(video_id: str, api_key: str):
    """Fetch a YouTube transcript through the Supadata API (transcript fallback 1).

    Returns the transcript text on success, otherwise None. Never raises:
    every failure mode is logged and converted into a None return.
    """
    # Guard clauses: both arguments are required.
    if not video_id:
        logger.error("[Supadata] get_transcript_via_supadata called with no video_id")
        return None
    if not api_key:
        logger.error("[Supadata] API key is missing.")
        return None

    logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}")
    endpoint = "https://api.supadata.net/v1/youtube/transcript"
    query = {"videoId": video_id, "format": "text"}
    auth_headers = {"X-API-Key": api_key}
    try:
        # requests is blocking; run it on a worker thread so the event loop stays free.
        response = await asyncio.to_thread(requests.get, endpoint, headers=auth_headers, params=query, timeout=30)
        logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
        if response.status_code == 200:
            try:
                data = response.json()
                # The API may return a bare string or an object with one of several keys.
                if isinstance(data, str):
                    content = data
                else:
                    content = data.get("transcript") or data.get("text") or data.get("data")
                if content and isinstance(content, str):
                    logger.info(f"[Supadata] Successfully fetched transcript for {video_id}. Length: {len(content)}")
                    return content.strip()
                logger.warning(f"[Supadata] API success but content empty/invalid for {video_id}. Response: {data}")
                return None
            except json.JSONDecodeError:
                # Some successful responses come back as plain text rather than JSON.
                if response.text:
                    logger.info(f"[Supadata] Successfully fetched transcript (plain text) for {video_id}. Length: {len(response.text)}")
                    return response.text.strip()
                logger.error(f"[Supadata] Failed to decode JSON response (and no text body) for {video_id}. Response: {response.text[:200]}...")
                return None
            except Exception as e:
                logger.error(f"[Supadata] Error processing successful response for {video_id}: {e}", exc_info=True)
                return None
        if response.status_code in (401, 403):
            # Bad credentials: retrying would not help.
            logger.error(f"[Supadata] Authentication error ({response.status_code}). Check API key.")
            return None
        if response.status_code == 404:
            logger.warning(f"[Supadata] Transcript not found ({response.status_code}) for {video_id}.")
            return None
        logger.error(f"[Supadata] Unexpected status code {response.status_code} for {video_id}. Response: {response.text[:200]}...")
        return None
    except requests.exceptions.Timeout:
        logger.error(f"[Supadata] Timeout error connecting to API for {video_id}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Supadata] Request error connecting to API for {video_id}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Supadata] Unexpected error during API call for {video_id}: {e}", exc_info=True)
        return None
146
-
147
- # Apify Transcript Fetching
148
async def get_transcript_via_apify(video_url: str, api_token: str):
    """Fetch a YouTube transcript by running an Apify actor synchronously (fallback 2).

    Returns the transcript text on success, otherwise None; all failures are
    logged rather than raised.
    """
    # Guard clauses for required inputs and the conditionally-imported client.
    if not video_url:
        logger.error("[Apify] get_transcript_via_apify called with no video_url")
        return None
    if not api_token:
        logger.error("[Apify] API token is missing.")
        return None
    if not ApifyClient:
        logger.error("[Apify] ApifyClient not available/imported.")
        return None

    logger.info(f"[Apify] Attempting fetch for URL: {video_url}")
    actor_id = "karamelo~youtube-transcripts"
    run_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
    request_body = json.dumps({
        "urls": [video_url],
        "outputFormat": "singleStringText",
        "maxRetries": 5,
        "channelHandleBoolean": False,
        "channelNameBoolean": False,
        "datePublishedBoolean": False,
        "relativeDateTextBoolean": False,
    })
    try:
        logger.debug(f"[Apify] Sending request to run actor {actor_id} synchronously for {video_url}")
        # Synchronous actor runs can take a while, hence the generous 90s timeout.
        response = await asyncio.to_thread(
            requests.post,
            run_endpoint,
            headers={"Content-Type": "application/json"},
            params={"token": api_token},
            data=request_body,
            timeout=90,
        )
        logger.debug(f"[Apify] Received status code {response.status_code} for {video_url}")
        if response.status_code == 200:
            try:
                results = response.json()
                if not (isinstance(results, list) and len(results) > 0):
                    logger.warning(f"[Apify] Actor run successful but dataset was empty for {video_url}. Response: {results}")
                    return None
                item = results[0]
                content = item.get("text") or item.get("transcript") or item.get("captions_concatenated")
                if not content and item.get("captions") and isinstance(item["captions"], list):
                    # Alternative output shape: a list of caption objects to be joined.
                    logger.info("[Apify] Processing 'captions' format.")
                    content = " ".join(cap.get("text", "") for cap in item["captions"] if cap.get("text"))
                if content and isinstance(content, str):
                    logger.info(f"[Apify] Successfully fetched transcript for {video_url}. Length: {len(content)}")
                    return content.strip()
                logger.warning(f"[Apify] Actor run successful but transcript content not found/empty in result for {video_url}. Result item: {item}")
                return None
            except json.JSONDecodeError:
                logger.error(f"[Apify] Failed to decode JSON response for {video_url}. Status: {response.status_code}. Response text: {response.text[:200]}...")
                return None
            except Exception as e:
                logger.error(f"[Apify] Error processing successful response for {video_url}: {e}", exc_info=True)
                return None
        if response.status_code == 400:
            logger.error(f"[Apify] Bad Request (400) for {video_url}. Check input payload. Response: {response.text[:200]}...")
            return None
        if response.status_code == 401:
            # Bad token: retrying would not help.
            logger.error("[Apify] Authentication error (401). Check API token.")
            return None
        logger.error(f"[Apify] Unexpected status code {response.status_code} for {video_url}. Response: {response.text[:200]}...")
        return None
    except requests.exceptions.Timeout:
        logger.error(f"[Apify] Timeout error running actor for {video_url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Apify] Request error running actor for {video_url}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Apify] Unexpected error during Apify call for {video_url}: {e}", exc_info=True)
        return None
214
-
215
- # Combined YouTube Transcript Function (with Fallbacks)
216
async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: str | None, apify_token: str | None):
    """Fetch a YouTube transcript, trying the library first, then Supadata, then Apify.

    Returns the transcript text from the first method that succeeds, or None
    when every method fails or *video_id* is missing.
    """
    if not video_id:
        logger.error("get_youtube_transcript called with no video_id")
        return None
    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
    transcript_text = None

    # Attempt 1: youtube-transcript-api library (no API key needed).
    logger.info("[Primary YT] Attempting youtube-transcript-api...")
    try:
        # The library call is synchronous; run it on a worker thread.
        transcript_list = await asyncio.to_thread(
            YouTubeTranscriptApi.get_transcript,
            video_id,
            languages=['en', 'en-GB', 'en-US']  # Prefer English variants
        )
        if transcript_list:
            transcript_text = " ".join([entry['text'] for entry in transcript_list if 'text' in entry])
            if transcript_text:
                logger.info(f"[Primary YT] Successfully fetched transcript via library for {video_id} (length: {len(transcript_text)})")
                return transcript_text
            logger.warning(f"[Primary YT] Joined transcript text is empty for {video_id}")
            transcript_text = None
        else:
            logger.warning(f"[Primary YT] Transcript list empty for {video_id}")
            transcript_text = None
    except Exception as e:
        logger.warning(f"[Primary YT] Error getting transcript via library for {video_id}: {e}")
        reason = str(e)
        if "No transcript found" in reason:
            logger.warning(f"[Primary YT] No transcript found for {video_id}. May be unavailable/private.")
        elif "disabled" in reason:
            logger.warning(f"[Primary YT] Transcripts disabled for {video_id}.")
        transcript_text = None

    # Attempt 2: Supadata API (needs a key).
    if transcript_text is None:
        logger.info("[Fallback YT 1] Primary method failed. Trying Supadata API...")
        if supadata_key:
            transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
            if transcript_text:
                logger.info(f"[Fallback YT 1] Successfully fetched transcript via Supadata for {video_id}")
                return transcript_text
            logger.warning(f"[Fallback YT 1] Supadata API failed or returned no content for {video_id}.")
        else:
            logger.warning("[Fallback YT 1] Supadata API key not available. Skipping.")

    # Attempt 3: Apify actor (needs a token).
    if transcript_text is None:
        logger.info("[Fallback YT 2] Primary & Supadata failed. Trying Apify API...")
        if apify_token:
            transcript_text = await get_transcript_via_apify(video_url, apify_token)
            if transcript_text:
                logger.info(f"[Fallback YT 2] Successfully fetched transcript via Apify for {video_url}")
                return transcript_text
            logger.warning(f"[Fallback YT 2] Apify API failed or returned no content for {video_url}.")
        else:
            logger.warning("[Fallback YT 2] Apify API token not available. Skipping.")

    if transcript_text is None:
        logger.error(f"All methods failed to fetch transcript for video ID: {video_id}")
        return None

    # Defensive: should be unreachable given the early returns above.
    return transcript_text
281
-
282
- # Website Content via Requests/BS4 (Primary Method for Simplified Bot)
283
async def get_website_content_via_requests(url):
    """Scrape readable text from a web page with requests + BeautifulSoup (primary method).

    Returns the cleaned page text, or None when fetching fails, the response
    is not HTML, or no parseable content is found.
    """
    if not url:
        logger.error("[Web Scraper - Requests/BS4] called with no URL")
        return None
    logger.info(f"[Web Scraper - Requests/BS4] Fetching website content for: {url}")
    try:
        # Browser-like headers reduce the chance of being blocked outright.
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
            'DNT': '1',  # Do Not Track
            'Upgrade-Insecure-Requests': '1'
        }
        logger.debug(f"[Web Scraper - Requests/BS4] Sending request to {url}")
        # Blocking fetch runs on a worker thread to keep the event loop responsive.
        response = await asyncio.to_thread(requests.get, url, headers=browser_headers, timeout=25, allow_redirects=True)
        response.raise_for_status()  # 4xx/5xx become RequestException, handled below
        logger.debug(f"[Web Scraper - Requests/BS4] Received response {response.status_code} from {url}")

        content_type = response.headers.get('content-type', '').lower()
        if 'html' not in content_type:
            logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received from {url}: {content_type}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')  # html.parser is built in

        # Drop boilerplate and non-text elements before extracting.
        for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]):
            element.extract()

        # Probe the usual main-content containers; first truthy match wins.
        finders = (
            lambda: soup.find('main'),
            lambda: soup.find('article'),
            lambda: soup.find(id='content'),
            lambda: soup.find(class_='content'),
            lambda: soup.find(id='main-content'),
            lambda: soup.find(class_='main-content'),
            lambda: soup.find(role='main'),
        )
        main_content = None
        for probe in finders:
            main_content = probe()
            if main_content:
                break

        # Fall back to <body> when no dedicated container exists.
        target_element = main_content if main_content else soup.body
        if not target_element:
            logger.warning(f"[Web Scraper - Requests/BS4] Could not find body or main content container for parsing {url}")
            return None

        # Strip each line and keep newlines to preserve rough structure.
        lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
        text = "\n".join(lines)

        if not text or len(text) < 50:  # Heuristic: suspiciously little content
            logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is very short or empty after cleaning for {url} (Length: {len(text)})")
            # Short text is still returned; callers decide whether it is usable.

        logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped content for {url} (final length: {len(text)})")
        return text

    except requests.exceptions.Timeout:
        logger.error(f"[Web Scraper - Requests/BS4] Timeout error scraping website: {url}")
        return None
    except requests.exceptions.TooManyRedirects:
        logger.error(f"[Web Scraper - Requests/BS4] Too many redirects error scraping website: {url}")
        return None
    except requests.exceptions.RequestException as e:
        # Covers ConnectionError, HTTPError (from raise_for_status), etc.
        logger.error(f"[Web Scraper - Requests/BS4] Request error scraping website {url}: {e}")
        return None
    except Exception as e:
        # Parsing or other unexpected failures.
        logger.error(f"[Web Scraper - Requests/BS4] Error scraping or parsing website {url}: {e}", exc_info=True)
        return None
357
-
358
- # Website Content via URLToText API (Fallback Method)
359
async def get_website_content_via_urltotext_api(url: str, api_key: str):
    """Fetch page text from the URLToText API (website scraping fallback).

    Returns the extracted content string, or None on any failure; errors are
    logged, never raised.
    """
    if not url:
        logger.error("[Web Scraper - URLToText API] called with no URL")
        return None
    if not api_key:
        logger.error("[Web Scraper - URLToText API] API key is missing.")
        return None
    logger.info(f"[Web Scraper - URLToText API] Attempting to fetch content for: {url}")
    service_endpoint = "https://urltotext.com/api/v1/urltotext/"
    request_body = json.dumps({
        "url": url,
        "output_format": "text",
        "extract_main_content": True,
        "render_javascript": True,   # Modern sites frequently need JS rendering
        "residential_proxy": False,  # Standard proxying first
    })
    request_headers = {
        "Authorization": f"Token {api_key}",
        "Content-Type": "application/json"
    }
    try:
        logger.debug(f"[Web Scraper - URLToText API] Sending request for {url}")
        response = await asyncio.to_thread(requests.post, service_endpoint, headers=request_headers, data=request_body, timeout=45)
        logger.debug(f"[Web Scraper - URLToText API] Received status code {response.status_code} for {url}")
        if response.status_code == 200:
            try:
                data = response.json()
                content = data.get("data", {}).get("content")
                credits = data.get("credits_used", "N/A")
                warning = data.get("data", {}).get("warning")
                if warning:
                    logger.warning(f"[Web Scraper - URLToText API] Warning for {url}: {warning}")
                if content:
                    logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API for {url}. Length: {len(content)}. Credits: {credits}")
                    return content.strip()
                logger.warning(f"[Web Scraper - URLToText API] API returned success but content was empty for {url}. Response: {data}")
                return None
            except json.JSONDecodeError:
                logger.error(f"[Web Scraper - URLToText API] Failed to decode JSON response for {url}. Response: {response.text[:500]}...")
                return None
            except Exception as e:
                logger.error(f"[Web Scraper - URLToText API] Error processing successful API response for {url}: {e}", exc_info=True)
                return None
        if response.status_code in (400, 402, 422, 500):
            # Known client/server error codes for this API.
            logger.error(f"[Web Scraper - URLToText API] Error {response.status_code} from API for {url}. Response: {response.text[:200]}...")
            return None
        logger.error(f"[Web Scraper - URLToText API] Unexpected status {response.status_code} from API for {url}. Response: {response.text[:200]}...")
        return None
    except requests.exceptions.Timeout:
        logger.error(f"[Web Scraper - URLToText API] Timeout error connecting to API for {url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Web Scraper - URLToText API] Request error connecting to API for {url}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call for {url}: {e}", exc_info=True)
        return None
414
-
415
- # DeepSeek Summary Function (via OpenRouter)
416
async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
    """Generate a summary of *text* using DeepSeek via the OpenRouter API.

    Args:
        text: The source content to summarise (truncated if extremely long).
        summary_type: "paragraph" for a single paragraph, anything else for
            a bullet-point summary.
        api_key: OpenRouter API key.

    Returns:
        The summary text on success, or a human-readable error message on
        any failure (this function never raises).

    Fixes applied in review:
      - The points-summary prompt contained a mojibake sequence where an em
        dash should be ("– or ���" -> "– or —"), matching the paragraph prompt.
      - The bare `except:` in the error-parsing fallback is now
        `except Exception:` so it no longer swallows KeyboardInterrupt/SystemExit.
    """
    logger.info(f"Generating {summary_type} summary using DeepSeek/OpenRouter. Input text length: {len(text)}")
    if not api_key:
        logger.error("OpenRouter API key was not provided to generate_summary.")
        return "Error: AI model configuration key (OpenRouter) is missing."

    openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
    # Check OpenRouter docs for the latest recommended free/low-cost models
    model_name = "deepseek/deepseek-chat:free"

    if summary_type == "paragraph":
        prompt = "You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be: • Clear and simple language suitable for someone unfamiliar with the topic. • Uses British English spellings throughout. • Straightforward and understandable vocabulary; avoid complex terms. • Presented as ONE SINGLE PARAGRAPH. • No more than 85 words maximum; but does not have to be exactly 85. • Considers the entire text content equally. • Uses semicolons (;) instead of em dashes (– or —). Here is the text to summarise:"
    else:  # points summary
        prompt = """You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this Markdown format:
For each distinct topic or section identified in the text, create a heading.
• Each heading MUST be enclosed in double asterisks for bolding (e.g., **Section Title**).
Immediately following each heading, list the key points as a bulleted list.
Each bullet point MUST start with a hyphen and a space (`- `) on a new line.
The text within each bullet point should NOT contain any bold formatting.
Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.
Use British English spellings throughout.
Avoid overly complex or advanced vocabulary.
Keep bullet points concise.
• Ensure the entire summary takes no more than two minutes to read.
Consider the entire text's content, not just the beginning or a few topics.
Use semicolons (;) instead of em dashes (– or —).

Here is the text to summarise:"""

    MAX_INPUT_LENGTH = 500000  # Truncate long inputs to avoid high costs/errors
    if len(text) > MAX_INPUT_LENGTH:
        logger.warning(f"Input text length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH}). Truncating.")
        text = text[:MAX_INPUT_LENGTH] + "... (Content truncated)"
    full_prompt = f"{prompt}\n\n{text}"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        # Recommended headers for OpenRouter identification
        "HTTP-Referer": "https://replit.com/",  # Identify source
        "X-Title": "Telegram Summary Bot (Replit)",  # Identify app
    }
    payload = json.dumps({
        "model": model_name,
        "messages": [
            {"role": "user", "content": full_prompt}
        ],
        # Optional: Add max_tokens if needed, check model defaults
        # "max_tokens": 1024,
    })

    try:
        logger.debug(f"Sending request to OpenRouter ({model_name})...")
        # Run blocking request in a worker thread.
        response = await asyncio.to_thread(requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=60)
        logger.debug(f"Received status code {response.status_code} from OpenRouter.")

        if response.status_code == 200:
            try:
                data = response.json()
                if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0:
                    message = data["choices"][0].get("message")
                    if message and isinstance(message, dict):
                        summary = message.get("content")
                        if summary:
                            logger.info(f"Successfully generated summary via OpenRouter. Output length: {len(summary)}")
                            return summary.strip()
                        else:
                            logger.warning(f"OpenRouter response successful, but content was empty. Response: {data}")
                            return "Sorry, the AI model returned an empty summary."
                    else:
                        logger.error(f"Unexpected message structure in OpenRouter response: {message}. Full response: {data}")
                        return "Sorry, could not parse the AI model's response (unexpected format)."
                else:
                    # Handle cases like moderation flags, empty choices list
                    if data.get("error"):
                        logger.error(f"OpenRouter API Error: {data['error']}")
                    else:
                        logger.error(f"Unexpected choices structure in OpenRouter response: {data.get('choices')}. Full response: {data}")
                    return "Sorry, could not parse the AI model's response (choices missing/invalid or API error)."

            except json.JSONDecodeError:
                logger.error(f"Failed to decode JSON response from OpenRouter. Status: {response.status_code}. Response text: {response.text[:500]}...")
                return "Sorry, failed to understand the response from the AI model."
            except Exception as e:
                logger.error(f"Error processing successful OpenRouter response: {e}", exc_info=True)
                return "Sorry, an error occurred while processing the AI model's response."

        elif response.status_code == 401:
            logger.error("OpenRouter API key is invalid (401 Unauthorized). Check Replit Secrets.")
            return "Error: The AI model configuration key (OpenRouter) is invalid."
        elif response.status_code == 402:
            logger.error("OpenRouter Payment Required (402). Check credits/limits on OpenRouter.")
            return "Sorry, there might be an issue with the AI model service limits or payment. Please try again later or check OpenRouter account."
        elif response.status_code == 429:
            logger.warning("OpenRouter Rate Limit Exceeded (429).")
            return "Sorry, the AI model is currently busy due to high demand. Please try again in a moment."
        elif response.status_code == 500:
            logger.error(f"OpenRouter Internal Server Error (500). Response: {response.text[:500]}...")
            return "Sorry, the AI model service encountered an internal error. Please try again later."
        else:
            # Handle other potential errors (e.g., 400 Bad Request, 404 Not Found for model)
            logger.error(f"Unexpected status code {response.status_code} from OpenRouter. Response: {response.text[:500]}...")
            try:  # Try to parse error message from response body
                error_data = response.json()
                error_msg = error_data.get("error", {}).get("message", response.text[:100])
                return f"Sorry, the AI model service returned an error ({response.status_code}): {error_msg}"
            except Exception:  # Fallback if parsing fails (was a bare except)
                return f"Sorry, the AI model service returned an unexpected status ({response.status_code})."

    except requests.exceptions.Timeout:
        logger.error("Timeout error connecting to OpenRouter API.")
        return "Sorry, the request to the AI model timed out. Please try again."
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error connecting to OpenRouter API: {e}")
        return "Sorry, there was an error connecting to the AI model service."
    except Exception as e:
        logger.error(f"Unexpected error in generate_summary (OpenRouter): {e}", exc_info=True)
        return "Sorry, an unexpected error occurred while generating the summary."
534
-
535
-
536
- # --- Telegram Bot Handlers (Command, Message, CallbackQuery) ---
537
-
538
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Reply to the /start command with a short welcome message."""
    user = update.effective_user
    logger.info(f"User {user.id} ({user.username or 'NoUsername'}) used /start.")
    # Use an HTML mention when a username exists; otherwise fall back to first name.
    if user.username:
        mention = user.mention_html()
    else:
        mention = user.first_name
    await update.message.reply_html(
        f"👋 Hello {mention}! I can summarize YouTube links or website URLs.\n\n"
        "Just send me a link anytime!",
    )
548
-
549
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Reply to the /help command with Markdown-formatted usage instructions."""
    logger.info(f"User {update.effective_user.id} used /help.")
    help_text = (
        "🔍 **How to use this bot:**\n\n"
        "1. Send me any YouTube video link or website URL.\n"
        "2. I'll ask how you want it summarized (paragraph or points).\n"
        "3. Click the button for your choice.\n"
        "4. Wait for the summary!\n\n"
        "I use multiple methods if the first fails (especially for YT transcripts & website content).\n\n"
        "**Commands:**\n"
        "/start - Display welcome message\n"
        "/help - Show this help message"
    )
    await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
564
-
565
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle plain text messages: detect a URL, store it, and ask for a summary type.

    Fix: the previous validation required a dot in ``url[8:]``. Because
    ``http://`` is only 7 characters long, that slice skipped the first host
    character for http links. The check now looks for a dot anywhere after
    the scheme separator instead.
    """
    if not update.message or not update.message.text:
        return  # Ignore empty/non-text updates
    url = update.message.text.strip()
    user = update.effective_user
    logger.info(f"User {user.id} ({user.username or 'NoUsername'}) sent potential URL: {url}")

    # Basic URL validation: http(s) scheme plus a dot somewhere in the remainder.
    if not url.startswith(('http://', 'https://')) or '.' not in url.split('://', 1)[-1]:
        logger.debug(f"Ignoring non-URL message from user {user.id}: {url}")
        # Optional: reply here to guide the user toward sending a valid URL.
        return

    # Remember the URL so the button callback can retrieve it (simple state management).
    context.user_data['url_to_summarize'] = url
    logger.debug(f"Stored URL '{url}' for user {user.id} in user_data")

    # Offer the two summary styles via an inline keyboard.
    keyboard = [
        [
            InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"),
            InlineKeyboardButton("Points Summary", callback_data="points")
        ]
    ]
    reply_markup = InlineKeyboardMarkup(keyboard)
    await update.message.reply_text(
        f"Okay, I see this link:\n{url}\n\nHow would you like it summarized?",
        reply_markup=reply_markup,
        disable_web_page_preview=True
    )
596
-
597
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handles button presses for summary type selection.

    Flow: acknowledge the callback, recover the URL stored by
    handle_potential_url, re-read API keys from the environment, fetch content
    (YouTube transcript with fallbacks, or scraped website text with a
    URLToText fallback), summarize it via OpenRouter, and finally clean up the
    status/button message.
    """
    query = update.callback_query
    if not query: return
    await query.answer() # Acknowledge button press immediately

    summary_type = query.data  # "paragraph" or "points" (set by the inline keyboard)
    user = update.effective_user or query.from_user # Get user info
    url = context.user_data.get('url_to_summarize', None) # Retrieve stored URL

    logger.info(f"User {user.id} chose '{summary_type}' summary. Checking for URL '{url}' in context.")

    # Check if URL is still in context (it might expire or be lost)
    if not url:
        logger.warning(f"User {user.id} pressed button, but NO URL found in user_data context.")
        try:
            # Edit the message where the button was, informing the user
            await query.edit_message_text(text="Sorry, I couldn't find the URL associated with this request. Please send the link again.")
        except Exception as edit_err:
            # If editing fails (e.g., message too old), log it but don't crash
            logger.warning(f"Failed to edit message for missing URL context: {edit_err}")
        return # Stop processing if URL is missing

    # Clear the URL from context now that we're processing it
    context.user_data.pop('url_to_summarize', None)
    logger.debug(f"Retrieved and cleared URL {url} from user_data for user {user.id}")

    # --- Get API Keys (Read fresh from environment - cheap operation) ---
    # This ensures that if secrets are updated, the next request uses them.
    current_openrouter_key = os.environ.get('OPENROUTER_API_KEY')
    current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
    current_supadata_key = os.environ.get('SUPADATA_API_KEY')
    current_apify_token = os.environ.get('APIFY_API_TOKEN')

    # Check *essential* key for summarization; the others are optional fallbacks.
    if not current_openrouter_key:
        logger.error("OpenRouter API key missing in handler. Cannot generate summary.")
        # Inform user and clean up the button message
        await context.bot.send_message(chat_id=user.id, text="Error: AI model configuration key (OpenRouter) is missing. Cannot generate summary.")
        try: await query.delete_message() # Delete the message with buttons
        except Exception: pass
        return

    # --- Start Processing ---
    processing_message = f"Got it! Generating '{summary_type}' summary for:\n{url}\n\nThis might take a moment..."
    message_to_delete_later = None # In case editing fails
    try:
        # Edit the message to show processing status
        await query.edit_message_text(processing_message)
    except Exception as e:
        # If editing fails (e.g., message too old), send a new status message
        logger.warning(f"Could not edit original message: {e}, sending new status message.")
        try:
            message_to_delete_later = await context.bot.send_message(chat_id=user.id, text=processing_message)
        except Exception as send_err:
            # If even sending fails, log and give up on this request
            logger.error(f"Failed to send status message after edit failure: {send_err}")
            return

    content = None
    user_feedback_message = None # Stores error messages for the user
    success = False
    is_youtube = is_youtube_url(url)

    try:
        # Show "typing..." status in Telegram chat
        await context.bot.send_chat_action(chat_id=user.id, action='typing')

        # --- Content Fetching Logic ---
        if is_youtube:
            video_id = extract_youtube_id(url)
            if video_id:
                # Fetch YT transcript using the function with fallbacks
                content = await get_youtube_transcript(
                    video_id,
                    url, # Pass full URL for Apify
                    current_supadata_key,
                    current_apify_token
                )
                # Set feedback message only if content fetching failed
                user_feedback_message = None if content else "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
            else:
                user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
        else: # Website Logic (Requests/BS4 -> URLToText API)
            logger.info(f"Attempting website scrape (Requests/BS4) for {url}")
            content = await get_website_content_via_requests(url)

            if content:
                logger.info("Primary website scraping (Requests/BS4) successful.")
                user_feedback_message = None
            else:
                logger.warning(f"Primary web scraping failed for {url}. Attempting fallback API (URLToText).")
                if current_urltotext_key:
                    await context.bot.send_chat_action(chat_id=user.id, action='typing') # Show activity for fallback
                    content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
                    if content:
                        user_feedback_message = None
                        logger.info("Fallback URLToText API scraping successful.")
                    else:
                        user_feedback_message = "Sorry, I couldn't fetch content from that website using either method (blocked/inaccessible/empty?)."
                        logger.error(f"Both primary (Requests/BS4) and fallback API failed for website {url}.")
                else:
                    # Primary failed, and fallback key is missing
                    user_feedback_message = "Sorry, I couldn't fetch content from that website (blocked/inaccessible/empty?). The fallback method is not configured."
                    logger.warning(f"Primary web scraping failed for {url}, and fallback API key (URLToText) is missing.")
        # --- End Content Fetching ---

        # --- Generate Summary if Content was Fetched ---
        if content:
            logger.info("Content fetched successfully, proceeding to generate summary.")
            await context.bot.send_chat_action(chat_id=user.id, action='typing')
            # Pass the OpenRouter key to the summary function
            summary = await generate_summary(content, summary_type, current_openrouter_key)

            # generate_summary signals failure by returning a message that
            # starts with "Error:" or "Sorry," rather than raising.
            if summary.startswith("Error:") or summary.startswith("Sorry,"):
                user_feedback_message = summary # Use the error from the summary function
                success = False
                logger.warning(f"Summary generation failed or returned error: {summary}")
            else:
                # Send the successful summary
                await context.bot.send_message(
                    chat_id=user.id,
                    text=summary,
                    parse_mode=ParseMode.MARKDOWN,
                    disable_web_page_preview=True
                )
                success = True
                user_feedback_message = None # Clear any previous failure message from fetching stage
        elif not user_feedback_message:
            # If content is None but no specific error message was set during fetching
            user_feedback_message = "Sorry, couldn't retrieve any content to summarize from the provided link."
            logger.warning(f"Content fetching resulted in None for {url}, but no specific user feedback message was set.")

        # --- Send Feedback if any step failed ---
        if user_feedback_message and not success:
            await context.bot.send_message(chat_id=user.id, text=user_feedback_message)

    except Exception as e:
        # Catch unexpected errors during the whole process
        logger.error(f"Unexpected error during processing callback for {url}: {e}", exc_info=True)
        try:
            # Send a generic error message to the user
            await context.bot.send_message(chat_id=user.id, text="Oops! Something went really wrong while processing your request. Please try again later.")
        except Exception as final_err:
            # If even sending the error message fails... log it.
            logger.error(f"Failed to send final error message to user {user.id}: {final_err}")
    finally:
        # --- Cleanup ---
        # Delete the "Processing..." status message or the original message with buttons
        try:
            if message_to_delete_later: # If we sent a separate status message
                await context.bot.delete_message(chat_id=user.id, message_id=message_to_delete_later.message_id)
            elif query: # Otherwise, delete the original message with the buttons
                # We might have already edited it, but deleting ensures cleanup
                await query.delete_message()
        except Exception as del_e:
            # Log if deletion fails, but don't let it stop anything
            logger.warning(f"Could not delete status/button message: {del_e}")
758
-
759
-
760
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Global PTB error handler.

    Logs the exception that PTB attached to ``context.error`` along with a
    traceback. Registered via ``add_error_handler``; intentionally does not
    message the user (a developer notification could be added here later).
    """
    logger.error("Exception while handling an update:", exc_info=context.error)
774
-
775
-
776
# --- Initialize Telegram Bot Application ---
# This setup runs once at import time, before the Flask server starts.
logger.info("Initializing Telegram Application...")
if not TELEGRAM_TOKEN: # Check again, as initialization needs it
    logger.critical("Cannot initialize PTB Application without TELEGRAM_TOKEN.")
    # ptb_app stays None so the webhook endpoint can answer 500 instead of
    # crashing; the Flask app itself can still start and log errors.
    ptb_app = None
else:
    ptb_app = Application.builder().token(TELEGRAM_TOKEN).build()

    # Register handlers with the PTB application instance.
    # Handler order matters: commands are matched before the generic text handler.
    ptb_app.add_handler(CommandHandler("start", start))
    ptb_app.add_handler(CommandHandler("help", help_command))
    ptb_app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
    ptb_app.add_handler(CallbackQueryHandler(handle_summary_type_callback))
    # Add the error handler
    ptb_app.add_error_handler(error_handler)
    logger.info("Telegram handlers registered.")


# --- Flask App Setup ---
app = Flask(__name__) # Create Flask web server instance
logger.info("Flask app created.")
799
-
800
-
801
# --- Webhook Endpoint ---
@app.route('/webhook', methods=['POST'])
async def webhook() -> Response:
    """Webhook endpoint to receive updates from Telegram.

    Deserializes the incoming JSON into a PTB ``Update``, schedules it for
    processing in the background, and replies 200 immediately so Telegram
    does not retry. Non-JSON requests and deserialization failures get 4xx.
    """
    logger.info("Webhook received request...")
    if not ptb_app: # Check if PTB initialization failed
        logger.error("Telegram application not initialized. Cannot process update.")
        return Response('Bot not configured', status=500)

    if request.is_json:
        try:
            update_data = request.get_json()
            # Use PTB's built-in deserialization
            update = Update.de_json(update_data, ptb_app.bot)
            logger.debug(f"Processing update ID: {update.update_id}")

            # Process the update using PTB's internal dispatcher in a
            # background task so Flask can respond to Telegram quickly.
            # NOTE(review): ptb_app.initialize()/start() is never called in
            # this file, and the task is fire-and-forget — confirm that
            # process_update works reliably under nest_asyncio in this setup.
            asyncio.create_task(ptb_app.process_update(update))

            # Respond quickly to Telegram that we received the update
            return Response('ok', status=200)
        except json.JSONDecodeError:
            logger.error("Failed to decode JSON from Telegram webhook.")
            return Response('Bad Request - Invalid JSON', status=400)
        except Exception as e:
            logger.error(f"Error processing update in webhook handler: {e}", exc_info=True)
            return Response('Internal Server Error', status=500)
    else:
        logger.warning("Received non-JSON request to webhook endpoint.")
        return Response('Bad Request - Expected JSON', status=400)
835
-
836
@app.route('/')
def index():
    """A simple health check endpoint for the web server."""
    logger.debug("Health check endpoint '/' accessed.")
    # Report whether the PTB Application was built at import time
    # (ptb_app is None when TELEGRAM_TOKEN was missing).
    bot_status = "PTB App Initialized" if ptb_app else "PTB App FAILED Initialization"
    return f"Hello! Telegram Bot Webhook Listener ({bot_status}) is running."
843
-
844
-
845
# --- Main Execution Block ---
# Runs the Flask web server when this file is executed directly
# (`python main.py`); under Gunicorn this block is skipped.
if __name__ == '__main__':
    if not ptb_app:
        logger.critical("Aborting Flask server start because Telegram Application failed to initialize (Missing Token?).")
    else:
        logger.info("Starting Flask web server for Telegram bot webhook...")
        # The hosting platform injects PORT and expects binding on 0.0.0.0;
        # fall back to 8080 when PORT is unset.
        port = int(os.environ.get('PORT', 8080))
        # app.run is adequate for development/simple deployment; production
        # setups typically front this with Gunicorn/Uvicorn. debug=False
        # disables the reloader and avoids leaking stack traces to clients.
        app.run(host='0.0.0.0', port=port, debug=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py (Revised for Hugging Face - Corrected Logs & Added Debugging)
2
+ import os
3
+ import re
4
+ import logging
5
+ import asyncio
6
+ import json
7
+ from flask import Flask, request, Response # For web server
8
+
9
+ from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
10
+ from telegram.ext import (
11
+ Application,
12
+ CommandHandler,
13
+ MessageHandler,
14
+ filters,
15
+ ContextTypes,
16
+ CallbackQueryHandler
17
+ )
18
+ from telegram.constants import ParseMode # Import ParseMode explicitly
19
+
20
+ # Import specific libraries (Ensure these are covered in requirements.txt)
21
+ from youtube_transcript_api import YouTubeTranscriptApi
22
+ import requests
23
+ from bs4 import BeautifulSoup
24
+ # Only import ApifyClient if you might use it (i.e., have the secret)
25
+ # Check environment variable existence *before* conditional import
26
+ _apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
27
+ if _apify_token_exists:
28
+ from apify_client import ApifyClient
29
+ else:
30
+ ApifyClient = None # Define it as None if not used, to avoid errors later
31
+
32
+ # Apply nest_asyncio early, can help prevent event loop conflicts in web frameworks
33
+ import nest_asyncio
34
+ nest_asyncio.apply()
35
+
36
# --- Logging Setup ---
# Root logger at INFO shows each processing step; switch to DEBUG for finer detail.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO # Set to INFO to see processing steps, DEBUG for finer detail
)
# Reduce noise from libraries
logging.getLogger("httpx").setLevel(logging.WARNING)
if ApifyClient: # Only set level if imported
    logging.getLogger("apify_client").setLevel(logging.WARNING)
logging.getLogger("telegram.ext").setLevel(logging.INFO)
logging.getLogger('telegram.bot').setLevel(logging.INFO)
logging.getLogger("urllib3").setLevel(logging.INFO) # From requests
logging.getLogger('gunicorn.error').setLevel(logging.INFO) # Gunicorn logs
logger = logging.getLogger(__name__)
logger.info("Logging configured.")

# --- Environment Variable Loading & Debugging ---
logger.info("Attempting to load secrets from environment variables...")
54
+
55
def get_secret(secret_name):
    """Fetch a secret from the environment, logging whether it was present.

    Only the value's length is logged on success, never the value itself.
    Returns the raw string, or None when the variable is unset.
    """
    logger.debug(f"Attempting to read secret: {secret_name}")
    value = os.environ.get(secret_name)
    if not value:
        # Covers both "unset" (None) and "set but empty".
        logger.warning(f"Secret '{secret_name}': Not Found")
        return value
    logger.info(f"Secret '{secret_name}': Found (Value length: {len(value)})") # Log length, not value itself
    return value
64
+
65
# Load every secret up front so missing configuration is visible in the logs
# immediately at startup. Optional keys stay None and simply disable fallbacks.
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY')
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY') # Will be None if not set in Secrets
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY') # Will be None if not set in Secrets
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN') # Will be None if not set in Secrets

logger.info("Secret loading attempt finished.")

# --- Initial Validation ---
# Perform validation *after* attempting to load all secrets.
if not TELEGRAM_TOKEN:
    logger.critical("FATAL: TELEGRAM_TOKEN environment variable not found or empty!")
    # In a deployed environment, exiting is not helpful — the process keeps
    # running so the health endpoint can report the failure.
    # exit("Telegram Token Missing")
else:
    logger.info("TELEGRAM_TOKEN seems present.")

if not OPENROUTER_API_KEY:
    # Log error but allow running; summaries will just fail gracefully later.
    logger.error("OPENROUTER_API_KEY not found or empty! Summarization will fail.")
else:
    logger.info("OPENROUTER_API_KEY seems present.")

# (Optional checks log warnings if keys were not found by get_secret)
90
+
91
+ # --- Bot Logic Functions (Simplified Version - No Crawl4AI) ---
92
+ # (Functions: is_youtube_url, extract_youtube_id, get_transcript_via_supadata,
93
+ # get_transcript_via_apify, get_youtube_transcript, get_website_content_via_requests,
94
+ # get_website_content_via_urltotext_api, generate_summary remain EXACTLY THE SAME
95
+ # as in the previous complete main.py code block. Ensure they are included here.)
96
+
97
+ # --- [PASTE ALL BOT LOGIC FUNCTIONS HERE - FROM is_youtube_url to generate_summary ] ---
98
+ # Helper Functions
99
def is_youtube_url(url):
    """Return True when *url* points at a YouTube video or shorts page.

    Accepts youtube.com/watch?v=, youtube.com/shorts/ and youtu.be/ forms,
    with or without scheme and "www.", requiring an 11-character video ID.
    """
    pattern = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
    return re.search(pattern, url) is not None
103
+
104
def extract_youtube_id(url):
    """Pull the 11-character video ID out of a YouTube URL.

    Returns the ID string on success; logs a warning and returns None when
    the URL does not match any recognized YouTube form.
    """
    match = re.search(r'(?:youtube\.com/(?:watch\?v=|shorts/)|youtu\.be/)([\w-]{11})', url)
    if match is None:
        logger.warning(f"Could not extract YouTube ID from URL: {url}")
        return None
    return match.group(1)
112
+
113
# Supadata Transcript Fetching
async def get_transcript_via_supadata(video_id: str, api_key: str):
    """Fetches YouTube transcript via Supadata API.

    Returns the stripped transcript text, or None on any failure (bad key,
    404, timeout, malformed response). Never raises. The blocking HTTP call
    is moved off the event loop via asyncio.to_thread.
    """
    if not video_id: logger.error("[Supadata] get_transcript_via_supadata called with no video_id"); return None
    if not api_key: logger.error("[Supadata] API key is missing."); return None # Already checked before calling
    logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}")
    api_endpoint = f"https://api.supadata.net/v1/youtube/transcript"
    params = {"videoId": video_id, "format": "text"}
    headers = {"X-API-Key": api_key}
    try:
        # Use asyncio.to_thread to run blocking requests.get in a separate thread
        response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30)
        logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
        if response.status_code == 200:
            try:
                data = response.json()
                # The response shape is not pinned down: accept a bare string
                # or any of several candidate keys in a JSON object.
                content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
                if content and isinstance(content, str):
                    logger.info(f"[Supadata] Successfully fetched transcript for {video_id}. Length: {len(content)}")
                    return content.strip()
                else:
                    logger.warning(f"[Supadata] API success but content empty/invalid for {video_id}. Response: {data}")
                    return None
            except json.JSONDecodeError: # Handle cases where API might return plain text on success
                if response.text:
                    logger.info(f"[Supadata] Successfully fetched transcript (plain text) for {video_id}. Length: {len(response.text)}")
                    return response.text.strip()
                else:
                    logger.error(f"[Supadata] Failed to decode JSON response (and no text body) for {video_id}. Response: {response.text[:200]}...")
                    return None
            except Exception as e:
                logger.error(f"[Supadata] Error processing successful response for {video_id}: {e}", exc_info=True)
                return None
        elif response.status_code in [401, 403]:
            logger.error(f"[Supadata] Authentication error ({response.status_code}). Check API key.")
            return None # Don't retry if key is bad
        elif response.status_code == 404:
            logger.warning(f"[Supadata] Transcript not found ({response.status_code}) for {video_id}.")
            return None
        else:
            logger.error(f"[Supadata] Unexpected status code {response.status_code} for {video_id}. Response: {response.text[:200]}...")
            return None
    except requests.exceptions.Timeout:
        logger.error(f"[Supadata] Timeout error connecting to API for {video_id}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Supadata] Request error connecting to API for {video_id}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Supadata] Unexpected error during API call for {video_id}: {e}", exc_info=True)
        return None
164
+
165
# Apify Transcript Fetching
async def get_transcript_via_apify(video_url: str, api_token: str):
    """Fetches YouTube transcript via Apify API.

    Runs the "karamelo~youtube-transcripts" actor synchronously and reads the
    resulting dataset. Returns the stripped transcript text, or None on any
    failure. Never raises. Longer timeout (90s) because the actor itself must
    finish before the endpoint responds.
    """
    if not video_url: logger.error("[Apify] get_transcript_via_apify called with no video_url"); return None
    if not api_token: logger.error("[Apify] API token is missing."); return None # Already checked
    if not ApifyClient: logger.error("[Apify] ApifyClient not available/imported."); return None

    logger.info(f"[Apify] Attempting fetch for URL: {video_url}")
    actor_id = "karamelo~youtube-transcripts"
    api_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
    params = {"token": api_token}
    payload = json.dumps({
        "urls": [video_url],
        "outputFormat": "singleStringText",
        "maxRetries": 5,
        "channelHandleBoolean": False,
        "channelNameBoolean": False,
        "datePublishedBoolean": False,
        "relativeDateTextBoolean": False,
    })
    headers = {"Content-Type": "application/json"}
    try:
        logger.debug(f"[Apify] Sending request to run actor {actor_id} synchronously for {video_url}")
        response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, params=params, data=payload, timeout=90) # Longer timeout for actor run
        logger.debug(f"[Apify] Received status code {response.status_code} for {video_url}")
        if response.status_code == 200:
            try:
                results = response.json()
                if isinstance(results, list) and len(results) > 0:
                    item = results[0]
                    # Actor output keys vary; try the known candidates in order.
                    content = item.get("text") or item.get("transcript") or item.get("captions_concatenated")
                    if not content and item.get("captions") and isinstance(item["captions"], list): # Handle 'captions' format if primary keys fail
                        logger.info("[Apify] Processing 'captions' format.")
                        content = " ".join(cap.get("text", "") for cap in item["captions"] if cap.get("text"))
                    if content and isinstance(content, str):
                        logger.info(f"[Apify] Successfully fetched transcript for {video_url}. Length: {len(content)}")
                        return content.strip()
                    else:
                        logger.warning(f"[Apify] Actor run successful but transcript content not found/empty in result for {video_url}. Result item: {item}")
                        return None
                else:
                    logger.warning(f"[Apify] Actor run successful but dataset was empty for {video_url}. Response: {results}")
                    return None
            except json.JSONDecodeError:
                logger.error(f"[Apify] Failed to decode JSON response for {video_url}. Status: {response.status_code}. Response text: {response.text[:200]}...")
                return None
            except Exception as e:
                logger.error(f"[Apify] Error processing successful response for {video_url}: {e}", exc_info=True)
                return None
        elif response.status_code == 400:
            logger.error(f"[Apify] Bad Request (400) for {video_url}. Check input payload. Response: {response.text[:200]}...")
            return None
        elif response.status_code == 401:
            logger.error("[Apify] Authentication error (401). Check API token.")
            return None # Don't retry if token is bad
        else:
            logger.error(f"[Apify] Unexpected status code {response.status_code} for {video_url}. Response: {response.text[:200]}...")
            return None
    except requests.exceptions.Timeout:
        logger.error(f"[Apify] Timeout error running actor for {video_url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Apify] Request error running actor for {video_url}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Apify] Unexpected error during Apify call for {video_url}: {e}", exc_info=True)
        return None
232
+
233
# Combined YouTube Transcript Function (with Fallbacks)
async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: str | None, apify_token: str | None):
    """Fetches YouTube transcript using library, then Supadata, then Apify.

    Tries each source in order and returns the first non-empty transcript;
    returns None when all three fail. Fallbacks whose keys are None are
    skipped with a warning.
    """
    if not video_id: logger.error("get_youtube_transcript called with no video_id"); return None
    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
    transcript_text = None

    # 1. Primary Method: youtube-transcript-api
    logger.info("[Primary YT] Attempting youtube-transcript-api...")
    try:
        # Run synchronous library call in a thread
        transcript_list = await asyncio.to_thread(
            YouTubeTranscriptApi.get_transcript,
            video_id,
            languages=['en', 'en-GB', 'en-US'] # Prioritize English variations
        )
        if transcript_list:
            transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
            if transcript_text:
                logger.info(f"[Primary YT] Successfully fetched transcript via library for {video_id} (length: {len(transcript_text)})")
                return transcript_text # Return immediately on success
            else:
                logger.warning(f"[Primary YT] Joined transcript text is empty for {video_id}")
                transcript_text = None # Ensure it's None if empty after join
        else:
            logger.warning(f"[Primary YT] Transcript list empty for {video_id}")
            transcript_text = None
    except Exception as e:
        # The library signals "no transcript"/"disabled" via exception text,
        # so the message is inspected only for more specific logging.
        logger.warning(f"[Primary YT] Error getting transcript via library for {video_id}: {e}")
        if "No transcript found" in str(e): logger.warning(f"[Primary YT] No transcript found for {video_id}. May be unavailable/private.")
        elif "disabled" in str(e): logger.warning(f"[Primary YT] Transcripts disabled for {video_id}.")
        transcript_text = None # Ensure it's None on error

    # 2. Fallback 1: Supadata API
    if transcript_text is None:
        logger.info("[Fallback YT 1] Primary method failed. Trying Supadata API...")
        if supadata_key:
            transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
            if transcript_text:
                logger.info(f"[Fallback YT 1] Successfully fetched transcript via Supadata for {video_id}")
                return transcript_text # Return on success
            else:
                logger.warning(f"[Fallback YT 1] Supadata API failed or returned no content for {video_id}.")
        else:
            logger.warning("[Fallback YT 1] Supadata API key not available. Skipping.")

    # 3. Fallback 2: Apify API (needs the full URL, not just the ID)
    if transcript_text is None:
        logger.info("[Fallback YT 2] Primary & Supadata failed. Trying Apify API...")
        if apify_token:
            transcript_text = await get_transcript_via_apify(video_url, apify_token)
            if transcript_text:
                logger.info(f"[Fallback YT 2] Successfully fetched transcript via Apify for {video_url}")
                return transcript_text # Return on success
            else:
                logger.warning(f"[Fallback YT 2] Apify API failed or returned no content for {video_url}.")
        else:
            logger.warning("[Fallback YT 2] Apify API token not available. Skipping.")

    # If all methods failed
    if transcript_text is None:
        logger.error(f"All methods failed to fetch transcript for video ID: {video_id}")
        return None

    # Should not be reached if logic above is correct, but as a safeguard
    return transcript_text
299
+
300
# Website Content via Requests/BS4 (Primary Method for Simplified Bot)
async def get_website_content_via_requests(url):
    """Attempts to scrape website content using requests/BeautifulSoup (Primary Method).

    Fetches the page with browser-like headers, strips non-content tags,
    prefers a recognizable main-content container, and returns the cleaned
    text (newline-joined). Returns None for non-HTML responses or on any
    request/parse failure. Never raises.
    """
    if not url: logger.error("[Web Scraper - Requests/BS4] called with no URL"); return None
    logger.info(f"[Web Scraper - Requests/BS4] Fetching website content for: {url}")
    try:
        # Browser-like headers reduce the chance of trivial bot blocking.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', # Updated UA
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
            'DNT': '1', # Do Not Track
            'Upgrade-Insecure-Requests': '1'
        }
        logger.debug(f"[Web Scraper - Requests/BS4] Sending request to {url}")
        # Run blocking I/O in a separate thread
        response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
        response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
        logger.debug(f"[Web Scraper - Requests/BS4] Received response {response.status_code} from {url}")

        content_type = response.headers.get('content-type', '').lower()
        if 'html' not in content_type:
            logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received from {url}: {content_type}")
            return None # Don't try to parse non-html

        # Use html.parser, it's built-in
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove common unwanted tags more aggressively (chrome, media, forms)
        for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]):
            element.extract()

        # Try finding common main content containers, most specific first
        main_content = soup.find('main') or \
                       soup.find('article') or \
                       soup.find(id='content') or \
                       soup.find(class_='content') or \
                       soup.find(id='main-content') or \
                       soup.find(class_='main-content') or \
                       soup.find(role='main')

        # Fallback to body if no specific container found
        target_element = main_content if main_content else soup.body

        if not target_element:
            logger.warning(f"[Web Scraper - Requests/BS4] Could not find body or main content container for parsing {url}")
            return None # Nothing to parse

        # Get text, joining lines smartly and dropping blank lines
        lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
        text = "\n".join(lines) # Join with newlines to preserve some structure

        # Basic length check — very short extractions are logged but still returned
        if not text or len(text) < 50: # Arbitrary short length check
            logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is very short or empty after cleaning for {url} (Length: {len(text)})")
            # Consider returning None if too short, depends on use case
            # return None

        logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped content for {url} (final length: {len(text)})")
        return text

    except requests.exceptions.Timeout:
        logger.error(f"[Web Scraper - Requests/BS4] Timeout error scraping website: {url}")
        return None
    except requests.exceptions.TooManyRedirects:
        logger.error(f"[Web Scraper - Requests/BS4] Too many redirects error scraping website: {url}")
        return None
    except requests.exceptions.RequestException as e:
        # This catches ConnectTimeout, HTTPError, ConnectionError etc.
        logger.error(f"[Web Scraper - Requests/BS4] Request error scraping website {url}: {e}")
        return None
    except Exception as e:
        # Catch-all for unexpected errors during parsing etc.
        logger.error(f"[Web Scraper - Requests/BS4] Error scraping or parsing website {url}: {e}", exc_info=True)
        return None
375
+
376
# Website Content via URLToText API (Fallback Method)
async def get_website_content_via_urltotext_api(url: str, api_key: str):
    """Fetch website text through the URLToText API (fallback scraper).

    Returns the extracted content as a stripped string, or None on any
    failure (missing inputs, non-200 status, bad JSON, empty content).
    """
    if not url:
        logger.error("[Web Scraper - URLToText API] called with no URL")
        return None
    if not api_key:
        logger.error("[Web Scraper - URLToText API] API key is missing.")
        return None

    logger.info(f"[Web Scraper - URLToText API] Attempting to fetch content for: {url}")
    endpoint = "https://urltotext.com/api/v1/urltotext/"
    request_headers = {
        "Authorization": f"Token {api_key}",
        "Content-Type": "application/json",
    }
    request_body = json.dumps({
        "url": url,
        "output_format": "text",
        "extract_main_content": True,
        "render_javascript": True,   # Often needed for modern sites
        "residential_proxy": False,  # Start with standard
    })

    try:
        logger.debug(f"[Web Scraper - URLToText API] Sending request for {url}")
        # requests is blocking; run it in a worker thread to keep the loop free.
        response = await asyncio.to_thread(
            requests.post, endpoint, headers=request_headers, data=request_body, timeout=45
        )
        logger.debug(f"[Web Scraper - URLToText API] Received status code {response.status_code} for {url}")

        if response.status_code != 200:
            # Known client/server errors vs. anything else: same outcome (None),
            # different log line for easier triage.
            if response.status_code in (400, 402, 422, 500):
                logger.error(f"[Web Scraper - URLToText API] Error {response.status_code} from API for {url}. Response: {response.text[:200]}...")
            else:
                logger.error(f"[Web Scraper - URLToText API] Unexpected status {response.status_code} from API for {url}. Response: {response.text[:200]}...")
            return None

        try:
            data = response.json()
            content = data.get("data", {}).get("content")
            credits = data.get("credits_used", "N/A")
            warning = data.get("data", {}).get("warning")
            if warning:
                logger.warning(f"[Web Scraper - URLToText API] Warning for {url}: {warning}")
            if not content:
                logger.warning(f"[Web Scraper - URLToText API] API returned success but content was empty for {url}. Response: {data}")
                return None
            logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API for {url}. Length: {len(content)}. Credits: {credits}")
            return content.strip()
        except json.JSONDecodeError:
            logger.error(f"[Web Scraper - URLToText API] Failed to decode JSON response for {url}. Response: {response.text[:500]}...")
            return None
        except Exception as e:
            logger.error(f"[Web Scraper - URLToText API] Error processing successful API response for {url}: {e}", exc_info=True)
            return None
    except requests.exceptions.Timeout:
        logger.error(f"[Web Scraper - URLToText API] Timeout error connecting to API for {url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Web Scraper - URLToText API] Request error connecting to API for {url}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call for {url}: {e}", exc_info=True)
        return None
432
+
433
# DeepSeek Summary Function (via OpenRouter)
async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
    """Generates summary using DeepSeek via OpenRouter API.

    Args:
        text: Raw content to summarise; truncated to MAX_INPUT_LENGTH chars.
        summary_type: "paragraph" for a single paragraph; any other value
            (callers pass "points") yields a bulleted Markdown summary.
        api_key: OpenRouter API key; if falsy an error string is returned.

    Returns:
        The summary text on success, otherwise a human-readable message
        starting with "Error:" or "Sorry," — callers rely on these prefixes
        to detect failure.
    """
    logger.info(f"Generating {summary_type} summary using DeepSeek/OpenRouter. Input text length: {len(text)}")
    if not api_key:
        logger.error("OpenRouter API key was not provided to generate_summary.")
        return "Error: AI model configuration key (OpenRouter) is missing."

    openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
    # Check OpenRouter docs for the latest recommended free/low-cost models
    model_name = "deepseek/deepseek-chat:free"

    if summary_type == "paragraph":
        prompt = "You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be: • Clear and simple language suitable for someone unfamiliar with the topic. • Uses British English spellings throughout. • Straightforward and understandable vocabulary; avoid complex terms. • Presented as ONE SINGLE PARAGRAPH. • No more than 85 words maximum; but does not have to be exactly 85. • Considers the entire text content equally. • Uses semicolons (;) instead of em dashes (– or —). Here is the text to summarise:"
    else: # points summary
        prompt = """You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this Markdown format:
For each distinct topic or section identified in the text, create a heading.
Each heading MUST be enclosed in double asterisks for bolding (e.g., **Section Title**).
• Immediately following each heading, list the key points as a bulleted list.
Each bullet point MUST start with a hyphen and a space (`- `) on a new line.
The text within each bullet point should NOT contain any bold formatting.
Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.
Use British English spellings throughout.
Avoid overly complex or advanced vocabulary.
Keep bullet points concise.
• Ensure the entire summary takes no more than two minutes to read.
Consider the entire text's content, not just the beginning or a few topics.
Use semicolons (;) instead of em dashes (– or —).

Here is the text to summarise:"""

    MAX_INPUT_LENGTH = 500000 # Truncate long inputs to avoid high costs/errors
    if len(text) > MAX_INPUT_LENGTH:
        logger.warning(f"Input text length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH}). Truncating.")
        text = text[:MAX_INPUT_LENGTH] + "... (Content truncated)"
    full_prompt = f"{prompt}\n\n{text}"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        # Recommended headers for OpenRouter identification
        "HTTP-Referer": "https://huggingface.co/spaces/", # Identify source as HF Space
        "X-Title": "Telegram Summary Bot (HF Space)", # Identify app
    }
    payload = json.dumps({
        "model": model_name,
        "messages": [
            {"role": "user", "content": full_prompt}
        ],
        # Optional: Add max_tokens if needed, check model defaults
        # "max_tokens": 1024,
    })

    try:
        logger.debug(f"Sending request to OpenRouter ({model_name})...")
        # Run blocking request in thread
        response = await asyncio.to_thread(requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=60)
        logger.debug(f"Received status code {response.status_code} from OpenRouter.")

        if response.status_code == 200:
            try:
                data = response.json()
                if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0:
                    message = data["choices"][0].get("message")
                    if message and isinstance(message, dict):
                        summary = message.get("content")
                        if summary:
                            logger.info(f"Successfully generated summary via OpenRouter. Output length: {len(summary)}")
                            return summary.strip()
                        else:
                            logger.warning(f"OpenRouter response successful, but content was empty. Response: {data}")
                            return "Sorry, the AI model returned an empty summary."
                    else:
                        logger.error(f"Unexpected message structure in OpenRouter response: {message}. Full response: {data}")
                        return "Sorry, could not parse the AI model's response (unexpected format)."
                else:
                    # Handle cases like moderation flags, empty choices list
                    if data.get("error"): logger.error(f"OpenRouter API Error: {data['error']}")
                    else: logger.error(f"Unexpected choices structure in OpenRouter response: {data.get('choices')}. Full response: {data}")
                    return "Sorry, could not parse the AI model's response (choices missing/invalid or API error)."

            except json.JSONDecodeError:
                logger.error(f"Failed to decode JSON response from OpenRouter. Status: {response.status_code}. Response text: {response.text[:500]}...")
                return "Sorry, failed to understand the response from the AI model."
            except Exception as e:
                logger.error(f"Error processing successful OpenRouter response: {e}", exc_info=True)
                return "Sorry, an error occurred while processing the AI model's response."

        elif response.status_code == 401:
            logger.error("OpenRouter API key is invalid (401 Unauthorized). Check HF Space Secrets.")
            return "Error: The AI model configuration key (OpenRouter) is invalid."
        elif response.status_code == 402:
            logger.error("OpenRouter Payment Required (402). Check credits/limits on OpenRouter.")
            return "Sorry, there might be an issue with the AI model service limits or payment. Please try again later or check OpenRouter account."
        elif response.status_code == 429:
            logger.warning("OpenRouter Rate Limit Exceeded (429).")
            return "Sorry, the AI model is currently busy due to high demand. Please try again in a moment."
        elif response.status_code == 500:
            logger.error(f"OpenRouter Internal Server Error (500). Response: {response.text[:500]}...")
            return "Sorry, the AI model service encountered an internal error. Please try again later."
        else:
            # Handle other potential errors (e.g., 400 Bad Request, 404 Not Found for model)
            logger.error(f"Unexpected status code {response.status_code} from OpenRouter. Response: {response.text[:500]}...")
            try: # Try to parse error message from response body
                error_data = response.json()
                error_msg = error_data.get("error", {}).get("message", response.text[:100])
                return f"Sorry, the AI model service returned an error ({response.status_code}): {error_msg}"
            except Exception:
                # Fallback if parsing fails. Fixed: was a bare `except:`, which
                # also swallowed SystemExit/KeyboardInterrupt.
                return f"Sorry, the AI model service returned an unexpected status ({response.status_code})."

    except requests.exceptions.Timeout:
        logger.error("Timeout error connecting to OpenRouter API.")
        return "Sorry, the request to the AI model timed out. Please try again."
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error connecting to OpenRouter API: {e}")
        return "Sorry, there was an error connecting to the AI model service."
    except Exception as e:
        logger.error(f"Unexpected error in generate_summary (OpenRouter): {e}", exc_info=True)
        return "Sorry, an unexpected error occurred while generating the summary."
552
+
553
+
554
+
555
# --- Telegram Bot Handlers (Command, Message, CallbackQuery) ---

async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Greet the user in response to /start and explain what the bot does."""
    sender = update.effective_user
    logger.info(f"User {sender.id} ({sender.username or 'NoUsername'}) used /start.")
    # Prefer an HTML mention when a username exists; fall back to first name.
    if sender.username:
        mention = sender.mention_html()
    else:
        mention = sender.first_name
    greeting = (
        f"👋 Hello {mention}! I can summarize YouTube links or website URLs.\n\n"
        "Just send me a link anytime!"
    )
    # Optional: pass disable_web_page_preview=True to suppress link previews.
    await update.message.reply_html(greeting)
567
+
568
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Reply to /help with Markdown-formatted usage instructions."""
    logger.info(f"User {update.effective_user.id} used /help.")
    usage_text = (
        "🔍 **How to use this bot:**\n\n"
        "1. Send me any YouTube video link or website URL.\n"
        "2. I'll ask how you want it summarized (paragraph or points).\n"
        "3. Click the button for your choice.\n"
        "4. Wait for the summary!\n\n"
        "I use multiple methods if the first fails (especially for YT transcripts & website content).\n\n"
        "**Commands:**\n"
        "/start - Display welcome message\n"
        "/help - Show this help message"
    )
    await update.message.reply_text(usage_text, parse_mode=ParseMode.MARKDOWN)
583
+
584
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handles text messages, checks for URLs, and asks for summary type.

    Stores the URL in context.user_data['url_to_summarize'] so that
    handle_summary_type_callback can retrieve it once the user picks a
    summary style from the inline keyboard.
    """
    from urllib.parse import urlparse  # stdlib; local import keeps the module header unchanged

    if not update.message or not update.message.text: return # Ignore empty messages
    url = update.message.text.strip()
    user = update.effective_user
    logger.info(f"User {user.id} ({user.username or 'NoUsername'}) sent potential URL: {url}")

    # Basic URL validation: require an http(s) scheme and a dotted hostname.
    # (Replaces the old `'.' not in url[8:]` check, which was off by one for
    # 'http://' URLs — it skipped the first character of the hostname — and
    # could match dots in the path rather than the host.)
    parsed = urlparse(url)
    if parsed.scheme not in ('http', 'https') or '.' not in parsed.netloc:
        logger.debug(f"Ignoring non-URL message from user {user.id}: {url}")
        # Optional: Reply if you want to guide the user
        # await update.message.reply_text("Please send a valid URL starting with http:// or https://")
        return

    # Store URL in user_data (simple state management)
    context.user_data['url_to_summarize'] = url
    logger.debug(f"Stored URL '{url}' for user {user.id} in user_data")

    # Ask for summary type with Inline Keyboard
    keyboard = [
        [
            InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"),
            InlineKeyboardButton("Points Summary", callback_data="points")
        ]
    ]
    reply_markup = InlineKeyboardMarkup(keyboard)
    await update.message.reply_text(
        f"Okay, I see this link:\n{url}\n\nHow would you like it summarized?",
        reply_markup=reply_markup,
        disable_web_page_preview=True
    )
615
+
616
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handles button presses for summary type selection.

    End-to-end flow for one summary request:
      1. Acknowledge the callback and recover the URL stashed by
         handle_potential_url() in context.user_data (aborting politely if
         that context was lost).
      2. Re-read API keys from the environment on every request so secret
         updates take effect without restarting the process.
      3. Fetch content: YouTube transcript (Supadata/Apify fallbacks) or
         website text (Requests/BS4 with URLToText API fallback).
      4. Summarise via OpenRouter and send the result, or report the most
         specific failure message collected along the way.
      5. Always delete the interim status/button message in ``finally``.
    """
    query = update.callback_query
    if not query: return
    await query.answer() # Acknowledge button press immediately

    summary_type = query.data
    user = update.effective_user or query.from_user # Get user info
    url = context.user_data.get('url_to_summarize', None) # Retrieve stored URL

    logger.info(f"User {user.id} chose '{summary_type}' summary. Checking for URL '{url}' in context.")

    # Check if URL is still in context (it might expire or be lost)
    if not url:
        logger.warning(f"User {user.id} pressed button, but NO URL found in user_data context.")
        try:
            # Edit the message where the button was, informing the user
            await query.edit_message_text(text="Sorry, I couldn't find the URL associated with this request. Please send the link again.")
        except Exception as edit_err:
            # If editing fails (e.g., message too old), log it but don't crash
            logger.warning(f"Failed to edit message for missing URL context: {edit_err}")
            # Maybe send a new message as a fallback? Depends on desired behavior.
            # await context.bot.send_message(chat_id=user.id, text="Sorry, context lost. Please send link again.")
        return # Stop processing if URL is missing

    # Clear the URL from context now that we're processing it
    context.user_data.pop('url_to_summarize', None)
    logger.debug(f"Retrieved and cleared URL {url} from user_data for user {user.id}")

    # --- Get API Keys (Read fresh from environment - cheap operation) ---
    # This ensures if secrets are updated in HF UI, the next request uses them
    logger.debug("Reading API keys from environment variables within handler...")
    current_openrouter_key = os.environ.get('OPENROUTER_API_KEY')
    current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
    current_supadata_key = os.environ.get('SUPADATA_API_KEY')
    current_apify_token = os.environ.get('APIFY_API_TOKEN')
    logger.debug(f"Keys read: OpenRouter={'Yes' if current_openrouter_key else 'No'}, URLToText={'Yes' if current_urltotext_key else 'No'}, Supadata={'Yes' if current_supadata_key else 'No'}, Apify={'Yes' if current_apify_token else 'No'}")


    # Check *essential* key for summarization
    if not current_openrouter_key:
        logger.error("OpenRouter API key missing in handler. Cannot generate summary.")
        # Inform user and clean up the button message
        await context.bot.send_message(chat_id=user.id, text="Error: AI model configuration key (OpenRouter) is missing. Cannot generate summary.")
        try: await query.delete_message() # Delete the message with buttons
        except Exception: pass
        return

    # --- Start Processing ---
    processing_message = f"Got it! Generating '{summary_type}' summary for:\n{url}\n\nThis might take a moment..."
    message_to_delete_later = None # In case editing fails
    try:
        # Edit the message to show processing status
        await query.edit_message_text(processing_message)
    except Exception as e:
        # If editing fails (e.g., message too old), send a new status message
        logger.warning(f"Could not edit original message: {e}, sending new status message.")
        try:
            message_to_delete_later = await context.bot.send_message(chat_id=user.id, text=processing_message)
        except Exception as send_err:
            # If even sending fails, log and give up on this request
            logger.error(f"Failed to send status message after edit failure: {send_err}")
            return

    content = None
    user_feedback_message = None # Stores error messages for the user
    success = False
    is_youtube = is_youtube_url(url)

    try:
        # Show "typing..." status in Telegram chat
        await context.bot.send_chat_action(chat_id=user.id, action='typing')

        # --- Content Fetching Logic ---
        if is_youtube:
            video_id = extract_youtube_id(url)
            if video_id:
                # Fetch YT transcript using the function with fallbacks
                content = await get_youtube_transcript(
                    video_id,
                    url, # Pass full URL for Apify
                    current_supadata_key,
                    current_apify_token
                )
                # Set feedback message only if content fetching failed
                user_feedback_message = None if content else "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
            else:
                user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
        else: # Website Logic (Requests/BS4 -> URLToText API)
            logger.info(f"Attempting website scrape (Requests/BS4) for {url}")
            content = await get_website_content_via_requests(url)

            if content:
                logger.info("Primary website scraping (Requests/BS4) successful.")
                user_feedback_message = None
            else:
                logger.warning(f"Primary web scraping failed for {url}. Attempting fallback API (URLToText).")
                if current_urltotext_key:
                    await context.bot.send_chat_action(chat_id=user.id, action='typing') # Show activity for fallback
                    content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
                    if content:
                        user_feedback_message = None
                        logger.info("Fallback URLToText API scraping successful.")
                    else:
                        user_feedback_message = "Sorry, I couldn't fetch content from that website using either method (blocked/inaccessible/empty?)."
                        logger.error(f"Both primary (Requests/BS4) and fallback API failed for website {url}.")
                else:
                    # Primary failed, and fallback key is missing
                    user_feedback_message = "Sorry, I couldn't fetch content from that website (blocked/inaccessible/empty?). The fallback method is not configured."
                    logger.warning(f"Primary web scraping failed for {url}, and fallback API key (URLToText) is missing.")
        # --- End Content Fetching ---

        # --- Generate Summary if Content was Fetched ---
        if content:
            logger.info("Content fetched successfully, proceeding to generate summary.")
            await context.bot.send_chat_action(chat_id=user.id, action='typing')
            # Pass the OpenRouter key to the summary function
            summary = await generate_summary(content, summary_type, current_openrouter_key)

            # Check if summary generation returned an error message
            # (generate_summary signals failure via "Error:"/"Sorry," prefixes)
            if summary.startswith("Error:") or summary.startswith("Sorry,"):
                user_feedback_message = summary # Use the error from the summary function
                success = False
                logger.warning(f"Summary generation failed or returned error: {summary}")
            else:
                # Send the successful summary
                await context.bot.send_message(
                    chat_id=user.id,
                    text=summary,
                    parse_mode=ParseMode.MARKDOWN,
                    disable_web_page_preview=True
                )
                success = True
                user_feedback_message = None # Clear any previous failure message from fetching stage
        elif not user_feedback_message:
            # If content is None but no specific error message was set during fetching
            user_feedback_message = "Sorry, couldn't retrieve any content to summarize from the provided link."
            logger.warning(f"Content fetching resulted in None for {url}, but no specific user feedback message was set.")

        # --- Send Feedback if any step failed ---
        if user_feedback_message and not success:
            await context.bot.send_message(chat_id=user.id, text=user_feedback_message)

    except Exception as e:
        # Catch unexpected errors during the whole process
        logger.error(f"Unexpected error during processing callback for {url}: {e}", exc_info=True)
        try:
            # Send a generic error message to the user
            await context.bot.send_message(chat_id=user.id, text="Oops! Something went really wrong while processing your request. Please try again later.")
        except Exception as final_err:
            # If even sending the error message fails... log it.
            logger.error(f"Failed to send final error message to user {user.id}: {final_err}")
    finally:
        # --- Cleanup ---
        # Delete the "Processing..." status message or the original message with buttons
        try:
            if message_to_delete_later: # If we sent a separate status message
                await context.bot.delete_message(chat_id=user.id, message_id=message_to_delete_later.message_id)
            elif query: # Otherwise, delete the original message with the buttons
                # We might have already edited it, but deleting ensures cleanup
                await query.delete_message()
        except Exception as del_e:
            # Log if deletion fails, but don't let it stop anything
            logger.warning(f"Could not delete status/button message: {del_e}")
780
+
781
+
782
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Log any exception raised while handling an update."""
    err = context.error
    logger.error("Exception while handling an update:", exc_info=err)
    # Extension point: to alert a developer chat about failures, send `err`
    # (and, when available, update.effective_chat.id) via
    # context.bot.send_message(chat_id=<developer_chat_id>, text=...) here,
    # wrapping the send in its own try/except so a notification failure
    # never masks the original error.
796
+
797
+
798
# --- Initialize Telegram Bot Application ---
# Do this setup once when the script starts
logger.info("Initializing Telegram Application...")
if not TELEGRAM_TOKEN: # Check again, as initialization needs it
    logger.critical("Cannot initialize PTB Application: TELEGRAM_TOKEN not found.")
    # Sentinel: webhook() and index() check `ptb_app` for None and fail
    # gracefully instead of crashing at import time.
    ptb_app = None
else:
    # Build PTB Application instance
    ptb_app_builder = Application.builder().token(TELEGRAM_TOKEN)
    # Removed concurrency settings for simplicity, PTB defaults should be okay with Flask + asyncio.create_task
    # ptb_app_builder.concurrent_updates(True)
    ptb_app = ptb_app_builder.build()
    # NOTE(review): build() alone may not be enough — PTB's Application
    # typically needs initialize()/start() before process_update(); confirm
    # against the python-telegram-bot version pinned in requirements.

    # Register handlers with the PTB application instance
    ptb_app.add_handler(CommandHandler("start", start))
    ptb_app.add_handler(CommandHandler("help", help_command))
    # Any non-command text message is treated as a potential URL.
    ptb_app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
    ptb_app.add_handler(CallbackQueryHandler(handle_summary_type_callback))
    # Add the error handler
    ptb_app.add_error_handler(error_handler)
    logger.info("Telegram handlers registered.")


# --- Flask App Setup ---
# Gunicorn (per the Dockerfile CMD) imports this `app` object directly.
app = Flask(__name__) # Create Flask web server instance
logger.info("Flask app created.")
824
+
825
+
826
# --- Webhook Endpoint ---
@app.route('/webhook', methods=['POST'])
async def webhook() -> Response:
    """Webhook endpoint to receive updates from Telegram.

    Deserializes the incoming update, schedules it for background
    processing, and replies 200 immediately so Telegram does not retry.
    NOTE(review): an async Flask view requires the `flask[async]` extra —
    confirm it is installed. Also, the asyncio.create_task() result is not
    stored; the asyncio docs recommend keeping a reference so the task is
    not garbage-collected mid-run — confirm and consider holding one.
    """
    logger.info("Webhook request received...")
    if not ptb_app: # Check if PTB initialization failed
        logger.error("Telegram application not initialized. Cannot process update.")
        return Response('Bot not configured properly', status=500)

    if request.is_json:
        try:
            update_data = request.get_json()
            # Use PTB's built-in deserialization
            update = Update.de_json(update_data, ptb_app.bot)
            logger.debug(f"Processing update ID: {update.update_id}")

            # Process the update using PTB's internal dispatcher in a background task
            # (fire-and-forget: errors surface via PTB's error_handler, not here)
            asyncio.create_task(ptb_app.process_update(update))

            # Respond quickly to Telegram that we received the update
            return Response('ok', status=200)
        except json.JSONDecodeError:
            logger.error("Failed to decode JSON from Telegram webhook.")
            return Response('Bad Request - Invalid JSON', status=400)
        except Exception as e:
            logger.error(f"Error processing update in webhook handler: {e}", exc_info=True)
            return Response('Internal Server Error', status=500)
    else:
        logger.warning("Received non-JSON request to webhook endpoint.")
        return Response('Bad Request - Expected JSON', status=400)
856
+
857
@app.route('/')
def index():
    """A simple health check endpoint for the web server."""
    logger.debug("Health check endpoint '/' accessed.")
    # Surface whether the Telegram application came up, so a glance at the
    # root page tells you if the bot is actually able to process updates.
    if ptb_app:
        status = "PTB App Initialized"
    else:
        status = "PTB App FAILED Initialization"
    return f"Hello! Telegram Bot Webhook Listener ({status}) is running."
864
+
865
+
866
# --- Main Execution Block ---
# This part runs the Flask web server when the Docker container starts via Gunicorn
if __name__ == '__main__':
    # This block might not run when deployed via Gunicorn as specified in Dockerfile CMD
    # Gunicorn imports the 'app' object directly from 'main.py'.
    # However, it's good practice to have for potential local testing.
    if not ptb_app:
        logger.critical("Aborting Flask server start (local test?) because Telegram App failed initialization.")
    else:
        logger.info("Starting Flask web server directly (for local testing?)...")
        # Use a development server port like 5000 for local testing
        # (PORT env var still wins, matching typical container conventions)
        port = int(os.environ.get('PORT', 5000)) # Changed default for local test
        # Run with debug=True ONLY for local testing, NEVER in production/deployment
        app.run(host='0.0.0.0', port=port, debug=True)