fmab777 commited on
Commit
c83a4a7
·
verified ·
1 Parent(s): 203a74a

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +911 -305
main.py CHANGED
@@ -53,6 +53,7 @@ except ImportError:
53
  # --- Google Gemini ---
54
  try:
55
  import google.generativeai as genai
 
56
  from google.generativeai.types import HarmCategory, HarmBlockThreshold, GenerateContentResponse
57
  _gemini_available = True
58
  except ImportError:
@@ -73,6 +74,7 @@ logging.getLogger('gunicorn.error').setLevel(logging.INFO)
73
  logging.getLogger('uvicorn').setLevel(logging.INFO)
74
  logging.getLogger('starlette').setLevel(logging.INFO)
75
  if _gemini_available: logging.getLogger("google.ai.generativelanguage").setLevel(logging.WARNING)
 
76
  logging.getLogger("crawl4ai").setLevel(logging.INFO) # Adjust level as needed
77
  logging.getLogger("playwright").setLevel(logging.WARNING)
78
 
@@ -88,385 +90,906 @@ ptb_app: Optional[Application] = None
88
  # --- Environment Variable Loading & Configuration ---
89
  logger.info("Attempting to load secrets and configuration...")
90
  def get_secret(secret_name):
 
91
  value = os.environ.get(secret_name)
92
- if value: status = "Found"; log_length = min(len(value), 8); value_start = value[:log_length]; logger.info(f"Secret '{secret_name}': {status} (Value starts with: {value_start}...)")
93
- else: status = "Not Found"; logger.warning(f"Secret '{secret_name}': {status}")
 
 
 
 
 
 
94
  return value
95
 
96
  TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
97
- OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY')
98
- URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')
99
- SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
100
- APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
101
  WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
102
- GEMINI_API_KEY = get_secret('GEMINI_API_KEY')
103
 
104
- # Models
105
- OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free")
106
  APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
107
- GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001")
108
 
109
  # --- Configuration Checks ---
110
  if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
111
- if not GEMINI_API_KEY: logger.error("❌ ERROR: GEMINI_API_KEY not found.")
112
- if not OPENROUTER_API_KEY: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found.")
113
 
114
  _gemini_primary_enabled = _gemini_available and bool(GEMINI_API_KEY)
 
 
 
115
  _openrouter_fallback_enabled = bool(OPENROUTER_API_KEY)
 
 
116
  _crawl4ai_primary_web_enabled = _crawl4ai_available
117
- _urltotext_fallback_enabled = bool(URLTOTEXT_API_KEY)
118
- _apify_token_exists = bool(APIFY_API_TOKEN)
119
 
120
- if not _gemini_available: logger.warning("⚠️ WARNING: google-generativeai missing.")
121
- elif not GEMINI_API_KEY: logger.warning("⚠️ WARNING: GEMINI_API_KEY missing.")
122
- if not _crawl4ai_available: logger.warning("⚠️ WARNING: crawl4ai missing.")
123
 
124
- if _urltotext_fallback_enabled: logger.info("ℹ️ INFO: urltotext.com API ENABLED.")
125
- else: logger.info("ℹ️ INFO: urltotext.com API disabled.")
126
- if not SUPADATA_API_KEY: logger.info("ℹ️ INFO: Supadata API disabled.")
127
- if not APIFY_API_TOKEN: logger.info("ℹ️ INFO: Apify API disabled.")
128
- if not WEBHOOK_SECRET: logger.info("ℹ️ INFO: Webhook security disabled.")
129
 
130
  logger.info("Secret loading and configuration check finished.")
131
  logger.info(f"Primary Web Scraper: {'Crawl4AI' if _crawl4ai_primary_web_enabled else 'DISABLED'}")
132
- logger.info(f"Fallback Web Scraper 1: BeautifulSoup")
133
  logger.info(f"Fallback Web Scraper 2: urltotext.com API {'ENABLED' if _urltotext_fallback_enabled else 'DISABLED'}")
134
  logger.info(f"Primary Summarizer: Gemini ({GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'})")
135
  logger.info(f"Fallback Summarizer: OpenRouter ({OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'})")
136
- logger.info(f"Primary YT Transcript: youtube-transcript-api")
137
  logger.info(f"Fallback YT Transcript 1: Supadata API {'ENABLED' if SUPADATA_API_KEY else 'DISABLED'}")
138
  logger.info(f"Fallback YT Transcript 2: Apify REST API {'ENABLED' if APIFY_API_TOKEN else 'DISABLED'}")
 
139
 
140
  if _gemini_primary_enabled:
141
- try: genai.configure(api_key=GEMINI_API_KEY); logger.info("Google GenAI client configured.")
142
- except Exception as e: logger.error(f"Failed config Google GenAI: {e}"); _gemini_primary_enabled = False
 
 
 
 
143
 
144
  # --- Constants ---
145
  MAX_SUMMARY_CHUNK_SIZE = 4000
146
  MAX_INPUT_TOKEN_APPROX = 500000
147
 
148
  # --- Retry Decorator ---
149
- @retry( stop=stop_after_attempt(4), wait=wait_exponential(multiplier=1, min=2, max=15), retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)), before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True )
 
 
 
 
 
 
150
  async def retry_bot_operation(func, *args, **kwargs):
151
- try: return await func(*args, **kwargs)
 
152
  except BadRequest as e:
153
- ignore = ["message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user"]
154
- if any(err in str(e).lower() for err in ignore): logger.warning(f"Ignoring non-critical BadRequest: {e}"); return None
155
- logger.error(f"Potentially critical BadRequest: {e}"); raise
 
 
 
 
 
 
156
  except TelegramError as e:
157
- if isinstance(e, (TimedOut, NetworkError, RetryAfter)): logger.warning(f"TG transient error (retry): {e}")
158
- else: logger.error(f"Unhandled TG Error: {e}")
 
 
 
 
 
159
  raise
160
- except Exception as e: logger.error(f"Unexpected error in bot op: {e}", exc_info=True); raise
161
 
162
  # --- Helper Functions ---
163
  def is_youtube_url(url):
164
  youtube_regex = re.compile(r'(?:https?://)?(?:www.)?(?:m.)?(?:youtube(?:-nocookie)?.com|youtu.be)/(?:watch?v=|embed/|v/|shorts/|live/|attribution_link?a=.&u=/watch?v=)?([\w-]{11})(?:\S+)?', re.IGNORECASE)
165
- match = youtube_regex.search(url); logger.debug(f"is_youtube_url '{url}': {bool(match)}"); return bool(match)
 
 
 
166
  def extract_youtube_id(url):
167
  youtube_regex = re.compile(r'(?:https?://)?(?:www.)?(?:m.)?(?:youtube(?:-nocookie)?.com|youtu.be)/(?:watch?v=|embed/|v/|shorts/|live/|attribution_link?a=.&u=/watch?v=)?([\w-]{11})(?:\S+)?', re.IGNORECASE)
168
  match = youtube_regex.search(url)
169
- if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id
170
- else: logger.warning(f"Could not extract YT ID from {url}"); return None
 
 
 
 
 
171
 
172
  # --- Content Fetching Functions ---
173
  async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
174
- if not video_id or not api_key: logger.error("[Supadata] Missing video_id or API key"); return None
175
- logger.info(f"[Supadata] Fetching: {video_id}")
176
- api = "https://api.supadata.ai/v1/youtube/transcript"; params={"v": video_id, "f": "text"}; headers={"X-API-Key": api_key} # Adjusted params slightly based on potential common patterns, check Supadata docs if needed
 
 
 
 
177
  try:
178
  async with httpx.AsyncClient(timeout=30.0) as client:
179
- response = await client.get(api, headers=headers, params=params)
180
- logger.debug(f"[Supadata] Status: {response.status_code}")
181
  if response.status_code == 200:
182
  try:
183
- data = response.json() if response.text else None; content = None
 
 
 
 
184
  if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
 
185
  if not content and response.text: content = response.text
186
- if content and isinstance(content, str): logger.info(f"[Supadata] Success: len {len(content)}"); return content.strip()
187
- else: logger.warning(f"[Supadata] Success but empty/invalid content. Resp:{response.text[:200]}"); return None
188
- except Exception as e: logger.error(f"[Supadata] Error processing success: {e}", exc_info=True); return None
189
- elif response.status_code in [401, 403]: logger.error(f"[Supadata] Auth error ({response.status_code}). Check key."); return None
190
- elif response.status_code == 404: logger.warning(f"[Supadata] Not found (404)."); return None
191
- else: logger.error(f"[Supadata] Unexpected status {response.status_code}. Resp:{response.text[:200]}"); return None
192
- except Exception as e: logger.error(f"[Supadata] Unexpected error: {e}", exc_info=True); return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
  async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
195
  global APIFY_ACTOR_ID
196
- if not video_url or not api_token: logger.error("[Apify] Missing URL or token"); return None
197
- logger.info(f"[Apify] Fetching: {video_url} (Actor: {APIFY_ACTOR_ID})")
198
- api = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"; params = {"token": api_token}
199
- payload = { "urls": [video_url], "outputFormat": "singleStringText", ... }; headers = {"Content-Type": "application/json"} # Keep payload
 
 
 
 
 
 
 
 
 
 
 
 
200
  try:
201
  async with httpx.AsyncClient(timeout=120.0) as client:
202
- logger.debug(f"[Apify] POST Request...")
203
- response = await client.post(api, headers=headers, params=params, json=payload)
204
- logger.debug(f"[Apify] Status: {response.status_code}")
 
205
  if response.status_code == 200:
206
  try:
207
- results = response.json(); content = None
208
  if isinstance(results, list) and len(results) > 0:
209
- item = results[0] # Parse item logic remains
 
 
210
  if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
211
  elif "text" in item and isinstance(item["text"], str): content = item["text"]
212
- # ... other parsing options
213
- if content and isinstance(content, str): logger.info(f"[Apify] Success: len {len(content)}"); return content.strip()
214
- else: logger.warning(f"[Apify] Parsed but empty/invalid content."); return None
215
- else: logger.warning(f"[Apify] Success but empty dataset."); return None
216
- except Exception as e: logger.error(f"[Apify] Error processing success: {e}", exc_info=True); return None
217
- elif response.status_code == 400: logger.error(f"[Apify] Bad Request (400). Resp:{response.text[:200]}"); return None
218
- elif response.status_code == 401: logger.error("[Apify] Auth error (401). Check token."); return None
219
- # *** MODIFIED: Expanded elif block for 404 ***
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  elif response.status_code == 404:
221
- error_info = "Unknown 404 Error" # Default message
222
  try:
223
- # Try to get more specific error from JSON response
224
  error_data = response.json()
225
- error_info = error_data.get("error", {}).get("message", "No specific message in error object")
226
  except Exception:
227
- # If JSON parsing fails or structure is unexpected, use raw text
228
- error_info = response.text[:200] # Use beginning of response text
229
- logger.error(f"[Apify] Not Found (404). Error: '{error_info}'")
230
- return None # Return None after logging
231
- else: logger.error(f"[Apify] Unexpected status {response.status_code}. Resp:{response.text[:200]}"); return None
232
- except Exception as e: logger.error(f"[Apify] Unexpected error: {e}", exc_info=True); return None
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
  async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
235
  global SUPADATA_API_KEY, APIFY_API_TOKEN
236
- if not video_id: logger.error("YT transcript: No video_id"); return None
237
- logger.info(f"Fetching YT transcript for: {video_id}")
238
- transcript = None
239
- logger.info("[Primary YT] Trying youtube-transcript-api...")
240
  try:
241
- transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.list_transcripts(video_id).find_generated_transcript(['en', 'en-GB', 'en-US']).fetch )
242
- if transcript_list: transcript = " ".join([i['text'] for i in transcript_list if 'text' in i]).strip()
243
- if transcript: logger.info(f"[Primary YT] Success: len {len(transcript)}"); return transcript
244
- else: logger.warning(f"[Primary YT] Empty transcript list/text"); transcript = None
245
- except (TranscriptsDisabled, NoTranscriptFound) as e: logger.warning(f"[Primary YT] Known issue: {type(e).__name__}"); transcript = None
246
- except Exception as e: logger.warning(f"[Primary YT] Error: {e}"); transcript = None
247
-
248
- if transcript is None and SUPADATA_API_KEY:
249
- logger.info("[Fallback YT 1] Trying Supadata..."); transcript = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
250
- if transcript: logger.info("[Fallback YT 1] Success via Supadata."); return transcript
251
- else: logger.warning("[Fallback YT 1] Supadata failed.")
252
- elif transcript is None: logger.warning("[Fallback YT 1] Supadata key missing.")
253
-
254
- if transcript is None and APIFY_API_TOKEN:
255
- logger.info("[Fallback YT 2] Trying Apify..."); transcript = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
256
- if transcript: logger.info("[Fallback YT 2] Success via Apify."); return transcript
257
- else: logger.warning("[Fallback YT 2] Apify failed.")
258
- elif transcript is None: logger.warning("[Fallback YT 2] Apify token missing.")
259
-
260
- if transcript is None: logger.error(f"All YT methods failed: {video_id}"); return None
261
- return transcript # Should already be stripped if successful
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
 
263
  async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
 
264
  global _crawl4ai_primary_web_enabled
265
- if not _crawl4ai_primary_web_enabled or not url: logger.error(f"[Crawl4AI] Disabled or no URL."); return None
266
- logger.info(f"[Crawl4AI] Crawling: {url}")
 
267
  try:
 
268
  async with AsyncWebCrawler(ignore_robots=True) as crawler:
269
- logger.info(f"[Crawl4AI] Initialized (ignore_robots=True).")
270
  result = await crawler.arun(url=url, crawler_strategy="playwright", timeout=90)
 
271
  content = None
272
- if result and result.markdown: content = result.markdown.strip()
273
- elif result and result.text: content = result.text.strip()
274
- if content: logger.info(f"[Crawl4AI] Success: len {len(content)}"); return content
275
- else: logger.warning(f"[Crawl4AI] Crawl success but empty content."); return None
276
- except Exception as e: logger.error(f"[Crawl4AI] Error: {e}", exc_info=True); return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
 
278
  async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
279
- headers = { 'User-Agent': 'Mozilla/5.0 ...', ... } # Keep headers
 
 
 
 
 
 
 
 
280
  try:
281
- async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as c:
282
- r = await c.get(url); logger.debug(f"[BS4 Fetch] Status {r.status_code}"); r.raise_for_status()
283
- ct = r.headers.get('content-type','').lower()
284
- if 'html' not in ct: logger.warning(f"[BS4 Fetch] Non-HTML: {ct}"); return None
285
- try: return r.text
286
- except Exception as e: logger.error(f"[BS4 Fetch] Read error: {e}"); return None
287
- except Exception as e: logger.error(f"[BS4 Fetch] Error: {e}", exc_info=True); return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
  async def get_website_content_bs4(url: str) -> Optional[str]:
290
- if not url: logger.error("[BS4] No URL"); return None
291
- logger.info(f"[BS4] Fetching & parsing: {url}")
 
292
  html_content = await fetch_url_content_for_scrape(url)
293
- if not html_content: logger.warning(f"[BS4] Fetch failed"); return None
 
 
 
294
  try:
 
295
  def parse_html(content):
296
- soup = BeautifulSoup(content, DEFAULT_PARSER) # Parse logic remains
297
- for el in soup(["script", "style", ...]): el.extract()
298
- selectors = ['main', 'article', ...]; target = None
299
- for sel in selectors: try: target=soup.select_one(sel); except Exception: continue; if target: break
300
- if not target: target = soup.body
301
- if not target: logger.warning("[BS4] No body/main"); return None
302
- text = re.sub(r'\s{2,}', ' ', " ".join(l.strip() for l in target.get_text('\n', strip=True).splitlines() if l.strip())).strip()
303
- if not text: logger.warning("[BS4] Empty text"); return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  return text
305
- text = await asyncio.to_thread(parse_html, html_content)
306
- if text: logger.info(f"[BS4] Success: len {len(text)}"); return text
307
- else: logger.warning("[BS4] Parsing empty"); return None
308
- except Exception as e: logger.error(f"[BS4] Parse error: {e}", exc_info=True); return None
 
 
 
 
 
 
 
 
 
309
 
310
  async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
311
- if not url or not api_key: logger.error("[API] Missing URL or key"); return None
312
- logger.info(f"[API] Fetching via urltotext.com: {url}")
313
- api = "https://urltotext.com/api/v1/urltotext/"; payload={...}; headers={...} # Keep details
 
 
 
 
314
  try:
315
- async with httpx.AsyncClient(timeout=45.0) as c:
316
- logger.debug("[API] Sending request...")
317
- r = await c.post(api, headers=headers, json=payload); logger.debug(f"[API] Status {r.status_code}")
318
- if r.status_code == 200:
319
- try: # Parse success response
320
- data=r.json(); content=data.get("data",{}).get("content"); credits=...; warning=...
321
- if warning: logger.warning(f"[API] Warning: {warning}")
322
- if content: logger.info(f"[API] Success: len {len(content)}, Credits: {credits}"); return content.strip()
323
- else: logger.warning(f"[API] Success but empty content. Resp:{data}"); return None
324
- except Exception as e: logger.error(f"[API] Error processing success: {e}", exc_info=True); return None
325
- elif r.status_code == 402: logger.error(f"[API] Error 402 (Credits?). Resp:{r.text[:200]}"); return None
326
- elif r.status_code == 400 and "url" in r.text.lower(): logger.error(f"[API] Error 400 (Bad URL?). Resp:{r.text[:200]}"); return None
327
- # ... other error codes
328
- else: logger.error(f"[API] Unexpected status {r.status_code}. Resp:{r.text[:200]}"); return None
329
- except Exception as e: logger.error(f"[API] Unexpected error: {e}", exc_info=True); return None
 
 
 
 
 
330
 
331
  # --- Summarization Functions ---
332
  async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
 
333
  global GEMINI_MODEL, _gemini_primary_enabled
334
- if not _gemini_primary_enabled: logger.error("[Gemini] Disabled."); return None, "Error: Primary AI unavailable."
335
- if len(text) > MAX_INPUT_TOKEN_APPROX: logger.warning(f"[Gemini] Truncating input..."); text = text[:MAX_INPUT_TOKEN_APPROX]
336
- logger.info(f"[Gemini] Generating {summary_type} summary ({GEMINI_MODEL}). Input len: {len(text)}")
337
- if summary_type == "paragraph": prompt = f"Summarise paragraph...\n{text}\nSummary:"
338
- elif summary_type == "points": prompt = f"Summarise points...\n{text}\nSummary:"
339
- else: logger.error(f"[Gemini] Invalid type: {summary_type}"); return None, f"Error: Invalid summary type."
340
- safety_settings = { category: HarmBlockThreshold.BLOCK_NONE for category in HarmCategory }; logger.info("[Gemini] Safety settings disabled.")
341
- gen_config = genai.types.GenerationConfig( max_output_tokens=2048, temperature=0.7 )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  try:
343
- model = genai.GenerativeModel(GEMINI_MODEL); logger.debug("[Gemini] Sending request...")
344
- response = await model.generate_content_async( prompt, generation_config=gen_config, safety_settings=safety_settings )
345
- if not response.candidates: # Check prompt feedback
346
- block_reason=str(response.prompt_feedback.block_reason or "N/A") if hasattr(response,'prompt_feedback') else "Unknown"
347
- logger.error(f"[Gemini] Response blocked (Prompt). Reason: {block_reason}"); return None, f"Error: Gemini blocked prompt ({block_reason})."
348
- candidate = response.candidates[0] # Check candidate finish reason
349
- finish_val = candidate.finish_reason; finish_str = str(finish_val).upper() if finish_val else "UNSPECIFIED"; logger.debug(f"[Gemini] Finish reason: {finish_val} -> {finish_str}")
350
- ratings = ", ".join([f"{r.category.name}:{r.probability.name}" for r in candidate.safety_ratings]) if hasattr(candidate,'safety_ratings') and candidate.safety_ratings else "N/A"
351
- success = any(finish_str.endswith(reason) for reason in ["STOP", "MAX_TOKENS"])
352
- if not success: logger.error(f"[Gemini] Failed/Unexpected finish. Reason: {finish_str}. Safety: {ratings}"); return None, f"Error: Gemini failed ({finish_str})."
353
- if finish_str.endswith("MAX_TOKENS"): logger.warning("[Gemini] Output truncated.")
354
- summary = ""; extracted = False # Extract text
355
- try: summary = response.text.strip(); extracted = True
356
- except Exception as e: logger.warning(f"[Gemini] response.text error: {e}. Trying parts."); if candidate.content and candidate.content.parts: summary="".join(p.text for p in candidate.content.parts if hasattr(p,"text")).strip(); extracted = True
357
- if not extracted or not summary: logger.warning(f"[Gemini] Empty summary despite finish '{finish_str}'."); return None, "Error: AI generated empty summary."
358
- logger.info(f"[Gemini] Summary extracted: len {len(summary)}."); return summary, None
359
- except Exception as e: logger.error(f"[Gemini] Error: {e}", exc_info=True); return None, f"Error: Failed communication with Gemini ({e})."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
 
361
  async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
 
362
  global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
363
  if not _openrouter_fallback_enabled: logger.error("[OR] Disabled."); return None, "Error: Fallback AI unavailable."
364
- if len(text) > 100000: logger.warning("[OR] Truncating input..."); text = text[:100000]
 
 
365
  logger.info(f"[OR] Generating {summary_type} summary ({OPENROUTER_MODEL}). Input len: {len(text)}")
366
- if summary_type == "paragraph": prompt = f"Summarise paragraph...\n{text}\nSummary:"
367
- elif summary_type == "points": prompt = f"Summarise points...\n{text}\nSummary:"
368
- else: logger.error(f"[OR] Invalid type: {summary_type}"); return None, "Error: Invalid summary type."
369
- headers={...}; payload={...}; api="https://openrouter.ai/api/v1/chat/completions" # Keep details
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  try:
371
- async with httpx.AsyncClient(timeout=120.0) as c:
372
- logger.debug("[OR] Sending request..."); r = await c.post(api, headers=headers, json=payload); logger.debug(f"[OR] Status {r.status_code}")
373
- if r.status_code == 200:
374
- try: # Parse success
375
- data=r.json(); choice=data.get("choices",[{}])[0]; msg=choice.get("message"); finish=choice.get("finish_reason","N/A")
376
- if msg and msg.get("content"): summary = msg["content"].strip()
377
- else: summary = None
378
- if summary: logger.info(f"[OR] Success: len {len(summary)}, Finish: {finish}"); if finish=='length': logger.warning("..."); return summary, None
379
- else: logger.warning(f"[OR] Empty summary content. Data:{data}"); return None, "Error: Fallback AI empty summary."
380
- except Exception as e: logger.error(f"[OR] Error processing success: {e}", exc_info=True); return None, f"Error: Failed processing Fallback AI response ({e})."
381
- else: # Handle errors
382
- err_msg = f"Error: Fallback AI ({OPENROUTER_MODEL}) status {r.status_code}."; try: details=r.json().get("error",{}).get("message",r.text[:200]); err_msg+=f" Details:{details}"; except Exception: err_msg+=f" Resp:{r.text[:200]}"; logger.error(f"[OR] {err_msg}"); return None, err_msg
383
- except Exception as e: logger.error(f"[OR] Unexpected error: {e}", exc_info=True); return None, f"Error: Unexpected issue with Fallback AI ({e})."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
  async def generate_summary(text: str, summary_type: str) -> str:
 
386
  global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
387
- logger.info(f"[Summary] Starting..."); err_msg = None
388
- if _gemini_primary_enabled: # Try Gemini
389
- logger.info(f"[Summary] Trying Gemini ({GEMINI_MODEL})")
390
- summary, err = await _call_gemini(text, summary_type)
391
- if summary: logger.info("[Summary] Success with Gemini."); return summary
392
- else: logger.warning(f"[Summary] Gemini failed: {err}. Falling back."); err_msg = f"Primary AI failed: {err}"
393
- else: logger.warning("[Summary] Gemini disabled."); err_msg = "Primary AI unavailable."
394
- if _openrouter_fallback_enabled: # Try OpenRouter
395
- logger.info(f"[Summary] Trying OpenRouter ({OPENROUTER_MODEL})")
396
- summary, err = await _call_openrouter(text, summary_type)
397
- if summary: logger.info("[Summary] Success with OpenRouter."); return summary
398
- else: logger.error(f"[Summary] OpenRouter also failed: {err}"); if err_msg: return f"{err_msg}\nFallback failed: {err}"; else: return f"Fallback AI failed: {err}"
399
- else: logger.error("[Summary] OpenRouter disabled."); if err_msg: return f"{err_msg}\nFallback unavailable."; else: return "Error: Both AI unavailable."
400
- logger.error("[Summary] Reached end unexpectedly."); final_err = err_msg or "Unknown error."; return f"Sorry, error: {final_err}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
 
402
  # --- Main Processing Task ---
403
  async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
404
- task_id = f"{user_id}-{message_id_to_edit or 'new'}"; logger.info(f"[Task {task_id}] Starting: {url}")
405
- bot: Optional[Bot] = None; content: Optional[str] = None; feedback: Optional[str] = None; success = False; original_msg_id = message_id_to_edit
406
- try: bot = Bot(token=bot_token, request=HTTPXRequest(...)) # Keep bot init
407
- except Exception as e: logger.critical(f"[Task {task_id}] Failed bot init: {e}", exc_info=True); return
408
- try: # Main task logic
409
- # Edit original msg to "Processing..."
410
- proc_msg = f"Generating '{summary_type}' summary...\nThis might take a moment..."
 
 
 
 
 
 
 
 
 
 
 
 
411
  if original_msg_id:
412
- try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=original_msg_id, text=proc_msg, ...); logger.debug(f"[Task {task_id}] Edited original msg {original_msg_id}")
413
- except Exception as e: logger.warning(f"[Task {task_id}] Failed edit original msg {original_msg_id}: {e}.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
  try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
415
  except Exception: pass
416
 
417
- # Get Content (YT or Web)
418
- is_yt = is_youtube_url(url); logger.debug(f"[Task {task_id}] Type: {'YT' if is_yt else 'Web'}")
419
- if is_yt: # YT Logic
 
420
  vid = extract_youtube_id(url)
421
  if vid: content = await get_youtube_transcript(vid, url)
422
- else: feedback = "Invalid YT URL."
423
- if not content and not feedback: feedback = "Could not get YT transcript."
424
- else: # Web Logic
425
- logger.info(f"[Task {task_id}] Trying Crawl4AI..."); content = await get_website_content_via_crawl4ai(url)
426
- if not content: logger.warning(f"[Task {task_id}] Crawl4AI fail. Trying BS4..."); try: await retry_bot_operation(bot.send_chat_action,...); except Exception: pass; content = await get_website_content_bs4(url)
427
- if not content: logger.warning(f"[Task {task_id}] BS4 fail. Trying API..."); global URLTOTEXT_API_KEY, _urltotext_fallback_enabled
428
- if not content and _urltotext_fallback_enabled: try: await retry_bot_operation(bot.send_chat_action,...); except Exception: pass; content = await get_website_content_via_api(url, URLTOTEXT_API_KEY); if not content: feedback = "Fetch failed (Crawl4AI/BS4/API fail/credits?)."
429
- elif not content: feedback = "Fetch failed (Crawl4AI/BS4 fail, API disabled)."
430
- if not content and not feedback: feedback = "Could not fetch website content."
431
-
432
- # Generate Summary
 
 
 
 
 
 
 
 
 
 
 
 
433
  if content and not feedback:
434
- logger.info(f"[Task {task_id}] Got content (len:{len(content)}). Generating summary..."); try: await retry_bot_operation(bot.send_chat_action,...); except Exception: pass
 
435
  final_summary = await generate_summary(content, summary_type)
436
- if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"): feedback = final_summary; logger.warning(f"[Task {task_id}] Summary gen failed: {feedback}"); success = False
437
- else: # Success - Split & Send
438
- parts = []; current = ""; lines = final_summary.splitlines(True) # Keep splitting
 
 
 
 
 
 
439
  for line in lines:
440
- if len(current) + len(line) > MAX_SUMMARY_CHUNK_SIZE:
441
- if current.strip(): parts.append(current.strip())
442
- current = line[:MAX_SUMMARY_CHUNK_SIZE] if len(line) > MAX_SUMMARY_CHUNK_SIZE else line
443
- else: current += line
444
- if current.strip(): parts.append(current.strip())
445
- if not parts: parts.append("Summary empty."); logger.warning(...)
446
- logger.info(f"[Task {task_id}] Summary OK (len:{len(final_summary)}). Sending {len(parts)} part(s).")
447
- edited = False # Edit original msg with first part
448
- if original_msg_id: try: await retry_bot_operation(bot.edit_message_text, chat_id, original_msg_id, parts[0], ...); logger.debug(f"[Task {task_id}] Edited original msg {original_msg_id} part 1."); edited = True; except Exception as e: logger.warning(f"[Task {task_id}] Failed edit original msg {original_msg_id} part 1: {e}. Sending new.")
449
- if not edited: sent = await retry_bot_operation(bot.send_message, chat_id, parts[0], ...); if not sent: feedback = "Failed send summary."; success = False; logger.error(...) # Send first part new if needed
450
- if success and len(parts) > 1: # Send remaining parts
451
- for i, part in enumerate(parts[1:], 2): await asyncio.sleep(0.5); try: await retry_bot_operation(bot.send_message, chat_id, part, ...); logger.debug(f"[Task {task_id}] Sent part {i}/{len(parts)}."); except Exception as e: feedback=f"Failed send part {i}."; success=False; logger.error(...); break
452
- if not feedback: success = True # Confirm success
453
-
454
- # Handle Failures
455
- if not success:
456
- if not feedback: feedback = "Unknown error."; logger.error(...)
457
- logger.warning(f"[Task {task_id}] Sending failure feedback: {feedback}")
458
- try: # Edit original msg with error
459
- edited_err = False
460
- if original_msg_id: try: await retry_bot_operation(bot.edit_message_text, chat_id, original_msg_id, feedback, ...); logger.debug(f"[Task {task_id}] Edited original msg {original_msg_id} with error."); edited_err=True; except Exception as e: logger.warning(f"[Task {task_id}] Failed edit original msg {original_msg_id} with error: {e}. Sending new.")
461
- if not edited_err: await retry_bot_operation(bot.send_message, chat_id, feedback, ...); logger.debug(f"[Task {task_id}] Sent error new msg.")
462
- except Exception as e: logger.error(f"[Task {task_id}] Failed even sending error feedback: {e}")
463
-
464
- except Exception as e: # Catch-all for task
465
- logger.error(f"[Task {task_id}] Unexpected task error: {e}", exc_info=True); success = False; feedback = "Oops! Unexpected error..."
466
- if bot: try: edited=False; if original_msg_id: try: await retry_bot_operation(bot.edit_message_text,...); edited=True; except Exception: pass; if not edited: await retry_bot_operation(bot.send_message,...) except Exception as final_e: logger.error(f"[Task {task_id}] Failed final crash error feedback: {final_e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
  finally: # Cleanup
468
- if bot and bot.request and hasattr(bot.request, '_client') and bot.request._client: try: await bot.request._client.aclose(); logger.debug(f"[Task {task_id}] BG client closed.") except Exception as e: logger.warning(f"[Task {task_id}] Error closing BG client: {e}")
469
- logger.info(f"[Task {task_id}] Completed. Success: {success}")
 
 
 
 
470
 
471
  # --- Telegram Handlers ---
472
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
@@ -491,10 +1014,13 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
491
  if not update.message or not update.message.text: return
492
  url = update.message.text.strip(); user = update.effective_user;
493
  if not user: return
494
- if not re.match(r'https?://[^\s/$.?#].[^\s]*', url, re.I): logger.debug(...); await update.message.reply_text("Invalid URL..."); return
 
 
495
  logger.info(f"User {user.id} sent URL: {url}")
496
- context.user_data['url_to_summarize'] = url; context.user_data['original_message_id'] = update.message.message_id
497
- keys = [[ InlineKeyboardButton("Para", callback_data="paragraph"), InlineKeyboardButton("Points", callback_data="points") ]]
 
498
  markup = InlineKeyboardMarkup(keys)
499
  await update.message.reply_html( f"Link:\n<code>{html.escape(url)}</code>\n\nSummary type?", reply_markup=markup, disable_web_page_preview=True )
500
 
@@ -504,12 +1030,17 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
504
  user = query.from_user; summary_type = query.data; qid = query.id
505
  try: await query.answer(); logger.debug(f"Ack cb {qid} from {user.id}")
506
  except Exception as e: logger.warning(f"Err answering cb {qid}: {e}")
 
507
  url = context.user_data.get('url_to_summarize'); msg_id = query.message.message_id
508
  logger.info(f"User {user.id} chose '{summary_type}' for msg {msg_id}. URL context: {'Yes' if url else 'No'}")
509
- if not url: logger.warning(f"No URL context user {user.id} (cb {qid})."); try: await query.edit_message_text("Context lost...", reply_markup=None); except Exception as e: logger.error(f"Failed edit 'URL not found': {e}"); return
 
 
 
 
510
 
511
  global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
512
- # Expanded checks from previous answer
513
  if not TELEGRAM_TOKEN:
514
  logger.critical("TG TOKEN missing!")
515
  try: await query.edit_message_text("❌ Bot config error (Token).", reply_markup=None)
@@ -523,88 +1054,163 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
523
  if not _gemini_primary_enabled: logger.warning("Primary AI unavailable, using fallback.")
524
  elif not _openrouter_fallback_enabled: logger.warning("Fallback AI unavailable, using primary.")
525
 
 
526
  logger.info(f"Scheduling task user {user.id}...")
527
  asyncio.ensure_future( process_summary_task( user_id=user.id, chat_id=query.message.chat_id, message_id_to_edit=msg_id, url=url, summary_type=summary_type, bot_token=TELEGRAM_TOKEN ) )
 
528
  context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None)
529
  logger.debug(f"Cleared context user {user.id} post-schedule.")
530
 
531
  async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
532
  ignore_errors = (AttributeError, BadRequest, TimedOut, NetworkError, RetryAfter)
533
- if isinstance(context.error, ignore_errors): # Keep ignore logic
534
- ignore_messages = ["message is not modified", ...]
535
  err_str = str(context.error).lower()
536
  if any(msg in err_str for msg in ignore_messages) or isinstance(context.error, (TimedOut, NetworkError, RetryAfter)):
537
- logger.warning(f"Ignoring known/transient error: {context.error}"); return
 
538
  logger.error("Exception handling update:", exc_info=context.error)
539
 
540
  # --- Application Setup ---
541
- # (setup_bot_config, lifespan, health_check, telegram_webhook, ASGI App Def, Dev Runner) - No changes needed
542
  async def setup_bot_config() -> Application:
543
  logger.info("Configuring Telegram App..."); global TELEGRAM_TOKEN
544
  if not TELEGRAM_TOKEN: raise ValueError("TG TOKEN missing.")
545
- req = HTTPXRequest(...); app = ( Application.builder().token(TELEGRAM_TOKEN).request(req).build() )
546
- app.add_handler(CommandHandler("start", start)); app.add_handler(CommandHandler("help", help_command))
 
 
547
  url_filter = filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link") | filters.Regex(r'https?://[^\s]+'))
548
- app.add_handler(MessageHandler(url_filter, handle_potential_url)); app.add_handler(CallbackQueryHandler(handle_summary_type_callback))
549
- app.add_error_handler(error_handler); logger.info("TG handlers configured."); return app
 
 
550
 
 
551
  @contextlib.asynccontextmanager
552
  async def lifespan(app: Starlette):
553
- global ptb_app, ...; logger.info("Lifespan: Startup..."); # Keep lifespan logic
554
- if not TELEGRAM_TOKEN: raise RuntimeError("TG token missing.")
555
  try:
556
- ptb_app = await setup_bot_config(); await ptb_app.initialize(); bot_info = await ptb_app.bot.get_me(); logger.info(f"Bot init: @{bot_info.username}")
557
- # Webhook setup logic remains the same
 
558
  current_info = await ptb_app.bot.get_webhook_info(); deleted_ok = True
559
- if current_info and current_info.url: logger.info(f"Deleting webhook: {current_info.url}..."); try: deleted_ok = await ptb_app.bot.delete_webhook(...); logger.info("WH deleted." if deleted_ok else "WH delete fail.") except Exception as e: logger.warning(f"WH delete err: {e}"); deleted_ok = False
 
 
 
 
560
  host = os.environ.get("SPACE_HOST"); path="/webhook";
561
  if not host: raise RuntimeError("SPACE_HOST missing.")
562
  wh_url = f"https://{host.split('://')[-1].rstrip('/')}{path}"
 
563
  if wh_url and deleted_ok:
564
- logger.info(f"Setting webhook: {wh_url}"); args = {...}; if WEBHOOK_SECRET: args["secret_token"] = WEBHOOK_SECRET;
565
- await asyncio.sleep(1.0);
 
 
566
  try:
567
- if not await ptb_app.bot.set_webhook(**args): raise RuntimeError("set_webhook False.")
568
  await asyncio.sleep(1.5); info = await ptb_app.bot.get_webhook_info()
569
  if not (info and info.url == wh_url): raise RuntimeError(f"WH verify fail! Expected '{wh_url}', Got: {info}")
570
- logger.info(f"WH set & verified: URL='{info.url}', Secret={'YES' if WEBHOOK_SECRET else 'NO'}"); if info.last_error_message: logger.warning(...)
571
  await ptb_app.start(); logger.info("PTB started (webhook).")
572
  except Exception as e: logger.error(f"FATAL: WH setup error: {e}", exc_info=True); raise RuntimeError(f"WH setup fail: {e}") from e
573
- elif not deleted_ok: raise RuntimeError("Failed previous WH delete.")
 
574
  logger.info("Lifespan: Startup complete."); yield
575
- except Exception as startup_err: logger.critical(f"Startup fail: {startup_err}", exc_info=True); raise
576
  finally: # Shutdown
577
  logger.info("Lifespan: Shutdown...");
578
- if ptb_app: try: if ptb_app.running: await ptb_app.stop(); if ptb_app._initialized: await ptb_app.shutdown(); logger.info("PTB stopped/shutdown.") except Exception as e: logger.error(...)
 
 
 
 
579
  logger.info("Lifespan: Shutdown complete.")
580
 
581
  async def health_check(request: Request) -> PlainTextResponse:
582
- global OPENROUTER_MODEL, ...; bot_status = "Not Initialized"; bot_username = "N/A" # Keep health check logic
 
583
  if ptb_app and ptb_app.bot and ptb_app._initialized:
584
- try: wh_info = await ptb_app.bot.get_webhook_info(); ... # Keep status check
585
- except Exception as e: ...; bot_status = f"Error: {e}"
 
 
 
 
 
 
 
 
586
  elif ptb_app: bot_status = "Initializing..."
587
- info = [ f"=== Bot Status ===", f"Application: {bot_status}", "--- Services ---", f"Web Scraper: {'Crawl4AI (ignore_robots)' if _crawl4ai_primary_web_enabled else 'DISABLED'}", ...] # Keep health info format
588
- return PlainTextResponse("\n".join(info))
 
 
 
 
 
 
 
 
 
 
 
589
 
590
  async def telegram_webhook(request: Request) -> Response:
591
- global WEBHOOK_SECRET, ptb_app # Keep webhook logic
592
- if not ptb_app or not ptb_app._initialized or not ptb_app.running: status = ...; logger.error(...); return PlainTextResponse(f'Bot {status}', 503)
593
- if WEBHOOK_SECRET: ... # Keep secret check
594
- try: data = await request.json(); update = Update.de_json(data, ptb_app.bot); logger.debug(...); await ptb_app.process_update(update); return Response(200)
595
- except json.JSONDecodeError: logger.error(...); return PlainTextResponse('Bad Request', 400)
596
- except Exception as e: logger.error(f"Webhook proc error: {e}", exc_info=True); return Response(200) # OK to TG
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
 
598
  # --- ASGI App Definition ---
599
- app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", health_check), Route("/webhook", telegram_webhook, methods=["POST"]), ] )
 
 
 
 
 
 
 
600
  logger.info("Starlette ASGI application created.")
601
 
602
  # --- Development Runner ---
603
  if __name__ == '__main__':
604
- import uvicorn; logger.warning("Running DEV mode...") # Keep dev runner
605
- log_level = ...; port = ...;
606
- try: from dotenv import load_dotenv; load_dotenv(); logger.info(".env loaded.")
607
- except ImportError: logger.info(".env not loaded.")
608
- if not get_secret('TELEGRAM_TOKEN'): logger.critical("Local Dev: TOKEN missing.") # Keep checks
 
 
 
 
 
 
 
609
  if not get_secret('GEMINI_API_KEY'): logger.error("Local Dev: GEMINI_API_KEY missing.")
610
  uvicorn.run( "main:app", host='0.0.0.0', port=port, log_level=log_level, reload=True )
 
53
  # --- Google Gemini ---
54
  try:
55
  import google.generativeai as genai
56
+ # Import specific types needed, check library for exact names if errors occur
57
  from google.generativeai.types import HarmCategory, HarmBlockThreshold, GenerateContentResponse
58
  _gemini_available = True
59
  except ImportError:
 
74
# Quieten framework loggers; keep crawl4ai at INFO for crawl diagnostics.
logging.getLogger('uvicorn').setLevel(logging.INFO)
logging.getLogger('starlette').setLevel(logging.INFO)
if _gemini_available: logging.getLogger("google.ai.generativelanguage").setLevel(logging.WARNING)
# Suppress noisy crawl4ai/playwright logs if needed
logging.getLogger("crawl4ai").setLevel(logging.INFO)  # Adjust level as needed
logging.getLogger("playwright").setLevel(logging.WARNING)
80
 
 
90
  # --- Environment Variable Loading & Configuration ---
91
  logger.info("Attempting to load secrets and configuration...")
92
def get_secret(secret_name: str) -> Optional[str]:
    """Read a secret from the environment, logging only its presence.

    Args:
        secret_name: Name of the environment variable to read.

    Returns:
        The value of the variable, or None when it is unset.
    """
    value = os.environ.get(secret_name)
    if value:
        # SECURITY: never log any portion of the secret itself. The previous
        # version logged the first 8 characters of every key at INFO level,
        # which leaks credential prefixes into log storage.
        logger.info(f"Secret '{secret_name}': Found")
    else:
        logger.warning(f"Secret '{secret_name}': Not Found")
    return value
104
 
105
# --- Secrets (loaded once at import time) ---
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY')  # Fallback Summarizer
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')    # Fallback Web Scraper 2
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')      # Fallback YT Transcript 1
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')        # Fallback YT Transcript 2
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
GEMINI_API_KEY = get_secret('GEMINI_API_KEY')          # Primary Summarizer

# Models (user-configurable via env vars; defaults below)
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free")  # Fallback Model
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001")  # Primary Model
117
 
118
# --- Configuration Checks ---
# Only the Telegram token is fatal; every other integration degrades to a
# fallback (or is disabled) with a logged warning when its key is missing.
if not TELEGRAM_TOKEN:
    logger.critical("❌ FATAL: TELEGRAM_TOKEN not found.")
    raise RuntimeError("Exiting: Telegram token missing.")
if not GEMINI_API_KEY:
    logger.error("❌ ERROR: GEMINI_API_KEY not found. Primary summarization (Gemini) will fail.")
if not OPENROUTER_API_KEY:
    logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback summarization will fail.")

# Feature flags derived from library availability + configured keys.
_gemini_primary_enabled = _gemini_available and bool(GEMINI_API_KEY)
if not _gemini_available:
    logger.warning("⚠️ WARNING: google-generativeai library missing. Gemini disabled.")
elif not GEMINI_API_KEY:
    logger.warning("⚠️ WARNING: GEMINI_API_KEY not found or empty. Gemini disabled.")

_openrouter_fallback_enabled = bool(OPENROUTER_API_KEY)
if not _openrouter_fallback_enabled:
    logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback disabled.")

_crawl4ai_primary_web_enabled = _crawl4ai_available
if not _crawl4ai_primary_web_enabled:
    logger.warning("⚠️ WARNING: crawl4ai library missing. Primary Web Scraper disabled.")

_urltotext_fallback_enabled = bool(URLTOTEXT_API_KEY)
if not _urltotext_fallback_enabled:
    logger.info("ℹ️ INFO: URLTOTEXT_API_KEY not found. Fallback Web Scraper 2 (API) disabled.")
else:
    logger.info("ℹ️ INFO: URLTOTEXT_API_KEY found. Fallback Web Scraper 2 (API) enabled.")

if not SUPADATA_API_KEY:
    logger.info("ℹ️ INFO: SUPADATA_API_KEY not found. Fallback YT Transcript 1 (API) disabled.")
if not APIFY_API_TOKEN:
    logger.info("ℹ️ INFO: APIFY_API_TOKEN not found. Fallback YT Transcript 2 (API) disabled.")
if not WEBHOOK_SECRET:
    logger.info("ℹ️ INFO: Optional secret 'WEBHOOK_SECRET' not found. Webhook security disabled.")

# Startup summary of which providers are active.
logger.info("Secret loading and configuration check finished.")
logger.info(f"Primary Web Scraper: {'Crawl4AI' if _crawl4ai_primary_web_enabled else 'DISABLED'}")
logger.info(f"Fallback Web Scraper 1: BeautifulSoup (Always available)")
logger.info(f"Fallback Web Scraper 2: urltotext.com API {'ENABLED' if _urltotext_fallback_enabled else 'DISABLED'}")
logger.info(f"Primary Summarizer: Gemini ({GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'})")
logger.info(f"Fallback Summarizer: OpenRouter ({OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'})")
logger.info(f"Primary YT Transcript: youtube-transcript-api (Always available)")
logger.info(f"Fallback YT Transcript 1: Supadata API {'ENABLED' if SUPADATA_API_KEY else 'DISABLED'}")
logger.info(f"Fallback YT Transcript 2: Apify REST API {'ENABLED' if APIFY_API_TOKEN else 'DISABLED'}")
_apify_token_exists = bool(APIFY_API_TOKEN)  # Keep this for health check

# Configure the Gemini client eagerly; disable the flag if it fails so the
# summarizer falls back to OpenRouter at runtime.
if _gemini_primary_enabled:
    try:
        genai.configure(api_key=GEMINI_API_KEY)
        logger.info("Google GenAI client configured successfully.")
    except Exception as e:
        logger.error(f"Failed to configure Google GenAI client: {e}")
        _gemini_primary_enabled = False

# --- Constants ---
MAX_SUMMARY_CHUNK_SIZE = 4000   # per-message chunk size (presumably kept under Telegram's 4096-char limit — TODO confirm)
MAX_INPUT_TOKEN_APPROX = 500000
163
 
164
  # --- Retry Decorator ---
165
@retry(
    stop=stop_after_attempt(4),
    wait=wait_exponential(multiplier=1, min=2, max=15),
    retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    reraise=True
)
async def retry_bot_operation(func, *args, **kwargs):
    """Invoke a Telegram bot API call with automatic retries.

    Retries up to 4 attempts with exponential backoff (2s–15s) on transient
    failures (network errors, rate limits, timeouts) and on BadRequest.
    Known-benign BadRequests (e.g. "message is not modified") are swallowed
    and reported as None so callers can treat them as no-ops.

    Args:
        func: Bot coroutine function to call (e.g. ``bot.send_message``).
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The result of ``func``, or None when a benign BadRequest occurred.

    Raises:
        TelegramError: Re-raised after retries are exhausted (``reraise=True``),
            or immediately for error types outside the retry list.
    """
    try:
        return await func(*args, **kwargs)
    except BadRequest as e:
        # These BadRequests are expected during normal operation and should
        # neither bubble up nor trigger another attempt.
        ignore_errors = [
            "message is not modified", "query is too old", "message to edit not found",
            "chat not found", "bot was blocked by the user",
        ]
        if any(err in str(e).lower() for err in ignore_errors):
            logger.warning(f"Ignoring non-critical BadRequest: {e}")
            return None
        logger.error(f"Potentially critical BadRequest: {e}")
        raise  # BadRequest is in the retry list, so tenacity will retry it
    except TelegramError as e:
        if isinstance(e, (TimedOut, NetworkError, RetryAfter)):
            logger.warning(f"Telegram transient error (will retry): {e}")
        else:
            logger.error(f"Unhandled TelegramError: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error during bot operation: {e}", exc_info=True)
        raise
 
194
 
195
  # --- Helper Functions ---
196
def is_youtube_url(url):
    """Return True if *url* looks like a YouTube video link.

    Matches youtube.com / youtu.be / youtube-nocookie.com watch, embed,
    v, shorts, live and attribution-link URLs carrying an 11-character
    video ID.
    """
    log = logging.getLogger(__name__)
    # FIX: '.' and '?' are escaped so they match literally. The unescaped
    # form treated them as regex metacharacters ('watch?v=' made the 'h'
    # optional; 'youtu.be' matched any character), causing over-matching.
    youtube_regex = re.compile(
        r'(?:https?://)?(?:www\.)?(?:m\.)?'
        r'(?:youtube(?:-nocookie)?\.com|youtu\.be)/'
        r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?'
        r'([\w-]{11})(?:\S+)?',
        re.IGNORECASE,
    )
    match = youtube_regex.search(url)
    log.debug(f"is_youtube_url '{url}': {bool(match)}")
    return bool(match)
201
+
202
def extract_youtube_id(url):
    """Extract the 11-character YouTube video ID from *url*.

    Returns:
        The video ID string, or None when the URL does not match any known
        YouTube URL shape.
    """
    log = logging.getLogger(__name__)
    # FIX: '.' and '?' escaped so they match literally (the unescaped
    # pattern over-matched; see is_youtube_url for details).
    youtube_regex = re.compile(
        r'(?:https?://)?(?:www\.)?(?:m\.)?'
        r'(?:youtube(?:-nocookie)?\.com|youtu\.be)/'
        r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?'
        r'([\w-]{11})(?:\S+)?',
        re.IGNORECASE,
    )
    match = youtube_regex.search(url)
    if match:
        video_id = match.group(1)
        log.debug(f"Extracted YT ID '{video_id}' from {url}")
        return video_id
    log.warning(f"Could not extract YT ID from {url}")
    return None
212
 
213
  # --- Content Fetching Functions ---
214
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
    """Fetch a YouTube transcript from the Supadata API (Fallback YT 1).

    Args:
        video_id: 11-character YouTube video ID.
        api_key: Supadata API key (sent as X-API-Key header).

    Returns:
        Stripped transcript text, or None on any failure (missing inputs,
        auth error, 404, timeout, network error, empty response).
    """
    if not video_id:
        logger.error("[Supadata] No video_id provided")
        return None
    if not api_key:
        logger.error("[Supadata] API key missing.")
        return None
    logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}")
    api_endpoint = "https://api.supadata.ai/v1/youtube/transcript"
    # Note: Param name might be 'videoId' based on earlier logs, adjust if needed
    params = {"videoId": video_id, "format": "text"}
    headers = {"X-API-Key": api_key}
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(api_endpoint, headers=headers, params=params)
            logger.debug(f"[Supadata] Status code {response.status_code} for {video_id}")
            if response.status_code == 200:
                try:
                    # Attempt to decode JSON, fall back to raw text if needed
                    try:
                        data = response.json()
                    except json.JSONDecodeError:
                        data = None
                    content = None
                    # Check various possible response structures
                    if data:
                        content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
                    # If JSON parsing failed or content key not found, try raw text
                    if not content and response.text:
                        content = response.text
                    if content and isinstance(content, str):
                        logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}")
                        return content.strip()
                    logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}")
                    return None
                except Exception as e:
                    logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True)
                    return None
            elif response.status_code in [401, 403]:
                logger.error(f"[Supadata] Auth error ({response.status_code}). Check API key.")
                return None
            elif response.status_code == 404:
                logger.warning(f"[Supadata] Not found (404) for {video_id}.")
                return None
            else:
                logger.error(f"[Supadata] Unexpected status {response.status_code} for {video_id}. Resp: {response.text[:200]}")
                return None
    except httpx.TimeoutException:
        logger.error(f"[Supadata] Timeout connecting for {video_id}")
        return None
    except httpx.RequestError as e:
        # Log specific errors like SSL verification failure
        if "CERTIFICATE_VERIFY_FAILED" in str(e):
            logger.error(f"[Supadata] SSL Cert Verify Failed for {video_id}: {e}")
        else:
            logger.error(f"[Supadata] Request error for {video_id}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True)
        return None
265
 
266
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
    """Fetch a YouTube transcript via an Apify actor (Fallback YT 2).

    Calls the actor's synchronous run-sync-get-dataset-items REST endpoint
    and probes several possible dataset-item shapes for the transcript.

    Args:
        video_url: Full YouTube video URL (the actor takes URLs, not IDs).
        api_token: Apify API token (passed as a query parameter).

    Returns:
        Stripped transcript text, or None on any failure.
    """
    global APIFY_ACTOR_ID
    if not video_url:
        logger.error("[Apify] No video_url provided")
        return None
    if not api_token:
        logger.error("[Apify] API token missing.")
        return None
    logger.info(f"[Apify] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
    sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
    params = {"token": api_token}
    # Full actor input: disable all metadata fields, request one plain string.
    payload = {
        "urls": [video_url],
        "outputFormat": "singleStringText",
        "maxRetries": 5,
        "channelHandleBoolean": False,
        "channelNameBoolean": False,
        "datePublishedBoolean": False,
        "relativeDateTextBoolean": False,
    }
    headers = {"Content-Type": "application/json"}
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            log_headers = {k: v for k, v in headers.items()}  # Avoid logging token in params
            logger.debug(f"[Apify] POST Request Details:\nURL: {sync_items_endpoint}\nParams: {params}\nHeaders: {log_headers}\nPayload: {json.dumps(payload)}")
            response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
            logger.debug(f"[Apify] Received status code {response.status_code} for {video_url}")
            if response.status_code == 200:
                try:
                    results = response.json()
                    if isinstance(results, list) and len(results) > 0:
                        item = results[0]
                        content = None
                        # Prioritize specific keys, fall back to others
                        if "captions" in item and isinstance(item["captions"], str):
                            content = item["captions"]
                        elif "text" in item and isinstance(item["text"], str):
                            content = item["text"]
                        elif "transcript" in item and isinstance(item["transcript"], str):
                            content = item["transcript"]
                        elif "captions" in item and isinstance(item["captions"], list):
                            # Handle a list of caption segments (dicts or strings)
                            if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]:
                                content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
                            elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str):
                                content = " ".join(item["captions"])
                        # Final check if content was found and is a string
                        if content and isinstance(content, str):
                            logger.info(f"[Apify] Success via REST for {video_url}. Length: {len(content)}")
                            return content.strip()
                        logger.warning(f"[Apify] Dataset item parsed but transcript content empty/invalid format for {video_url}. Item keys: {list(item.keys())}")
                        return None
                    logger.warning(f"[Apify] Actor success but dataset was empty for {video_url}. Response: {results}")
                    return None
                except json.JSONDecodeError:
                    logger.error(f"[Apify] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}")
                    return None
                except Exception as e:
                    logger.error(f"[Apify] Error processing success response for {video_url}: {e}", exc_info=True)
                    return None
            elif response.status_code == 400:
                logger.error(f"[Apify] Bad Request (400) for {video_url}. Check payload. Resp:{response.text[:200]}")
                return None
            elif response.status_code == 401:
                logger.error("[Apify] Auth error (401). Check token.")
                return None
            elif response.status_code == 404:
                # Try to surface Apify's error message for easier diagnosis
                error_info = "Unknown 404 Error"
                try:
                    error_data = response.json()
                    error_info = error_data.get("error", {}).get("message", "No specific message")
                except Exception:
                    error_info = response.text[:200]
                logger.error(f"[Apify] Endpoint/Actor Not Found (404). Error: '{error_info}'")
                return None
            else:
                logger.error(f"[Apify] Unexpected status {response.status_code} for {video_url}. Resp:{response.text[:200]}")
                return None
    except httpx.TimeoutException as e:
        logger.error(f"[Apify] Timeout during API interaction for {video_url}: {e}")
        return None
    except httpx.HTTPStatusError as e:
        logger.error(f"[Apify] HTTP Status Error during API interaction for {video_url}: {e}")
        return None
    except httpx.RequestError as e:
        logger.error(f"[Apify] Request error during API interaction for {video_url}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Apify] Unexpected error during Apify REST call for {video_url}: {e}", exc_info=True)
        return None
352
 
353
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
    """Get a YouTube transcript, trying three providers in order.

    Order: youtube-transcript-api (primary), Supadata API (fallback 1),
    Apify actor REST API (fallback 2).

    Args:
        video_id: 11-character video ID (used by primary and Supadata).
        video_url: Full video URL (used by Apify).

    Returns:
        Stripped transcript text, or None if every provider fails.
    """
    global SUPADATA_API_KEY, APIFY_API_TOKEN
    if not video_id:
        logger.error("YT transcript: No video_id provided")
        return None
    logger.info(f"Fetching YT transcript for video ID: {video_id} (URL: {video_url})")
    transcript_text = None
    logger.info("[Primary YT] Attempting youtube-transcript-api...")
    try:
        # FIX: run the ENTIRE blocking call chain inside the worker thread.
        # Previously only the final bound `.fetch` method was handed to
        # asyncio.to_thread, so list_transcripts() and
        # find_generated_transcript() — both of which perform network I/O —
        # executed synchronously on the event loop and blocked it.
        def _fetch_generated_transcript():
            return YouTubeTranscriptApi.list_transcripts(video_id).find_generated_transcript(['en', 'en-GB', 'en-US']).fetch()

        transcript_list = await asyncio.to_thread(_fetch_generated_transcript)
        if transcript_list:
            transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
        if transcript_text:
            logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})")
            return transcript_text.strip()
        logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}")
        transcript_text = None
    except TranscriptsDisabled:
        logger.warning(f"[Primary YT] Transcripts are disabled for video {video_id}")
        transcript_text = None
    except NoTranscriptFound:
        logger.warning(f"[Primary YT] No English transcript found for video {video_id}")
        transcript_text = None
    except Exception as e:
        logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}")
        transcript_text = None

    # Fallback 1: Supadata (requires API key)
    if transcript_text is None:
        logger.info("[Fallback YT 1] Trying Supadata API...")
        if SUPADATA_API_KEY:
            transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
            if transcript_text:
                logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}")
                return transcript_text  # Already stripped
            logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
        else:
            logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")

    # Fallback 2: Apify (requires API token)
    if transcript_text is None:
        logger.info("[Fallback YT 2] Trying Apify REST API...")
        if APIFY_API_TOKEN:
            transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
            if transcript_text:
                logger.info(f"[Fallback YT 2] Success via Apify REST for {video_url}")
                return transcript_text  # Already stripped
            logger.warning(f"[Fallback YT 2] Apify REST failed or no content for {video_url}.")
        else:
            logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")

    if transcript_text is None:
        logger.error(f"All methods failed for YT transcript: {video_id}")
        return None
    return transcript_text  # Should be stripped if found
411
 
412
async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
    """Fetch website content using Crawl4AI (primary web scraper).

    Args:
        url: Page URL to crawl.

    Returns:
        Extracted markdown (preferred) or plain text, stripped; None on
        failure or when the crawl yields no content.
    """
    global _crawl4ai_primary_web_enabled
    if not _crawl4ai_primary_web_enabled:
        logger.error("[Crawl4AI] Lib not available.")
        return None
    if not url:
        logger.error("[Crawl4AI] No URL provided.")
        return None
    logger.info(f"[Crawl4AI] Attempting crawl: {url}")
    try:
        # ignore_robots=True bypasses the robots.txt cache/permission issues
        async with AsyncWebCrawler(ignore_robots=True) as crawler:
            logger.info(f"[Crawl4AI] Initialized crawler (ignore_robots=True).")
            result = await crawler.arun(url=url, crawler_strategy="playwright", timeout=90)

            content = None
            if result and result.markdown:
                content = result.markdown.strip()
            elif result and result.text:  # Fallback if markdown is missing
                content = result.text.strip()

            if content:
                logger.info(f"[Crawl4AI] Success crawling {url}. Content length: {len(content)}")
                return content
            logger.warning(f"[Crawl4AI] Crawl successful for {url}, but extracted content (markdown/text) is empty.")
            return None
    except asyncio.TimeoutError:
        logger.error(f"[Crawl4AI] Timeout occurred while crawling {url}")
        return None
    except PermissionError as e:  # Should not happen with ignore_robots=True, but keep for logging
        logger.error(f"[Crawl4AI] Permission denied during crawl for {url}. Path: '{e.filename}'. Error: {e}", exc_info=True)
        return None
    except Exception as e:
        logger.error(f"[Crawl4AI] Unexpected error during crawl for {url}: {e}", exc_info=True)
        return None
445
 
446
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
    """Fetch raw HTML for the BeautifulSoup fallback scraper.

    Sends a browser-like GET request (desktop Chrome headers) following
    redirects, and returns the body only when the response is HTML.

    Args:
        url: Page URL to fetch.
        timeout: Per-request timeout in seconds.

    Returns:
        Decoded HTML text, or None for non-HTML content or on any error.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',  # Allow image/webp
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'DNT': '1',  # Do Not Track
        'Upgrade-Insecure-Requests': '1'
    }
    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
            logger.debug(f"[BS4 Fetch] Sending request to {url}")
            response = await client.get(url)
            logger.debug(f"[BS4 Fetch] Received response {response.status_code} from {url}")
            response.raise_for_status()  # Raise HTTP errors
            content_type = response.headers.get('content-type', '').lower()
            if 'html' not in content_type:
                logger.warning(f"[BS4 Fetch] Non-HTML content type from {url}: {content_type}")
                return None
            try:
                # Let httpx handle decoding
                return response.text
            except Exception as e:
                logger.error(f"[BS4 Fetch] Error reading response text for {url}: {e}")
                return None
    except httpx.HTTPStatusError as e:
        logger.error(f"[BS4 Fetch] HTTP error {e.response.status_code} fetching {url}: {e}")
    except httpx.TimeoutException:
        logger.error(f"[BS4 Fetch] Timeout error fetching {url}")
    except httpx.TooManyRedirects:
        logger.error(f"[BS4 Fetch] Too many redirects fetching {url}")
    except httpx.RequestError as e:
        logger.error(f"[BS4 Fetch] Request error fetching {url}: {e}")
    except Exception as e:
        logger.error(f"[BS4 Fetch] Unexpected error fetching {url}: {e}", exc_info=True)
    return None
483
 
484
async def get_website_content_bs4(url: str) -> Optional[str]:
    """Fetch and parse website text using BeautifulSoup (Fallback 1).

    Downloads the page via fetch_url_content_for_scrape, strips noise
    elements, locates the main content area via common selectors, and
    returns whitespace-normalized text.

    Args:
        url: Page URL to scrape.

    Returns:
        Cleaned page text, or None on fetch/parse failure or empty result.
    """
    if not url:
        logger.error("[BS4] No URL provided")
        return None
    logger.info(f"[BS4] Attempting basic fetch & parse for: {url}")
    html_content = await fetch_url_content_for_scrape(url)
    if not html_content:
        logger.warning(f"[BS4] Failed to fetch HTML for {url}")
        return None

    try:
        # Inner function for parsing (runs in a worker thread)
        def parse_html(content):
            soup = BeautifulSoup(content, DEFAULT_PARSER)
            # FIX: soup(...) / find_all matches tag NAMES only, so CSS class
            # selectors like ".advertisement" in the original removal list
            # silently never matched. Remove tags and class-based elements
            # through the appropriate APIs.
            noise_tags = ["script", "style", "header", "footer", "nav", "aside", "form",
                          "button", "input", "iframe", "img", "svg", "link", "meta",
                          "noscript", "figure", "figcaption", "video", "audio"]
            for element in soup(noise_tags):
                try:
                    element.extract()
                except Exception:
                    pass  # Ignore if element already removed
            for css_sel in [".advertisement", ".ad", ".sidebar", ".popup", ".modal"]:
                try:
                    for element in soup.select(css_sel):
                        element.extract()
                except Exception:
                    pass
            # Try various selectors for the main content area
            selectors = ['main', 'article', '[role="main"]', '#content', '.content',
                         '#main-content', '.main-content', '#body', '.body',
                         '#article-body', '.article-body', '.post-content',
                         '.entry-content', '.page-content']
            target_element = None
            for selector in selectors:
                try:
                    target_element = soup.select_one(selector)
                    if target_element:
                        break
                except Exception as sel_e:
                    logger.warning(f"[BS4] Invalid selector '{selector}': {sel_e}")
                    continue
            # Fall back to <body> if no main element found
            if not target_element:
                target_element = soup.body
            if not target_element:
                logger.warning(f"[BS4] Could not find body or main content area for parsing {url}")
                return None
            # Extract text and normalize whitespace
            lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
            text = " ".join(lines)  # Join lines with single spaces
            text = re.sub(r'\s{2,}', ' ', text).strip()  # Consolidate multiple spaces
            if not text:
                logger.warning(f"[BS4] Extracted text is empty after cleaning for {url}")
                return None
            return text

        # Run parsing in a separate thread (BeautifulSoup is CPU-bound)
        text_content = await asyncio.to_thread(parse_html, html_content)
        if text_content:
            logger.info(f"[BS4] Success scrape/parse for {url} (final len: {len(text_content)})")
            return text_content
        logger.warning(f"[BS4] Parsing resulted in empty content for {url}")
        return None
    except Exception as e:
        logger.error(f"[BS4] Error scraping/parsing {url}: {e}", exc_info=True)
        return None
539
 
540
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
    """Fetches website content using urltotext.com API (Fallback 2)."""
    if not url:
        logger.error("[API] No URL provided")
        return None
    if not api_key:
        logger.error("[API] urltotext.com API key missing.")
        return None

    logger.info(f"[API] Attempting fetch via urltotext.com for: {url}")
    api_endpoint = "https://urltotext.com/api/v1/urltotext/"
    headers = {"Authorization": f"Token {api_key}", "Content-Type": "application/json"}
    payload = {
        "url": url,
        "output_format": "text",
        "extract_main_content": True,
        "render_javascript": True,
        "residential_proxy": False,
    }

    try:
        async with httpx.AsyncClient(timeout=45.0) as client:
            logger.debug(f"[API] Sending request to urltotext.com API for {url}")
            response = await client.post(api_endpoint, headers=headers, json=payload)
            logger.debug(f"[API] Received status {response.status_code} from urltotext.com API for {url}")

            if response.status_code == 200:
                try:
                    data = response.json()
                except json.JSONDecodeError:
                    logger.error(f"[API] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}")
                    return None
                try:
                    content = data.get("data", {}).get("content")
                    credits = data.get("credits_used", "N/A")
                    warning = data.get("data", {}).get("warning")
                    if warning:
                        logger.warning(f"[API] urltotext.com API Warning for {url}: {warning}")
                    if not content:
                        logger.warning(f"[API] urltotext.com API success but content empty for {url}. Resp: {data}")
                        return None
                    logger.info(f"[API] Success via urltotext.com API for {url}. Len: {len(content)}. Credits: {credits}")
                    return content.strip()
                except Exception as e:
                    logger.error(f"[API] Error processing urltotext.com success response for {url}: {e}", exc_info=True)
                    return None

            # Non-200: log the most specific diagnosis we can, then give up.
            if response.status_code == 402:
                logger.error(f"[API] Error 402 (Insufficient Credits) from urltotext.com API for {url}. Resp:{response.text[:200]}")
            elif response.status_code == 400 and "url" in response.text.lower():
                logger.error(f"[API] Error 400 (Likely Bad URL) from urltotext.com API for {url}. Resp:{response.text[:200]}")
            elif response.status_code in (400, 401, 403, 422, 500):
                logger.error(f"[API] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}")
            else:
                logger.error(f"[API] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}")
            return None
    except httpx.TimeoutException:
        logger.error(f"[API] Timeout connecting to urltotext.com API for {url}")
        return None
    except httpx.RequestError as e:
        logger.error(f"[API] Request error connecting to urltotext.com API for {url}: {e}")
        return None
    except Exception as e:
        logger.error(f"[API] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True)
        return None
 
570
  # --- Summarization Functions ---
571
async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
    """ Calls the Google Gemini API to generate a summary.

    Args:
        text: Source text (truncated to MAX_INPUT_TOKEN_APPROX characters).
        summary_type: "paragraph" or "points".

    Returns:
        Tuple (summary_text, error_message); exactly one element is non-None.
    """
    global GEMINI_MODEL, _gemini_primary_enabled
    if not _gemini_primary_enabled:
        logger.error("[Gemini] Disabled.")
        return None, "Error: Primary AI service unavailable."
    if len(text) > MAX_INPUT_TOKEN_APPROX:
        logger.warning(f"[Gemini] Truncating input ({len(text)} > {MAX_INPUT_TOKEN_APPROX})")
        text = text[:MAX_INPUT_TOKEN_APPROX]
    logger.info(f"[Gemini] Generating {summary_type} summary using {GEMINI_MODEL}. Input len: {len(text)}")

    if summary_type == "paragraph":
        prompt = f"""Please summarise the following text into a concise paragraph. Focus on the main points and key information. Avoid unnecessary jargon or overly complex sentences.

Text to summarise:
---
{text}
---

Concise Paragraph Summary:"""
    elif summary_type == "points":
        prompt = f"""Please summarise the following text into a list of key bullet points. Each point should capture a distinct main idea or important piece of information. Aim for clarity and conciseness.

Text to summarise:
---
{text}
---

Key Bullet Points Summary:"""
    else:
        logger.error(f"[Gemini] Invalid summary type: {summary_type}")
        return None, f"Error: Invalid summary type '{summary_type}' specified."

    # Safety filtering is intentionally disabled (BLOCK_NONE) for every specific
    # harm category. BUGFIX: HARM_CATEGORY_UNSPECIFIED must be excluded — the
    # Gemini API rejects safety settings that reference the unspecified
    # category (the previous hasattr() filter was always true and excluded nothing).
    safety_settings = {
        category: HarmBlockThreshold.BLOCK_NONE
        for category in HarmCategory
        if category != HarmCategory.HARM_CATEGORY_UNSPECIFIED
    }
    logger.info("[Gemini] Safety settings disabled (BLOCK_NONE).")

    generation_config = genai.types.GenerationConfig(max_output_tokens=2048, temperature=0.7)

    try:
        model = genai.GenerativeModel(GEMINI_MODEL)
        logger.debug(f"[Gemini] Sending request...")
        response: GenerateContentResponse = await model.generate_content_async(
            prompt,
            generation_config=generation_config,
            safety_settings=safety_settings,
        )

        # 1. No candidates at all means the *prompt* was blocked (core harms can
        #    still trigger even with BLOCK_NONE).
        if not response.candidates:
            block_reason = "Unknown"
            safety_ratings_str = "N/A"
            if hasattr(response, 'prompt_feedback') and response.prompt_feedback:
                block_reason = str(response.prompt_feedback.block_reason or "Not specified")
                if response.prompt_feedback.safety_ratings:
                    safety_ratings_str = ', '.join([f"{r.category.name}: {r.probability.name}" for r in response.prompt_feedback.safety_ratings])
            error_msg = f"Error: Gemini response blocked (Prompt). Reason: {block_reason}. Safety: {safety_ratings_str}"
            logger.error(f"[Gemini] {error_msg}")
            return None, error_msg

        # 2. Check the candidate's finish reason.
        candidate = response.candidates[0]
        finish_reason_val = candidate.finish_reason
        finish_reason_str = str(finish_reason_val).upper() if finish_reason_val is not None else "UNSPECIFIED"
        logger.debug(f"[Gemini] Finish reason value: {finish_reason_val} -> {finish_reason_str}")

        candidate_safety_ratings_str = "N/A"
        if hasattr(candidate, 'safety_ratings') and candidate.safety_ratings:
            candidate_safety_ratings_str = ', '.join([f"{r.category.name}: {r.probability.name}" for r in candidate.safety_ratings])

        # Only STOP and MAX_TOKENS count as success; compare string suffixes to
        # tolerate enum-prefix differences between library versions.
        success = any(finish_reason_str.endswith(reason) for reason in ["STOP", "MAX_TOKENS"])
        if not success:
            # SAFETY, RECITATION, OTHER, UNSPECIFIED are all treated as errors.
            error_msg = f"Error: Gemini generation failed or finished unexpectedly. Reason: {finish_reason_str}. Safety: {candidate_safety_ratings_str}"
            logger.error(f"[Gemini] {error_msg}")
            return None, error_msg

        # Log a warning if truncated, but still return the partial output.
        if finish_reason_str.endswith("MAX_TOKENS"):
            logger.warning("[Gemini] Output may be truncated due to max_tokens limit.")

        # 3. Extract text (prefer the .text shortcut, fall back to parts).
        summary_text = ""
        extracted = False
        try:
            summary_text = response.text.strip()
            extracted = True
        except Exception as e:
            logger.warning(f"[Gemini] Error accessing response.text: {e}. Trying parts.")
            if candidate.content and candidate.content.parts:
                summary_text = "".join(part.text for part in candidate.content.parts if hasattr(part, "text")).strip()
                extracted = True

        # Guard against an empty summary even after a successful finish reason.
        if not extracted or not summary_text:
            logger.warning(f"[Gemini] Gemini returned empty summary despite finish reason '{finish_reason_str}'.")
            return None, "Error: AI generated an empty summary."

        logger.info(f"[Gemini] Summary extracted successfully (len: {len(summary_text)}).")
        return summary_text, None

    except AttributeError as e:
        logger.error(f"[Gemini] Attribute error accessing Gemini response: {e}. Structure might have changed.", exc_info=True)
        return None, f"Error: Failed to parse Gemini response ({e})."
    except Exception as e:
        logger.error(f"[Gemini] Error during API call: {e}", exc_info=True)
        return None, f"Error: Failed communication with Gemini ({e})."
 
679
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
    """ Calls the OpenRouter API to generate a summary (fallback AI).

    Args:
        text: Source text (truncated to an internal character limit).
        summary_type: "paragraph" or "points".

    Returns:
        Tuple (summary_text, error_message); exactly one element is non-None.
    """
    global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
    if not _openrouter_fallback_enabled:
        logger.error("[OR] Disabled.")
        return None, "Error: Fallback AI unavailable."

    # Truncate overly long input.
    # BUGFIX: the log message previously hard-coded "72,416" although the
    # actual limit is max_len; report the real limit.
    max_len = 100000
    if len(text) > max_len:
        logger.warning(f"[OR] Truncating input ({len(text)} > {max_len})")
        text = text[:max_len]
    logger.info(f"[OR] Generating {summary_type} summary ({OPENROUTER_MODEL}). Input len: {len(text)}")

    if summary_type == "paragraph":
        prompt_content = f"Please summarise the following text into a concise paragraph...\n\nText:\n---\n{text}\n---\n\nSummary:"
    elif summary_type == "points":
        prompt_content = f"Please summarise the following text into key bullet points...\n\nText:\n---\n{text}\n---\n\nSummary:"
    else:
        logger.error(f"[OR] Invalid type: {summary_type}")
        return None, f"Error: Invalid summary type '{summary_type}'."

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/fmab777/telegram-summary-bot",
        "X-Title": "Telegram Summary Bot",
    }
    payload = {
        "model": OPENROUTER_MODEL,
        "messages": [
            {"role": "system", "content": "You are an expert summarizer. Provide summaries as requested."},
            {"role": "user", "content": prompt_content}
        ],
        "max_tokens": 2048,
        "temperature": 0.7,
    }
    api_url = "https://openrouter.ai/api/v1/chat/completions"

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            logger.debug(f"[OR] Sending request to {api_url}...")
            response = await client.post(api_url, headers=headers, json=payload)
            logger.debug(f"[OR] Received status code {response.status_code}")

            if response.status_code == 200:
                try:
                    data = response.json()
                    if data.get("choices") and len(data["choices"]) > 0:
                        choice = data["choices"][0]
                        message = choice.get("message")
                        finish_reason = choice.get("finish_reason", "N/A")
                        if message and message.get("content"):
                            summary_text = message["content"].strip()
                            if summary_text:
                                logger.info(f"[OR] Summary generated successfully (len: {len(summary_text)}). Finish: {finish_reason}")
                                if finish_reason == 'length':
                                    logger.warning("[OR] Summary may be truncated (max_tokens).")
                                return summary_text, None
                            logger.warning(f"[OR] OpenRouter returned empty summary content. Data: {data}")
                            return None, "Error: Fallback AI generated empty summary."
                        logger.error(f"[OR] Invalid response structure (missing message/content). Data: {data}")
                        return None, "Error: Fallback AI returned invalid response format."
                    logger.error(f"[OR] Invalid response structure (missing choices). Data: {data}")
                    api_error = data.get("error", {}).get("message", "Unknown API error")
                    return None, f"Error: Fallback AI response missing summary. API msg: {api_error}"
                except json.JSONDecodeError:
                    logger.error(f"[OR] Failed to decode JSON response. Status: {response.status_code}, Text: {response.text[:500]}")
                    return None, "Error: Fallback AI sent invalid JSON response."
                except Exception as e:
                    logger.error(f"[OR] Error processing success response: {e}", exc_info=True)
                    return None, f"Error: Failed to process Fallback AI response ({e})."

            # Non-200: surface the API's own error message when available.
            error_message = f"Error: Fallback AI service ({OPENROUTER_MODEL}) returned status {response.status_code}."
            try:
                error_details = response.json().get("error", {}).get("message", response.text[:200])
                error_message += f" Details: {error_details}"
            except Exception:
                error_message += f" Response: {response.text[:200]}"
            logger.error(f"[OR] {error_message}")
            return None, error_message
    except httpx.TimeoutException:
        logger.error(f"[OR] Timeout connecting to OpenRouter API for {OPENROUTER_MODEL}")
        return None, "Error: Timed out connecting to fallback AI."
    except httpx.RequestError as e:
        logger.error(f"[OR] Request error connecting to OpenRouter API: {e}")
        return None, f"Error: Network error connecting to fallback AI ({e})."
    except Exception as e:
        logger.error(f"[OR] Unexpected error during OpenRouter API call: {e}", exc_info=True)
        return None, f"Error: Unexpected issue with fallback AI ({e})."
 
767
async def generate_summary(text: str, summary_type: str) -> str:
    """ Generates a summary using primary (Gemini) then fallback (OpenRouter)."""
    global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
    logger.info(f"[Summary] Starting process...")

    # Stage 1: primary AI (Gemini). Record why it was skipped/failed for the
    # final user-facing message.
    error_message: Optional[str] = None
    if not _gemini_primary_enabled:
        logger.warning("[Summary] Primary AI (Gemini) disabled. Falling back.")
        error_message = "Primary AI unavailable."
    else:
        logger.info(f"[Summary] Attempting primary AI: Gemini ({GEMINI_MODEL})")
        primary_summary, primary_error = await _call_gemini(text, summary_type)
        if primary_summary:
            logger.info("[Summary] Success with primary AI (Gemini).")
            return primary_summary
        logger.warning(f"[Summary] Primary AI (Gemini) failed: {primary_error}. Falling back.")
        error_message = f"Primary AI failed: {primary_error}"

    # Stage 2: fallback AI (OpenRouter), reached only when Gemini failed or
    # was disabled.
    if not _openrouter_fallback_enabled:
        logger.error("[Summary] Fallback AI (OpenRouter) is disabled.")
        if error_message:  # primary failed AND fallback disabled
            return f"{error_message}\nFallback unavailable."
        return "Error: Both primary and fallback AI services are unavailable."

    logger.info(f"[Summary] Attempting fallback AI: OpenRouter ({OPENROUTER_MODEL})")
    fallback_summary, fallback_error = await _call_openrouter(text, summary_type)
    if fallback_summary:
        logger.info("[Summary] Success with fallback AI (OpenRouter).")
        return fallback_summary

    logger.error(f"[Summary] Fallback AI (OpenRouter) also failed: {fallback_error}")
    # Combine both failures for the final message when possible.
    if error_message:
        return f"{error_message}\nFallback failed: {fallback_error}"
    return f"Fallback AI failed: {fallback_error}"
 
814
  # --- Main Processing Task ---
815
async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
    """Background task: fetch content for `url`, summarise it, and deliver the result.

    Runs outside the webhook request cycle with its own Bot instance so the
    HTTP handler can return immediately. Edits the original status message
    where possible, otherwise sends new messages. Long summaries are split
    into chunks of at most MAX_SUMMARY_CHUNK_SIZE characters.

    Args:
        user_id: Telegram user who requested the summary.
        chat_id: Chat to deliver results to.
        message_id_to_edit: Status message to edit in place, if any.
        url: YouTube or website URL to summarise.
        summary_type: "paragraph" or "points".
        bot_token: Telegram bot token for the dedicated background Bot.
    """
    task_id = f"{user_id}-{message_id_to_edit or 'new'}"
    logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
    bot: Optional[Bot] = None
    content: Optional[str] = None
    feedback: Optional[str] = None  # Final user message (error text or summary part 1)
    success = False
    original_msg_id = message_id_to_edit

    try:
        # Initialize a dedicated bot client for this background task.
        bg_request = HTTPXRequest(connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0)
        bot = Bot(token=bot_token, request=bg_request)
    except Exception as e:
        logger.critical(f"[Task {task_id}] Failed background bot init: {e}", exc_info=True)
        return  # Cannot proceed without a bot

    async def _send_typing() -> None:
        # Best-effort "typing..." indicator; failures never abort the task.
        # BUGFIX: the original inlined `try: ...; except Exception: pass` on a
        # single line in several places, which is a SyntaxError.
        try:
            await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
        except Exception:
            pass

    try:
        # Edit original message to "Processing...".
        proc_text = f"Generating '{summary_type}' summary...\nThis might take a moment..."
        if original_msg_id:
            try:
                await retry_bot_operation(
                    bot.edit_message_text,
                    chat_id=chat_id,
                    message_id=original_msg_id,
                    text=proc_text,
                    parse_mode=ParseMode.HTML,  # Keep HTML for potential future formatting
                    reply_markup=None,
                    link_preview_options={'is_disabled': True}
                )
                logger.debug(f"[Task {task_id}] Edited original msg {original_msg_id} to 'Processing'")
            except Exception as e:
                logger.warning(f"[Task {task_id}] Failed edit original msg {original_msg_id}: {e}.")
                # Continue anyway; the result will be sent as a new message later.

        await _send_typing()

        # --- Get Content ---
        is_yt = is_youtube_url(url)
        logger.debug(f"[Task {task_id}] URL Type: {'YT' if is_yt else 'Web'}")
        if is_yt:
            vid = extract_youtube_id(url)
            if vid:
                content = await get_youtube_transcript(vid, url)
            else:
                feedback = "Invalid YouTube URL format."
            if not content and not feedback:
                feedback = "Could not get YouTube transcript (unavailable/no captions?)."
        else:
            # Website: try Crawl4AI, then BS4, then the urltotext API.
            logger.info(f"[Task {task_id}] Trying Crawl4AI...")
            content = await get_website_content_via_crawl4ai(url)
            if not content:
                logger.warning(f"[Task {task_id}] Crawl4AI failed. Trying BS4...")
                await _send_typing()
                content = await get_website_content_bs4(url)
                if not content:
                    logger.warning(f"[Task {task_id}] BS4 failed. Trying API...")
                    global URLTOTEXT_API_KEY, _urltotext_fallback_enabled
                    if _urltotext_fallback_enabled:
                        await _send_typing()
                        content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
                        if not content:
                            feedback = "Fetch failed (Crawl4AI/BS4/API error or credits)."
                    else:
                        feedback = "Fetch failed (Crawl4AI/BS4 error, API disabled)."
            # Final check if web content fetch failed.
            if not content and not feedback:
                feedback = "Could not fetch website content using any method."

        # --- Generate Summary ---
        if content and not feedback:
            logger.info(f"[Task {task_id}] Content fetched (len:{len(content)}). Generating summary...")
            await _send_typing()
            final_summary = await generate_summary(content, summary_type)

            if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
                feedback = final_summary  # Surface the AI error to the user
                logger.warning(f"[Task {task_id}] Summary generation failed: {feedback}")
                success = False
            else:
                # Success - split into Telegram-sized parts.
                summary_parts = []
                current_part = ""
                for line in final_summary.splitlines(keepends=True):
                    if len(current_part) + len(line) > MAX_SUMMARY_CHUNK_SIZE:
                        if current_part.strip():
                            summary_parts.append(current_part.strip())
                        # A single oversized line is hard-truncated to the chunk size.
                        current_part = line[:MAX_SUMMARY_CHUNK_SIZE] if len(line) > MAX_SUMMARY_CHUNK_SIZE else line
                    else:
                        current_part += line
                if current_part.strip():
                    summary_parts.append(current_part.strip())
                if not summary_parts:
                    # BUGFIX: was `parts.append(...)` (NameError on undefined
                    # `parts`) followed by a malformed `logger.warning(...)`.
                    summary_parts.append("Summary generated, but empty.")
                    logger.warning(f"[Task {task_id}] Summary splitting produced no parts; using placeholder.")

                logger.info(f"[Task {task_id}] Summary OK (len: {len(final_summary)}). Sending {len(summary_parts)} part(s).")
                feedback = summary_parts[0]  # First part becomes the feedback message
                success = True  # Assume success initially for sending parts

                # Send remaining parts, if any.
                if len(summary_parts) > 1:
                    for i, part in enumerate(summary_parts[1:], 2):
                        await asyncio.sleep(0.5)
                        try:
                            await retry_bot_operation(
                                bot.send_message,
                                chat_id=chat_id, text=part, parse_mode=None,
                                link_preview_options={'is_disabled': True}
                            )
                            logger.debug(f"[Task {task_id}] Sent part {i}/{len(summary_parts)}.")
                        except Exception as part_err:
                            # A later part failed: report it and mark overall failure.
                            feedback = f"Sent part 1, but failed to send part {i}. Error: {part_err}"
                            success = False
                            logger.error(f"[Task {task_id}] {feedback}")
                            break  # Stop sending

        # --- Send Final Feedback/Result ---
        if feedback:  # Either an error or the first summary part
            final_text = feedback
            logger.info(f"[Task {task_id}] Sending final message (Success: {success}).")
            try:
                # Try to edit the original message first.
                edited = False
                if original_msg_id:
                    try:
                        await retry_bot_operation(
                            bot.edit_message_text,
                            chat_id=chat_id, message_id=original_msg_id, text=final_text,
                            parse_mode=None,  # Plain text for safety with AI output
                            reply_markup=None, link_preview_options={'is_disabled': True}
                        )
                        logger.debug(f"[Task {task_id}] Edited original msg {original_msg_id} with final content.")
                        edited = True
                    except Exception as edit_err:
                        logger.warning(f"[Task {task_id}] Failed final edit original msg {original_msg_id}: {edit_err}. Sending new.")

                # If editing failed or wasn't applicable, send as a new message.
                if not edited:
                    await retry_bot_operation(
                        bot.send_message,
                        chat_id=chat_id, text=final_text, parse_mode=None,
                        link_preview_options={'is_disabled': True}
                    )
                    logger.debug(f"[Task {task_id}] Sent final content as new message.")
            except Exception as send_err:
                logger.error(f"[Task {task_id}] CRITICAL: Failed to send final feedback/result message: {send_err}")
                success = False  # Could not even deliver the result/error
        else:
            # Should not happen if the logic above is correct.
            logger.error(f"[Task {task_id}] No feedback message set at end of task (Success: {success}).")
            if not success:
                try:
                    await retry_bot_operation(bot.send_message, chat_id, text="An unknown error occurred processing the request.")
                except Exception:
                    pass

    except Exception as e:  # Catch-all for unexpected task errors
        logger.error(f"[Task {task_id}] Unexpected error during processing task: {e}", exc_info=True)
        success = False
        feedback = "Oops! An unexpected error occurred. Please try again later."
        # Try to deliver this crash feedback.
        if bot:
            try:
                edited_crash = False
                if original_msg_id:
                    try:
                        # BUGFIX: chat_id/message_id must be keyword arguments;
                        # edit_message_text's first positional parameter is
                        # `text` in PTB v20+, so the old positional call raised
                        # a TypeError ("multiple values for 'text'").
                        await retry_bot_operation(
                            bot.edit_message_text,
                            chat_id=chat_id, message_id=original_msg_id, text=feedback,
                            reply_markup=None, link_preview_options={'is_disabled': True}
                        )
                        edited_crash = True
                    except Exception:
                        pass
                if not edited_crash:
                    await retry_bot_operation(bot.send_message, chat_id, text=feedback)
            except Exception as final_err:
                logger.error(f"[Task {task_id}] Failed to send final CRASH error feedback: {final_err}")

    finally:  # Cleanup
        # Close the background bot's HTTP client.
        if bot and bot.request and hasattr(bot.request, '_client') and bot.request._client:
            try:
                await bot.request._client.aclose()
                logger.debug(f"[Task {task_id}] BG client closed.")
            except Exception as e:
                logger.warning(f"[Task {task_id}] Error closing BG client: {e}")
        logger.info(f"[Task {task_id}] Task completed. Overall Success: {success}")
+
993
 
994
  # --- Telegram Handlers ---
995
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
 
1014
  if not update.message or not update.message.text: return
1015
  url = update.message.text.strip(); user = update.effective_user;
1016
  if not user: return
1017
+ if not re.match(r'https?://[^\s/$.?#].[^\s]*', url, re.I):
1018
+ logger.debug(f"Ignoring non-URL from {user.id}: {url}")
1019
+ await update.message.reply_text("Invalid URL format. Use http(s)://..."); return
1020
  logger.info(f"User {user.id} sent URL: {url}")
1021
+ context.user_data['url_to_summarize'] = url
1022
+ context.user_data['original_message_id'] = update.message.message_id
1023
+ keys = [[ InlineKeyboardButton("Paragraph", callback_data="paragraph"), InlineKeyboardButton("Points", callback_data="points") ]]
1024
  markup = InlineKeyboardMarkup(keys)
1025
  await update.message.reply_html( f"Link:\n<code>{html.escape(url)}</code>\n\nSummary type?", reply_markup=markup, disable_web_page_preview=True )
1026
 
 
1030
  user = query.from_user; summary_type = query.data; qid = query.id
1031
  try: await query.answer(); logger.debug(f"Ack cb {qid} from {user.id}")
1032
  except Exception as e: logger.warning(f"Err answering cb {qid}: {e}")
1033
+
1034
  url = context.user_data.get('url_to_summarize'); msg_id = query.message.message_id
1035
  logger.info(f"User {user.id} chose '{summary_type}' for msg {msg_id}. URL context: {'Yes' if url else 'No'}")
1036
+ if not url:
1037
+ logger.warning(f"No URL context user {user.id} (cb {qid}).");
1038
+ try: await query.edit_message_text("Context lost. Please send URL again.", reply_markup=None)
1039
+ except Exception as e: logger.error(f"Failed edit 'URL not found': {e}");
1040
+ return
1041
 
1042
  global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
1043
+ # Check configs
1044
  if not TELEGRAM_TOKEN:
1045
  logger.critical("TG TOKEN missing!")
1046
  try: await query.edit_message_text("❌ Bot config error (Token).", reply_markup=None)
 
1054
  if not _gemini_primary_enabled: logger.warning("Primary AI unavailable, using fallback.")
1055
  elif not _openrouter_fallback_enabled: logger.warning("Fallback AI unavailable, using primary.")
1056
 
1057
+ # Schedule task
1058
  logger.info(f"Scheduling task user {user.id}...")
1059
  asyncio.ensure_future( process_summary_task( user_id=user.id, chat_id=query.message.chat_id, message_id_to_edit=msg_id, url=url, summary_type=summary_type, bot_token=TELEGRAM_TOKEN ) )
1060
+ # Clear context after scheduling
1061
  context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None)
1062
  logger.debug(f"Cleared context user {user.id} post-schedule.")
1063
 
1064
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Logs unhandled update errors, silently dropping known/transient Telegram issues."""
    err = context.error
    transient_types = (TimedOut, NetworkError, RetryAfter)
    if isinstance(err, (AttributeError, BadRequest) + transient_types):
        benign_fragments = (
            "message is not modified",
            "query is too old",
            "message to edit not found",
            "chat not found",
            "bot was blocked by the user",
        )
        lowered = str(err).lower()
        if isinstance(err, transient_types) or any(frag in lowered for frag in benign_fragments):
            logger.warning(f"Ignoring known/transient error: {err}")
            return
    logger.error("Exception handling update:", exc_info=err)
 
1074
  # --- Application Setup ---
 
1075
async def setup_bot_config() -> Application:
    """Builds the PTB Application and registers all command/message/callback handlers."""
    global TELEGRAM_TOKEN
    logger.info("Configuring Telegram App...")
    if not TELEGRAM_TOKEN:
        raise ValueError("TG TOKEN missing.")
    request = HTTPXRequest(connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0)
    application = Application.builder().token(TELEGRAM_TOKEN).request(request).build()
    # Messages that contain a URL (entity or plain-text regex) but no command.
    url_filter = filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link") | filters.Regex(r'https?://[^\s]+'))
    for handler in (
        CommandHandler("start", start),
        CommandHandler("help", help_command),
        MessageHandler(url_filter, handle_potential_url),
        CallbackQueryHandler(handle_summary_type_callback),
    ):
        application.add_handler(handler)
    application.add_error_handler(error_handler)
    logger.info("TG handlers configured.")
    return application
 
1088
+ # --- ASGI Lifespan & Routes ---
1089
@contextlib.asynccontextmanager
async def lifespan(app: Starlette):
    """Starlette lifespan: initialise PTB and (re)register the webhook on
    startup; stop and shut down PTB cleanly on shutdown.

    Raises:
        RuntimeError: On missing config or any webhook setup failure, so the
            server refuses to start in a broken state.
    """
    global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN
    logger.info("Lifespan: Startup...")
    if not TELEGRAM_TOKEN:
        raise RuntimeError("Telegram token missing.")
    try:
        ptb_app = await setup_bot_config()
        await ptb_app.initialize()
        bot_info = await ptb_app.bot.get_me()
        logger.info(f"Bot init: @{bot_info.username} ({bot_info.id})")

        # Webhook setup: delete any stale webhook before registering ours.
        current_info = await ptb_app.bot.get_webhook_info()
        deleted_ok = True
        if current_info and current_info.url:
            logger.info(f"Deleting existing webhook: {current_info.url}...")
            try:
                deleted_ok = await ptb_app.bot.delete_webhook(drop_pending_updates=True)
                logger.info("WH deleted." if deleted_ok else "WH delete failed.")
            except Exception as e:
                logger.warning(f"WH delete error: {e}")
                deleted_ok = False
                await asyncio.sleep(1)

        host = os.environ.get("SPACE_HOST")
        path = "/webhook"
        if not host:
            raise RuntimeError("SPACE_HOST missing.")
        wh_url = f"https://{host.split('://')[-1].rstrip('/')}{path}"

        if wh_url and deleted_ok:
            logger.info(f"Setting webhook: {wh_url}")
            args = {"url": wh_url, "allowed_updates": Update.ALL_TYPES, "drop_pending_updates": True}
            if WEBHOOK_SECRET:
                args["secret_token"] = WEBHOOK_SECRET
                logger.info("Using webhook secret.")
            await asyncio.sleep(1.0)  # Give Telegram a moment after deletion
            try:
                if not await ptb_app.bot.set_webhook(**args):
                    raise RuntimeError("set_webhook returned False.")
                await asyncio.sleep(1.5)
                info = await ptb_app.bot.get_webhook_info()
                if not (info and info.url == wh_url):
                    raise RuntimeError(f"WH verify fail! Expected '{wh_url}', Got: {info}")
                # BUGFIX: this log line and the last-error check were joined as
                # `logger.info(...); if ...:` on one line, a SyntaxError.
                logger.info(f"WH set & verified: URL='{info.url}', Secret={'YES' if WEBHOOK_SECRET else 'NO'}")
                if info.last_error_message:
                    logger.warning(f"WH status error: {info.last_error_message}")
                await ptb_app.start()
                logger.info("PTB started (webhook).")
            except Exception as e:
                logger.error(f"FATAL: WH setup error: {e}", exc_info=True)
                raise RuntimeError(f"WH setup fail: {e}") from e
        elif not deleted_ok:
            raise RuntimeError("Failed to delete previous webhook.")

        logger.info("Lifespan: Startup complete.")
        yield
    except Exception as startup_err:
        logger.critical(f"Startup failed: {startup_err}", exc_info=True)
        raise  # Re-raise to stop the server
    finally:  # Shutdown
        logger.info("Lifespan: Shutdown...")
        if ptb_app:
            try:
                if ptb_app.running:
                    await ptb_app.stop()
                    logger.info("PTB stopped.")
                if ptb_app._initialized:
                    await ptb_app.shutdown()
                    logger.info("PTB shutdown.")
            except Exception as e:
                logger.error(f"Error during PTB shutdown: {e}", exc_info=True)
        logger.info("Lifespan: Shutdown complete.")
 
1132
async def health_check(request: Request) -> PlainTextResponse:
    """Return a plain-text status report for the bot and each configured service tier."""
    global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled, _crawl4ai_primary_web_enabled, _urltotext_fallback_enabled, SUPADATA_API_KEY
    bot_status = "Not Initialized"
    bot_username = "N/A"
    if ptb_app and ptb_app.bot and ptb_app._initialized:
        try:
            webhook_info = await ptb_app.bot.get_webhook_info()
            if ptb_app.running and webhook_info and webhook_info.url:
                me = await ptb_app.bot.get_me()
                bot_username = f"@{me.username}"
                bot_status = f"Running (Webhook OK, {bot_username})"
            elif ptb_app.running:
                url_part = webhook_info.url if webhook_info else 'N/A'
                err_part = webhook_info.last_error_message if webhook_info else 'N/A'
                bot_status = f"Running (Webhook Status: {url_part}, Last Error: {err_part})"
            else:
                bot_status = "Initialized/Not running"
        except Exception as e:
            logger.error(f"Health check status error: {e}", exc_info=True)
            bot_status = f"Error checking status: {e}"
    elif ptb_app:
        bot_status = "Initializing..."

    # Assemble one line per service, flagging disabled tiers explicitly.
    report_lines = ["=== Bot Status ===", f"Application: {bot_status}", "--- Services ---"]
    report_lines.append(f"Web Scraper 1 (Primary): {'Crawl4AI (ignore_robots)' if _crawl4ai_primary_web_enabled else 'DISABLED'}")
    report_lines.append("Web Scraper 2 (Fallback): BeautifulSoup")
    report_lines.append(f"Web Scraper 3 (Fallback): {'urltotext.com API' if _urltotext_fallback_enabled else 'DISABLED'}")
    report_lines.append(f"Summarizer 1 (Primary): {'Gemini (' + GEMINI_MODEL + ')' if _gemini_primary_enabled else 'DISABLED'}")
    report_lines.append(f"Summarizer 2 (Fallback): {'OpenRouter (' + OPENROUTER_MODEL + ')' if _openrouter_fallback_enabled else 'DISABLED'}")
    report_lines.append("YT Transcript 1 (Primary): youtube-transcript-api")
    report_lines.append(f"YT Transcript 2 (Fallback): {'Supadata API' if SUPADATA_API_KEY else 'DISABLED'}")
    report_lines.append(f"YT Transcript 3 (Fallback): {'Apify (' + APIFY_ACTOR_ID + ')' if _apify_token_exists else 'DISABLED'}")
    return PlainTextResponse("\n".join(report_lines))
1160
 
1161
async def telegram_webhook(request: Request) -> Response:
    """Receive a Telegram update via webhook POST, validate it, and hand it to PTB."""
    global WEBHOOK_SECRET, ptb_app

    # Guard: reject requests until the PTB application is fully up.
    if not ptb_app or not ptb_app._initialized or not ptb_app.running:
        if not ptb_app:
            status = "Not Initialized"
        elif not ptb_app._initialized:
            status = "Initializing"
        else:
            status = "Not Running"
        logger.error(f"Webhook received but PTB application {status}.")
        return PlainTextResponse(f'Bot {status}', status_code=503)

    # Guard: enforce the shared secret header when one is configured.
    if WEBHOOK_SECRET and request.headers.get("X-Telegram-Bot-Api-Secret-Token") != WEBHOOK_SECRET:
        logger.warning("Webhook received with invalid secret token.")
        return Response(content="Invalid secret token", status_code=403)

    # Parse and dispatch the update.
    try:
        update = Update.de_json(data=await request.json(), bot=ptb_app.bot)
        logger.debug(f"Processing update_id: {update.update_id} via webhook")
        await ptb_app.process_update(update)
        return Response(status_code=200)  # OK to Telegram
    except json.JSONDecodeError:
        logger.error("Webhook received invalid JSON.")
        return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
    except Exception as e:
        # Acknowledge with 200 anyway so Telegram does not retry a processing error.
        logger.error(f"Error processing webhook update: {e}", exc_info=True)
        return Response(status_code=200)
1189
 
1190
  # --- ASGI App Definition ---
1191
app = Starlette(
    debug=False,
    lifespan=lifespan,
    # Health probe on the root path; Telegram posts updates to /webhook.
    routes=[
        Route("/", endpoint=health_check, methods=["GET"]),
        Route("/webhook", endpoint=telegram_webhook, methods=["POST"]),
    ],
)
logger.info("Starlette ASGI application created.")
1200
 
1201
  # --- Development Runner ---
1202
if __name__ == '__main__':
    # Development runner: local testing only; production serves via Gunicorn/Uvicorn.
    import uvicorn
    logger.warning("Running DEV mode - FOR LOCAL TESTING ONLY")
    log_level = os.environ.get("LOGGING_LEVEL", "info").lower()
    port = int(os.environ.get('PORT', 8080))
    try:
        from dotenv import load_dotenv
    except ImportError:
        logger.info(".env file not found or python-dotenv not installed.")
    else:
        load_dotenv()
        logger.info(".env file loaded for local development.")
    # Check required secrets for local dev and complain loudly when missing.
    if not get_secret('TELEGRAM_TOKEN'):
        logger.critical("Local Dev: TELEGRAM_TOKEN missing.")
    if not get_secret('GEMINI_API_KEY'):
        logger.error("Local Dev: GEMINI_API_KEY missing.")
    uvicorn.run("main:app", host='0.0.0.0', port=port, log_level=log_level, reload=True)