fmab777 committed on
Commit b1bdfa0 (verified)
1 Parent(s): c261e5f

Update main.py

Files changed (1)
  1. main.py +109 -162
main.py CHANGED
@@ -1,4 +1,4 @@
1
- # main.py (Corrected SyntaxError at lines 1097 & 1101 - Now with Crawl4AI as Primary Scraper)
2
  import os
3
  import re
4
  import logging
@@ -90,6 +90,18 @@ if not _crawl4ai_available: logger.warning("crawl4ai library not found. Primary
90
  # --- Global variable for PTB app ---
91
  ptb_app: Optional[Application] = None
92
 
93
  # --- Environment Variable Loading & Configuration ---
94
  logger.info("Attempting to load secrets and configuration...")
95
  def get_secret(secret_name):
@@ -219,20 +231,11 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
219
 
220
  sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
221
  params = {"token": api_token}
222
- # Input specific to karamelo~youtube-transcripts actor
223
- payload = {
224
- "urls": [video_url],
225
- "outputFormat": "singleStringText",
226
- "maxRetries": 5,
227
- "channelHandleBoolean": False,
228
- "channelNameBoolean": False,
229
- "datePublishedBoolean": False,
230
- "relativeDateTextBoolean": False,
231
- }
232
  headers = {"Content-Type": "application/json"}
233
 
234
  try:
235
- async with httpx.AsyncClient(timeout=120.0) as client: # Long timeout for potential YT processing
236
  logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}")
237
  response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
238
  logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}")
@@ -241,13 +244,11 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
241
  try:
242
  results = response.json()
243
  if isinstance(results, list) and len(results) > 0:
244
- item = results[0]
245
- content = None
246
- # Check common keys for transcript text
247
  if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
248
  elif "text" in item and isinstance(item["text"], str): content = item["text"]
249
  elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"]
250
- elif "captions" in item and isinstance(item["captions"], list): # Handle list format if needed
251
  if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
252
  elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])
253
 
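# Note (illustrative sketch, not from the commit itself): the hunk above posts the actor input to
# Apify's run-sync-get-dataset-items endpoint and then probes several possible keys for the
# transcript text. A minimal standalone version of that call could look like this; the actor id
# comes from the "karamelo~youtube-transcripts" comment in the removed lines, everything else is
# lifted from the surrounding code.
import httpx
from typing import Optional

async def fetch_apify_transcript(video_url: str, api_token: str,
                                 actor_id: str = "karamelo~youtube-transcripts") -> Optional[str]:
    endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
    payload = {"urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5}
    async with httpx.AsyncClient(timeout=120.0) as client:
        resp = await client.post(endpoint, params={"token": api_token}, json=payload)
        resp.raise_for_status()
        items = resp.json()
    if isinstance(items, list) and items:
        item = items[0]
        # The actor's output key varies between versions; check the same candidates the real code does.
        for key in ("captions", "text", "transcript"):
            if isinstance(item.get(key), str):
                return item[key]
    return None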
@@ -266,11 +267,10 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
266
  except Exception as e: logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True); return None
267
 
268
  async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
269
- global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists # Added _apify_token_exists global ref
270
  if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
271
  logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
272
  transcript_text = None
273
- # Method 1: youtube-transcript-api (Primary)
274
  logger.info("[Primary YT] Attempting youtube-transcript-api...")
275
  try:
276
  transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
@@ -281,7 +281,6 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
281
  except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
282
  except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None
283
 
284
- # Method 2: Supadata (Fallback 1)
285
  if transcript_text is None:
286
  logger.info("[Fallback YT 1] Trying Supadata API...")
287
  if SUPADATA_API_KEY:
@@ -290,16 +289,14 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
290
  else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
291
  else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
292
 
293
- # Method 3: Apify (Fallback 2 - Default YT Actor)
294
  if transcript_text is None:
295
  logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...")
296
- if _apify_token_exists: # Use the global flag
297
  transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
298
  if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text
299
  else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.")
300
  else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
301
 
302
- # Final Result
303
  if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
304
  return transcript_text
305
 
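# Note (illustrative, not from the commit itself): the primary path above runs
# YouTubeTranscriptApi.get_transcript in a thread; the lines that flatten its result are elided
# from this hunk. The library returns a list of {'text', 'start', 'duration'} dicts, so the
# flattening step presumably looks roughly like this sketch:
import asyncio
from typing import Optional
from youtube_transcript_api import YouTubeTranscriptApi

async def fetch_transcript_text(video_id: str) -> Optional[str]:
    entries = await asyncio.to_thread(
        YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US']
    )
    # Join the per-caption snippets into one string, skipping empty entries.
    text = " ".join(e.get('text', '').strip() for e in entries if e.get('text'))
    return text or None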
@@ -308,7 +305,7 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
308
  # --- NEW Primary Method: Crawl4AI ---
309
  async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
310
  """Primary Web Method: Fetches and extracts content using Crawl4AI."""
311
- global _crawl4ai_primary_scrape_enabled
312
  if not _crawl4ai_primary_scrape_enabled:
313
  logger.warning("[Crawl4AI Primary] Called but library is unavailable.")
314
  return None
@@ -316,52 +313,45 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
316
  logger.info(f"[Crawl4AI Primary] Attempting fetch and extraction for: {url}")
317
 
318
  run_config = CrawlerRunConfig(
319
- cache_mode=CacheMode.BYPASS,
320
  page_timeout=60000, # 60 sec timeout
321
  verbose=False,
322
- # Rely on default markdown strategy for now, prioritize 'fit_markdown' if available
323
- # Consider adding 'wait_for' or 'scan_full_page=True' if needed for dynamic sites
324
  # Consider 'remove_overlay_elements=True' for cookie banners/popups
325
  )
326
- # Default BrowserConfig is headless chromium, which is usually fine
 
327
  # browser_config = BrowserConfig(headless=True, verbose=False)
328
 
329
  extracted_text: Optional[str] = None
330
  try:
331
- # Use context manager: async with AsyncWebCrawler(config=browser_config) as crawler:
332
- async with AsyncWebCrawler() as crawler:
 
333
  logger.debug(f"[Crawl4AI Primary] Calling crawler.arun for {url}")
334
  result: CrawlResult = await crawler.arun(url=url, config=run_config)
335
  logger.debug(f"[Crawl4AI Primary] arun completed. Success: {result.success}, Status: {result.status_code}")
336
 
337
  if result.success:
338
  if result.markdown and isinstance(result.markdown, MarkdownGenerationResult):
339
- # Prefer fit_markdown if it exists and has content
340
  if result.markdown.fit_markdown and isinstance(result.markdown.fit_markdown, str) and len(result.markdown.fit_markdown.strip()) > 30:
341
  extracted_text = result.markdown.fit_markdown.strip()
342
  logger.debug(f"[Crawl4AI Primary] Using fit_markdown for {url}")
343
- # Fallback to raw_markdown
344
  elif result.markdown.raw_markdown and isinstance(result.markdown.raw_markdown, str):
345
  extracted_text = result.markdown.raw_markdown.strip()
346
  logger.debug(f"[Crawl4AI Primary] Using raw_markdown (fit_markdown unavailable/short) for {url}")
347
- else:
348
- logger.warning(f"[Crawl4AI Primary] Markdown object present but no usable text content for {url}")
349
- # Legacy/Alternative checks (less likely with v0.5+)
350
  elif result.markdown and isinstance(result.markdown, str):
351
  extracted_text = result.markdown.strip()
352
  logger.debug(f"[Crawl4AI Primary] Using direct result.markdown string for {url}")
353
- elif result.cleaned_html: # Last resort: parse cleaned HTML
354
  logger.warning(f"[Crawl4AI Primary] No markdown found, parsing cleaned_html with BS4 for {url}")
355
  try:
356
  soup = BeautifulSoup(result.cleaned_html, DEFAULT_PARSER)
357
  extracted_text = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
358
- except Exception as bs_err:
359
- logger.error(f"[Crawl4AI Primary] Error parsing cleaned_html with BS4 for {url}: {bs_err}")
360
- extracted_text = None
361
- else:
362
- logger.warning(f"[Crawl4AI Primary] Crawl success but no markdown or cleaned_html found for {url}")
363
 
364
- # Final length check
365
  if extracted_text and len(extracted_text) > 30:
366
  logger.info(f"[Crawl4AI Primary] Success via Crawl4AI for {url}. Length: {len(extracted_text)}")
367
  return extracted_text
@@ -377,19 +367,17 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
377
  except asyncio.TimeoutError:
378
  logger.error(f"[Crawl4AI Primary] Timeout error during crawl for {url}")
379
  return None
380
- except ImportError as ie: # Catch if playwright drivers aren't installed
381
  if "playwright" in str(ie).lower():
382
  logger.critical(f"[Crawl4AI Primary] Playwright drivers likely missing! Run 'playwright install' in your environment. Error: {ie}")
383
- _crawl4ai_primary_scrape_enabled = False # Disable for future calls in this run
384
- else:
385
- logger.error(f"[Crawl4AI Primary] Unexpected ImportError during Crawl4AI execution for {url}: {ie}", exc_info=True)
386
  return None
387
  except Exception as e:
388
  logger.error(f"[Crawl4AI Primary] Unexpected error during Crawl4AI execution for {url}: {e}", exc_info=True)
389
- # Check if it's a playwright installation issue
390
  if "playwright" in str(e).lower() and ("install" in str(e).lower() or "executable" in str(e).lower()):
391
  logger.critical("[Crawl4AI Primary] Playwright drivers likely missing! Run 'playwright install' in your environment.")
392
- _crawl4ai_primary_scrape_enabled = False # Disable for future calls
393
  return None
394
 
395
 
@@ -402,19 +390,14 @@ async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[
402
  logger.debug(f"[Web Scrape Fallback 1] Sending GET request to {url}")
403
  response = await client.get(url)
404
  logger.debug(f"[Web Scrape Fallback 1] Received response {response.status_code} from {url}")
405
- response.raise_for_status() # Raise HTTPStatusError for 4xx/5xx
406
  content_type = response.headers.get('content-type', '').lower()
407
- if 'html' not in content_type and 'xml' not in content_type: # Allow xml just in case
408
  logger.warning(f"[Web Scrape Fallback 1] Non-HTML/XML content type received from {url}: {content_type}")
409
- if 'text/plain' in content_type:
410
- logger.info(f"[Web Scrape Fallback 1] Content type is text/plain for {url}, attempting to read.")
411
- return response.text # Return plain text directly
412
- return None # Skip other non-html types
413
- try:
414
- return response.text # Attempt to decode text, handle potential errors
415
- except Exception as e:
416
- logger.error(f"[Web Scrape Fallback 1] Error decoding response text for {url}: {e}")
417
  return None
 
 
418
  except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Fallback 1] HTTP error {e.response.status_code} fetching {url}: {e}")
419
  except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 1] Timeout error fetching {url}")
420
  except httpx.TooManyRedirects: logger.error(f"[Web Scrape Fallback 1] Too many redirects fetching {url}")
@@ -427,51 +410,28 @@ async def get_website_content_direct_bs4(url: str) -> Optional[str]:
427
  if not url: logger.error("[Web Scrape Fallback 1] No URL provided"); return None
428
  logger.info(f"[Web Scrape Fallback 1] Attempting direct fetch and parse for: {url}")
429
  html_content = await fetch_url_content_for_scrape(url)
430
- if not html_content:
431
- logger.warning(f"[Web Scrape Fallback 1] Direct fetch failed for {url}.")
432
- return None
433
  try:
434
- # --- Parsing logic (run in thread to avoid blocking) ---
435
  def parse_html(content: str) -> Optional[str]:
436
  try:
437
  soup = BeautifulSoup(content, DEFAULT_PARSER)
438
- for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]):
439
- element.extract()
440
- main_content = soup.find('main') or \
441
- soup.find('article') or \
442
- soup.find(role='main') or \
443
- soup.find(id=re.compile(r'content|main|body', re.I)) or \
444
- soup.find(class_=re.compile(r'content|main|body|article|post', re.I))
445
  target_element = main_content if main_content else soup.body
446
  if not target_element:
447
- logger.warning(f"[Web Scrape Fallback 1 Parse] Could not find body or main content container for {url}")
448
  text_from_root = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
449
- if text_from_root and len(text_from_root) > 50:
450
- logger.warning(f"[Web Scrape Fallback 1 Parse] Using text from root as fallback for {url}.")
451
- return text_from_root
452
  return None
453
  lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
454
  text = " ".join(lines)
455
- if not text or len(text) < 50: # Adjust threshold as needed
456
- logger.warning(f"[Web Scrape Fallback 1 Parse] Extracted text seems too short or empty after cleaning for {url}. Length: {len(text)}")
457
- return None
458
  return text
459
- except Exception as parse_e:
460
- logger.error(f"[Web Scrape Fallback 1 Parse] Error during BeautifulSoup parsing for {url}: {parse_e}", exc_info=False) # Keep log cleaner
461
- return None
462
- # --- End parsing logic ---
463
-
464
  text_content = await asyncio.to_thread(parse_html, html_content)
465
-
466
- if text_content:
467
- logger.info(f"[Web Scrape Fallback 1] Success via direct fetch & parse for {url} (final len: {len(text_content)})")
468
- return text_content
469
- else:
470
- logger.warning(f"[Web Scrape Fallback 1] Parsing failed or yielded no content for {url}.")
471
- return None
472
- except Exception as e:
473
- logger.error(f"[Web Scrape Fallback 1] Unexpected error during parsing process for {url}: {e}", exc_info=True)
474
- return None
475
 
476
  # --- Fallback 2: urltotext.com API ---
477
  async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
@@ -510,7 +470,7 @@ async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Opti
510
  if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
511
  logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using Scraper's Proxy Parser API")
512
  api_host = "scrapers-proxy2.p.rapidapi.com"
513
- encoded_url = urllib.parse.quote(url, safe='') # URL Encode the target URL
514
  api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true"
515
  headers = { "x-rapidapi-host": api_host, "x-rapidapi-key": api_key, "accept-encoding": "gzip" }
516
  try:
@@ -524,16 +484,12 @@ async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Opti
524
  content = data.get("content"); title = data.get("title"); extracted_text = ""
525
  if title and isinstance(title, str): extracted_text += title.strip() + ". "
526
  if content and isinstance(content, str): extracted_text += content.strip()
527
- if extracted_text and len(extracted_text) > 30:
528
- logger.info(f"[Web Scrape Fallback 3] Success via Scraper's Proxy Parser API for {url}. Len: {len(extracted_text)}")
529
- return extracted_text
530
- else:
531
- logger.warning(f"[Web Scrape Fallback 3] Scraper's Proxy API success but content/title seems empty or too short for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}")
532
- return None
533
  except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 3] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
534
  except Exception as e: logger.error(f"[Web Scrape Fallback 3] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True); return None
535
  elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 3] Auth error (401) with {api_host}. Check RapidAPI key."); return None
536
- elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check API subscription/limits."); return None
537
  elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 3] Rate Limit (429) from {api_host}."); return None
538
  elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 3] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
539
  else: logger.error(f"[Web Scrape Fallback 3] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
@@ -568,11 +524,11 @@ async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Opti
568
  return None
569
  except json.JSONDecodeError:
570
  raw_text = response.text
571
- if raw_text and len(raw_text) > 30: logger.warning(f"[Web Scrape Fallback 4] Failed JSON decode for AI Web Scraper, but found raw text content. Status:{response.status_code}. Using raw text. Len: {len(raw_text)}"); return raw_text.strip()
572
  else: logger.error(f"[Web Scrape Fallback 4] Failed JSON decode AI Web Scraper API for {url}. Status:{response.status_code}. Resp:{raw_text[:500]}"); return None
573
  except Exception as e: logger.error(f"[Web Scrape Fallback 4] Error processing AI Web Scraper API success response for {url}: {e}", exc_info=True); return None
574
  elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 4] Auth error (401) with {api_host}. Check RapidAPI key."); return None
575
- elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 4] Forbidden (403) from {api_host}. Check API subscription/limits."); return None
576
  elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 4] Rate Limit (429) from {api_host}."); return None
577
  elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 4] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
578
  else: logger.error(f"[Web Scrape Fallback 4] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
@@ -608,7 +564,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
608
  elif "content" in item and isinstance(item["content"], str): content = item["content"]
609
  elif "markdown" in item and isinstance(item["markdown"], str): content = item["markdown"]
610
  elif "html" in item and isinstance(item["html"], str):
611
- logger.warning(f"[{actor_name} - FB{fallback_num}] No 'text' or 'markdown' found, attempting to parse 'html'.")
612
  soup = BeautifulSoup(item["html"], DEFAULT_PARSER)
613
  content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
614
 
@@ -685,7 +641,7 @@ async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Opt
685
  "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
686
  "Here is the text to summarise:")
687
 
688
- MAX_INPUT_LENGTH_GEMINI = 900000 # Check model docs for actual limit
689
  if len(text) > MAX_INPUT_LENGTH_GEMINI:
690
  logger.warning(f"[Gemini Primary] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating.");
691
  text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)"
@@ -770,7 +726,7 @@ async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str],
770
  "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
771
  "Here is the text to summarise:")
772
 
773
- MAX_INPUT_LENGTH_OR = 100000 # Conservative limit
774
  if len(text) > MAX_INPUT_LENGTH_OR:
775
  logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}). Truncating.");
776
  text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
@@ -862,7 +818,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
862
 
863
  try:
864
  # --- 1. Initial User Feedback ---
865
- processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (using primary method... might take a moment)..." # Updated text
866
  if status_message_id:
867
  try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
868
  except Exception as e: logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new."); status_message_id = None
@@ -879,33 +835,30 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
879
  is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}")
880
 
881
  if is_youtube:
882
- # --- YouTube Transcript Logic (Unchanged) ---
883
  video_id = extract_youtube_id(url)
884
- if video_id: content = await get_youtube_transcript(video_id, url) # Tries lib -> Supadata -> Apify YT Actor
885
  else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
886
  if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
887
  else:
888
  # --- Website Scraping Logic (NEW Order: Crawl4AI -> Direct+BS4 -> APIs -> Apify) ---
889
- global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN
890
- global _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists
891
- global _crawl4ai_primary_scrape_enabled # Ensure global access
892
 
893
- # Method 0: Primary Scrape (Crawl4AI - NEW)
894
  logger.info(f"[Task {task_id}] Trying Web Scrape Method 0 (Crawl4AI)...")
895
  if _crawl4ai_primary_scrape_enabled:
896
  content = await get_website_content_via_crawl4ai(url)
897
  if not content: logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) failed.")
898
- else:
899
- logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) skipped - library unavailable.")
900
 
901
- # Method 1: Fallback 1 (Direct Fetch + BS4 - WAS Primary)
902
  if not content:
903
  logger.warning(f"[Task {task_id}] Method 0 failed. Trying Method 1 (Direct Fetch + BS4)...")
904
  await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
905
  content = await get_website_content_direct_bs4(url)
906
  if not content: logger.warning(f"[Task {task_id}] Method 1 (Direct Fetch + BS4) failed.")
907
 
908
- # Method 2: Fallback 2 (urltotext.com - WAS Fallback 1)
909
  if not content:
910
  logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 2 (urltotext.com)...")
911
  if _urltotext_key_exists:
@@ -914,7 +867,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
914
  if not content: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) failed.")
915
  else: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.")
916
 
917
- # Method 3: Fallback 3 (Scraper's Proxy via RapidAPI - WAS Fallback 2)
918
  if not content:
919
  logger.warning(f"[Task {task_id}] Method 2 failed. Trying Method 3 (Scraper's Proxy)...")
920
  if _rapidapi_key_exists:
@@ -923,7 +876,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
923
  if not content: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) failed.")
924
  else: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) RapidAPI key unavailable. Skipping.")
925
 
926
- # Method 4: Fallback 4 (AI Web Scraper via RapidAPI - WAS Fallback 3)
927
  if not content:
928
  logger.warning(f"[Task {task_id}] Method 3 failed. Trying Method 4 (AI Web Scraper)...")
929
  if _rapidapi_key_exists:
@@ -932,7 +885,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
932
  if not content: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) failed.")
933
  else: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) RapidAPI key unavailable. Skipping.")
934
 
935
- # Method 5: Fallback 5 (Apify Website Content Crawler - WAS Fallback 4)
936
  if not content:
937
  logger.warning(f"[Task {task_id}] Method 4 failed. Trying Method 5 (Apify Crawler)...")
938
  if _apify_token_exists:
@@ -941,7 +894,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
941
  if not content: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) failed.")
942
  else: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) APIFY_API_TOKEN unavailable. Skipping.")
943
 
944
- # Method 6: Fallback 6 (Apify Text Scraper Free - WAS Fallback 5)
945
  if not content:
946
  logger.warning(f"[Task {task_id}] Method 5 failed. Trying Method 6 (Apify Text Scraper)...")
947
  if _apify_token_exists:
@@ -950,14 +903,13 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
950
  if not content: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) failed.")
951
  else: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) APIFY_API_TOKEN unavailable. Skipping.")
952
 
953
- # Final check for website content after all methods
954
  if not content and not user_feedback_message:
955
- user_feedback_message = "Sorry, I couldn't fetch readable content from that website using multiple methods (blocked/dynamic content/empty?). Even the advanced crawler failed." # Updated message
956
 
957
  # --- 3. Summarization ---
958
  if content:
959
- logger.info(f"[Task {task_id}] Content fetched successfully (len:{len(content)}). Generating summary.")
960
- # Update status message before summarization
961
  try:
962
  status_update_msg_id = message_to_delete_later_id or status_message_id
963
  if status_update_msg_id:
@@ -965,24 +917,24 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
965
  except Exception as edit_e: logger.warning(f"[Task {task_id}] Failed to edit status message before summary: {edit_e}")
966
 
967
  await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
968
- final_summary = await generate_summary(content, summary_type) # Calls Gemini -> OpenRouter
969
 
970
  if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
971
- user_feedback_message = final_summary # Use the error message from summarizer
972
  logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
973
  else:
974
  max_length = 4096
975
  summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
976
  await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} )
977
  for part in summary_parts[1:]:
978
- await asyncio.sleep(0.5) # Small delay between parts
979
  await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} )
980
  success = True
981
  logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
982
- user_feedback_message = None # Clear any previous error message
983
 
984
  # --- 4. Handle Final Failure Feedback ---
985
- if user_feedback_message: # If any step failed and set a message
986
  logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}")
987
  await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} )
988
 
@@ -1009,7 +961,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
1009
  logger.info(f"[Task {task_id}] Task completed. Success: {success}")
1010
 
1011
 
1012
- # --- Telegram Handlers (Unchanged, except callback syntax fix) ---
1013
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
1014
  user = update.effective_user; mention = user.mention_html()
1015
  if not user or not update.message: return
@@ -1025,7 +977,7 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
1025
  "2. I'll ask how you want it summarised (paragraph or points).\n"
1026
  "3. Click the button for your choice.\n"
1027
  "4. Wait while I fetch the content and generate the summary!\n\n"
1028
- "⚙️ I try multiple methods to get content, especially for tricky websites or YouTube videos without standard transcripts.\n\n"
1029
  "**Commands:**\n"
1030
  "`/start` - Display the welcome message\n"
1031
  "`/help` - Show this help message" )
@@ -1036,9 +988,7 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
1036
  url = update.message.text.strip(); user = update.effective_user
1037
  if not user: return
1038
  url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE)
1039
- if not url_pattern.search(url):
1040
- logger.debug(f"Ignoring non-URL message from {user.id}: {url[:100]}")
1041
- return
1042
 
1043
  match = url_pattern.search(url)
1044
  if match:
@@ -1051,15 +1001,12 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
1051
  try:
1052
  await update.message.reply_text(
1053
  f"Okay, I see this link:\n`{extracted_url}`\n\nHow would you like it summarised?",
1054
- reply_markup=reply_markup,
1055
- disable_web_page_preview=True,
1056
- parse_mode=ParseMode.MARKDOWN
1057
- )
1058
  except BadRequest as e:
1059
  if "chat not found" in str(e).lower() or "bot was blocked by the user" in str(e).lower(): logger.warning(f"Could not reply to user {user.id} (chat not found or blocked).")
1060
  else: logger.error(f"BadRequest replying to URL message from {user.id}: {e}")
1061
  except Exception as e: logger.error(f"Error replying to URL message from {user.id}: {e}", exc_info=True)
1062
- else: logger.debug(f"Ignoring message from {user.id} that passed initial check but no URL found: {url[:100]}")
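# Note (illustrative, not from the commit itself): the handler above gates incoming messages on
# this regex before extracting the first match, roughly as follows:
import re
from typing import Optional

url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE)

def extract_first_url(message_text: str) -> Optional[str]:
    match = url_pattern.search(message_text)
    return match.group(0) if match else None  # None -> the handler ignores the message

# extract_first_url("summarise https://example.com/article please") -> "https://example.com/article"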
1063
 
1064
 
1065
  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
@@ -1077,13 +1024,11 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
1077
 
1078
  if not url:
1079
  logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Old button?")
1080
- try:
1081
- await query.edit_message_text(text="Sorry, I couldn't find the original URL for this request (it might be too old). Please send the link again.")
1082
  except BadRequest as e:
1083
  if "message is not modified" in str(e).lower() or "message to edit not found" in str(e).lower(): pass
1084
  else: logger.warning(f"Failed to edit 'URL not found' message {message_id_to_edit} for user {user.id}: {e}")
1085
- except Exception as e:
1086
- logger.warning(f"Error editing 'URL not found' message {message_id_to_edit} for user {user.id}: {e}")
1087
  return
1088
 
1089
  context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}")
@@ -1091,19 +1036,13 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
1091
  global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
1092
  if not TELEGRAM_TOKEN:
1093
  logger.critical("TELEGRAM_TOKEN missing in callback!")
1094
- # *** SYNTAX FIX HERE ***
1095
- try:
1096
- await query.edit_message_text(text="❌ Bot config error (Token Missing).")
1097
- except Exception:
1098
- pass # Ignore if editing fails
1099
  return
1100
  if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
1101
  logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid in callback!")
1102
- # *** SYNTAX FIX HERE ***
1103
- try:
1104
- await query.edit_message_text(text="❌ AI configuration error: No summarization models available.")
1105
- except Exception:
1106
- pass # Ignore if editing fails
1107
  return
1108
  elif not _gemini_primary_enabled: logger.warning("Primary AI (Gemini) unavailable, relying on fallback.")
1109
  elif not _openrouter_fallback_enabled: logger.warning("Fallback AI (OpenRouter) unavailable, relying on primary.")
@@ -1119,7 +1058,7 @@ async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> N
1119
  logger.error("Exception while handling an update:", exc_info=context.error)
1120
 
1121
 
1122
- # --- Application Setup & Web Framework (MODIFIED Health Check) ---
1123
 
1124
  async def setup_bot_config() -> Application:
1125
  logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
@@ -1168,14 +1107,14 @@ async def lifespan(app: Starlette):
1168
  await ptb_app.bot.set_webhook(**set_webhook_args)
1169
  webhook_info = await ptb_app.bot.get_webhook_info()
1170
  if webhook_info.url == full_webhook_url: logger.info(f"Webhook successfully set: URL='{webhook_info.url}', Secret={bool(WEBHOOK_SECRET)}")
1171
- else: logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got '{webhook_info.url}'.")
1172
  await ptb_app.start()
1173
  logger.info("PTB Application started in webhook mode.")
1174
  except Exception as e: logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True); raise RuntimeError(f"Failed to set webhook: {e}") from e
1175
  else: logger.critical("Could not construct webhook URL."); raise RuntimeError("Webhook URL could not be determined.")
1176
  else: logger.critical("SPACE_HOST env var not found."); raise RuntimeError("SPACE_HOST env var missing.")
1177
 
1178
- logger.info("ASGI Lifespan: Startup complete."); yield # --- Application runs here ---
1179
 
1180
  except Exception as startup_err:
1181
  logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
@@ -1200,7 +1139,6 @@ async def lifespan(app: Starlette):
1200
 
1201
  async def health_check(request: Request) -> PlainTextResponse:
1202
  """Simple health check endpoint."""
1203
- # ADDED _crawl4ai_primary_scrape_enabled
1204
  global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled
1205
  global _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY, _crawl4ai_primary_scrape_enabled
1206
  bot_status = "Not Initialized"; bot_username = "N/A"
@@ -1221,7 +1159,6 @@ async def health_check(request: Request) -> PlainTextResponse:
1221
  except Exception as e: bot_status = f"Error checking: {type(e).__name__}"; logger.warning(f"Health check: Error getting bot info: {e}")
1222
  else: bot_status = "Not Initialized"; bot_username = "N/A"
1223
 
1224
- # Updated health check output
1225
  return PlainTextResponse(
1226
  f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
1227
  f"--- Summarization ---\n"
@@ -1232,7 +1169,7 @@ async def health_check(request: Request) -> PlainTextResponse:
1232
  f"Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
1233
  f"Fallback 2 (Apify Actor): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
1234
  f"--- Website Scraping ---\n"
1235
- f"Primary (Crawl4AI): {'Enabled' if _crawl4ai_primary_scrape_enabled else 'DISABLED - Library/Driver Missing?'}\n" # Updated message
1236
  f"Fallback 1 (Direct+BS4): Enabled\n"
1237
  f"Fallback 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
1238
  f"Fallback 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"
@@ -1259,20 +1196,30 @@ async def telegram_webhook(request: Request) -> Response:
1259
  except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
1260
  except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # OK to prevent TG retries
1261
 
1262
- # --- Starlette App Definition (Unchanged) ---
1263
  app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )
1264
  logger.info("Starlette ASGI application created with health check and webhook routes.")
1265
 
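# Note (illustrative, not from the commit itself): since the health check is mounted at "/" by the
# Starlette app above, a deployment probe can simply GET it and inspect the plain-text status
# lines; the base URL below is a placeholder.
import httpx

def probe_health(base_url: str = "http://localhost:8000") -> str:
    resp = httpx.get(f"{base_url}/", timeout=10.0)
    resp.raise_for_status()
    # Returns lines like "TG Bot Summariser - Status: ..." plus the feature-flag summary.
    return resp.text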
1266
- # --- Development Server (Unchanged) ---
1267
  if __name__ == '__main__':
1268
  import uvicorn
1269
  logger.warning("Running in development mode using Uvicorn directly - NOT for production!")
1270
- # One-time check/reminder for Playwright install during local dev
1271
  try:
1272
  from playwright.async_api import async_playwright
1273
  logger.info("Playwright library found.")
1274
- # Could add a check here to see if browsers are actually installed,
1275
- # but 'playwright install' is the general fix.
1276
  except ImportError:
1277
  logger.critical("Playwright library not found. Crawl4AI will likely fail.")
1278
  logger.critical("RUN 'pip install playwright && playwright install --with-deps' in your terminal.")
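# Note (illustrative, not from the commit itself): the removed comment above mentions optionally
# checking that the Playwright *browsers* are installed, not just the library. A minimal startup
# probe could try to launch headless Chromium once and log the usual fix if that fails:
import asyncio
import logging
from playwright.async_api import async_playwright

logger = logging.getLogger(__name__)

async def playwright_browsers_available() -> bool:
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            await browser.close()
        return True
    except Exception as e:
        logger.critical(f"Playwright browsers not usable: {e}. Run 'playwright install --with-deps'.")
        return False

# e.g. asyncio.run(playwright_browsers_available()) before starting Uvicorn in dev mode.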
 
1
+ # main.py (Corrected PermissionError by setting base_directory for Crawl4AI)
2
  import os
3
  import re
4
  import logging
 
90
  # --- Global variable for PTB app ---
91
  ptb_app: Optional[Application] = None
92
 
93
+ # --- Define a writable base directory for Crawl4AI ---
94
+ # Use /app which is the WORKDIR in the Dockerfile
95
+ CRAWL4AI_BASE_DIR = "/app/.crawl4ai_cache"
96
+ if _crawl4ai_available:
97
+ try:
98
+ os.makedirs(CRAWL4AI_BASE_DIR, exist_ok=True)
99
+ logger.info(f"Ensured Crawl4AI base directory exists: {CRAWL4AI_BASE_DIR}")
100
+ except Exception as e:
101
+ logger.error(f"Could not create Crawl4AI base directory {CRAWL4AI_BASE_DIR}: {e}. Crawl4AI caching might fail.")
102
+ # Proceeding, but caching/DB features of Crawl4AI might not work.
103
+
104
+
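# Note (illustrative sketch, not from the commit itself): the PermissionError this commit addresses
# appears to come from Crawl4AI writing its cache/DB somewhere the container user cannot write.
# Beyond the os.makedirs call above, a quick probe that the chosen base directory really is
# writable could look like this; the helper name is made up for illustration.
import logging
import tempfile

logger = logging.getLogger(__name__)

def is_writable(path: str) -> bool:
    """Return True if a temp file can be created (and auto-removed) inside `path`."""
    try:
        with tempfile.NamedTemporaryFile(dir=path):
            pass
        return True
    except OSError as e:
        logger.error(f"Directory {path} is not writable: {e}")
        return False

# e.g. if not is_writable(CRAWL4AI_BASE_DIR): treat Crawl4AI caching as unavailable up front.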
105
  # --- Environment Variable Loading & Configuration ---
106
  logger.info("Attempting to load secrets and configuration...")
107
  def get_secret(secret_name):
 
231
 
232
  sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
233
  params = {"token": api_token}
234
+ payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5, "channelHandleBoolean": False, "channelNameBoolean": False, "datePublishedBoolean": False, "relativeDateTextBoolean": False, }
235
  headers = {"Content-Type": "application/json"}
236
 
237
  try:
238
+ async with httpx.AsyncClient(timeout=120.0) as client:
239
  logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}")
240
  response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
241
  logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}")
 
244
  try:
245
  results = response.json()
246
  if isinstance(results, list) and len(results) > 0:
247
+ item = results[0]; content = None
 
 
248
  if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
249
  elif "text" in item and isinstance(item["text"], str): content = item["text"]
250
  elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"]
251
+ elif "captions" in item and isinstance(item["captions"], list):
252
  if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
253
  elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])
254
 
 
267
  except Exception as e: logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True); return None
268
 
269
  async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
270
+ global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
271
  if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
272
  logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
273
  transcript_text = None
 
274
  logger.info("[Primary YT] Attempting youtube-transcript-api...")
275
  try:
276
  transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
 
281
  except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
282
  except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None
283
 
 
284
  if transcript_text is None:
285
  logger.info("[Fallback YT 1] Trying Supadata API...")
286
  if SUPADATA_API_KEY:
 
289
  else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
290
  else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
291
 
 
292
  if transcript_text is None:
293
  logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...")
294
+ if _apify_token_exists:
295
  transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
296
  if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text
297
  else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.")
298
  else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
299
 
 
300
  if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
301
  return transcript_text
302
 
 
305
  # --- NEW Primary Method: Crawl4AI ---
306
  async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
307
  """Primary Web Method: Fetches and extracts content using Crawl4AI."""
308
+ global _crawl4ai_primary_scrape_enabled, CRAWL4AI_BASE_DIR # Use the defined base dir
309
  if not _crawl4ai_primary_scrape_enabled:
310
  logger.warning("[Crawl4AI Primary] Called but library is unavailable.")
311
  return None
 
313
  logger.info(f"[Crawl4AI Primary] Attempting fetch and extraction for: {url}")
314
 
315
  run_config = CrawlerRunConfig(
316
+ cache_mode=CacheMode.ENABLED, # Use cache now that base_dir is set
317
  page_timeout=60000, # 60 sec timeout
318
  verbose=False,
319
+ # Consider 'wait_for' or 'scan_full_page=True' for dynamic sites
 
320
  # Consider 'remove_overlay_elements=True' for cookie banners/popups
321
  )
322
+ # *** FIX: Pass base_directory to AsyncWebCrawler ***
323
+ # BrowserConfig defaults are usually fine (headless chromium)
324
  # browser_config = BrowserConfig(headless=True, verbose=False)
325
 
326
  extracted_text: Optional[str] = None
327
  try:
328
+ # Use context manager and provide base_directory
329
+ async with AsyncWebCrawler(base_directory=CRAWL4AI_BASE_DIR) as crawler:
330
+ # Pass browser_config if needed: AsyncWebCrawler(config=browser_config, base_directory=CRAWL4AI_BASE_DIR)
331
  logger.debug(f"[Crawl4AI Primary] Calling crawler.arun for {url}")
332
  result: CrawlResult = await crawler.arun(url=url, config=run_config)
333
  logger.debug(f"[Crawl4AI Primary] arun completed. Success: {result.success}, Status: {result.status_code}")
334
 
335
  if result.success:
336
  if result.markdown and isinstance(result.markdown, MarkdownGenerationResult):
 
337
  if result.markdown.fit_markdown and isinstance(result.markdown.fit_markdown, str) and len(result.markdown.fit_markdown.strip()) > 30:
338
  extracted_text = result.markdown.fit_markdown.strip()
339
  logger.debug(f"[Crawl4AI Primary] Using fit_markdown for {url}")
 
340
  elif result.markdown.raw_markdown and isinstance(result.markdown.raw_markdown, str):
341
  extracted_text = result.markdown.raw_markdown.strip()
342
  logger.debug(f"[Crawl4AI Primary] Using raw_markdown (fit_markdown unavailable/short) for {url}")
343
+ else: logger.warning(f"[Crawl4AI Primary] Markdown object present but no usable text content for {url}")
 
 
344
  elif result.markdown and isinstance(result.markdown, str):
345
  extracted_text = result.markdown.strip()
346
  logger.debug(f"[Crawl4AI Primary] Using direct result.markdown string for {url}")
347
+ elif result.cleaned_html:
348
  logger.warning(f"[Crawl4AI Primary] No markdown found, parsing cleaned_html with BS4 for {url}")
349
  try:
350
  soup = BeautifulSoup(result.cleaned_html, DEFAULT_PARSER)
351
  extracted_text = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
352
+ except Exception as bs_err: logger.error(f"[Crawl4AI Primary] Error parsing cleaned_html with BS4 for {url}: {bs_err}"); extracted_text = None
353
+ else: logger.warning(f"[Crawl4AI Primary] Crawl success but no markdown or cleaned_html found for {url}")
 
 
 
354
 
 
355
  if extracted_text and len(extracted_text) > 30:
356
  logger.info(f"[Crawl4AI Primary] Success via Crawl4AI for {url}. Length: {len(extracted_text)}")
357
  return extracted_text
 
367
  except asyncio.TimeoutError:
368
  logger.error(f"[Crawl4AI Primary] Timeout error during crawl for {url}")
369
  return None
370
+ except ImportError as ie:
371
  if "playwright" in str(ie).lower():
372
  logger.critical(f"[Crawl4AI Primary] Playwright drivers likely missing! Run 'playwright install' in your environment. Error: {ie}")
373
+ _crawl4ai_primary_scrape_enabled = False
374
+ else: logger.error(f"[Crawl4AI Primary] Unexpected ImportError during Crawl4AI execution for {url}: {ie}", exc_info=True)
 
375
  return None
376
  except Exception as e:
377
  logger.error(f"[Crawl4AI Primary] Unexpected error during Crawl4AI execution for {url}: {e}", exc_info=True)
 
378
  if "playwright" in str(e).lower() and ("install" in str(e).lower() or "executable" in str(e).lower()):
379
  logger.critical("[Crawl4AI Primary] Playwright drivers likely missing! Run 'playwright install' in your environment.")
380
+ _crawl4ai_primary_scrape_enabled = False
381
  return None
382
 
383
 
 
390
  logger.debug(f"[Web Scrape Fallback 1] Sending GET request to {url}")
391
  response = await client.get(url)
392
  logger.debug(f"[Web Scrape Fallback 1] Received response {response.status_code} from {url}")
393
+ response.raise_for_status()
394
  content_type = response.headers.get('content-type', '').lower()
395
+ if 'html' not in content_type and 'xml' not in content_type:
396
  logger.warning(f"[Web Scrape Fallback 1] Non-HTML/XML content type received from {url}: {content_type}")
397
+ if 'text/plain' in content_type: logger.info(f"[Web Scrape Fallback 1] Content type is text/plain for {url}, reading."); return response.text
398
  return None
399
+ try: return response.text
400
+ except Exception as e: logger.error(f"[Web Scrape Fallback 1] Error decoding response text for {url}: {e}"); return None
401
  except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Fallback 1] HTTP error {e.response.status_code} fetching {url}: {e}")
402
  except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 1] Timeout error fetching {url}")
403
  except httpx.TooManyRedirects: logger.error(f"[Web Scrape Fallback 1] Too many redirects fetching {url}")
 
410
  if not url: logger.error("[Web Scrape Fallback 1] No URL provided"); return None
411
  logger.info(f"[Web Scrape Fallback 1] Attempting direct fetch and parse for: {url}")
412
  html_content = await fetch_url_content_for_scrape(url)
413
+ if not html_content: logger.warning(f"[Web Scrape Fallback 1] Direct fetch failed for {url}."); return None
 
 
414
  try:
 
415
  def parse_html(content: str) -> Optional[str]:
416
  try:
417
  soup = BeautifulSoup(content, DEFAULT_PARSER)
418
+ for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]): element.extract()
419
+ main_content = soup.find('main') or soup.find('article') or soup.find(role='main') or soup.find(id=re.compile(r'content|main|body', re.I)) or soup.find(class_=re.compile(r'content|main|body|article|post', re.I))
420
  target_element = main_content if main_content else soup.body
421
  if not target_element:
422
+ logger.warning(f"[Web Scrape Fallback 1 Parse] Could not find body or main content for {url}")
423
  text_from_root = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
424
+ if text_from_root and len(text_from_root) > 50: logger.warning(f"[Web Scrape Fallback 1 Parse] Using text from root as fallback for {url}."); return text_from_root
 
 
425
  return None
426
  lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
427
  text = " ".join(lines)
428
+ if not text or len(text) < 50: logger.warning(f"[Web Scrape Fallback 1 Parse] Extracted text too short or empty for {url}. Length: {len(text)}"); return None
 
 
429
  return text
430
+ except Exception as parse_e: logger.error(f"[Web Scrape Fallback 1 Parse] BS4 parsing error for {url}: {parse_e}", exc_info=False); return None
431
  text_content = await asyncio.to_thread(parse_html, html_content)
432
+ if text_content: logger.info(f"[Web Scrape Fallback 1] Success via direct fetch & parse for {url} (len: {len(text_content)})"); return text_content
433
+ else: logger.warning(f"[Web Scrape Fallback 1] Parsing failed or yielded no content for {url}."); return None
434
+ except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during parsing for {url}: {e}", exc_info=True); return None
435
 
436
  # --- Fallback 2: urltotext.com API ---
437
  async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
 
470
  if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
471
  logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using Scraper's Proxy Parser API")
472
  api_host = "scrapers-proxy2.p.rapidapi.com"
473
+ encoded_url = urllib.parse.quote(url, safe='')
474
  api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true"
475
  headers = { "x-rapidapi-host": api_host, "x-rapidapi-key": api_key, "accept-encoding": "gzip" }
476
  try:
 
484
  content = data.get("content"); title = data.get("title"); extracted_text = ""
485
  if title and isinstance(title, str): extracted_text += title.strip() + ". "
486
  if content and isinstance(content, str): extracted_text += content.strip()
487
+ if extracted_text and len(extracted_text) > 30: logger.info(f"[Web Scrape Fallback 3] Success via Scraper's Proxy API for {url}. Len: {len(extracted_text)}"); return extracted_text
488
+ else: logger.warning(f"[Web Scrape Fallback 3] Scraper's Proxy API success but content/title too short/empty for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}"); return None
 
 
 
 
489
  except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 3] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
490
  except Exception as e: logger.error(f"[Web Scrape Fallback 3] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True); return None
491
  elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 3] Auth error (401) with {api_host}. Check RapidAPI key."); return None
492
+ elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check subscription/limits."); return None
493
  elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 3] Rate Limit (429) from {api_host}."); return None
494
  elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 3] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
495
  else: logger.error(f"[Web Scrape Fallback 3] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
 
524
  return None
525
  except json.JSONDecodeError:
526
  raw_text = response.text
527
+ if raw_text and len(raw_text) > 30: logger.warning(f"[Web Scrape Fallback 4] Failed JSON decode for AI Web Scraper, but found raw text. Status:{response.status_code}. Using raw text. Len: {len(raw_text)}"); return raw_text.strip()
528
  else: logger.error(f"[Web Scrape Fallback 4] Failed JSON decode AI Web Scraper API for {url}. Status:{response.status_code}. Resp:{raw_text[:500]}"); return None
529
  except Exception as e: logger.error(f"[Web Scrape Fallback 4] Error processing AI Web Scraper API success response for {url}: {e}", exc_info=True); return None
530
  elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 4] Auth error (401) with {api_host}. Check RapidAPI key."); return None
531
+ elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 4] Forbidden (403) from {api_host}. Check subscription/limits."); return None
532
  elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 4] Rate Limit (429) from {api_host}."); return None
533
  elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 4] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
534
  else: logger.error(f"[Web Scrape Fallback 4] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
 
564
  elif "content" in item and isinstance(item["content"], str): content = item["content"]
565
  elif "markdown" in item and isinstance(item["markdown"], str): content = item["markdown"]
566
  elif "html" in item and isinstance(item["html"], str):
567
+ logger.warning(f"[{actor_name} - FB{fallback_num}] No 'text' or 'markdown' found, parsing 'html'.")
568
  soup = BeautifulSoup(item["html"], DEFAULT_PARSER)
569
  content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
570
 
 
641
  "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
642
  "Here is the text to summarise:")
643
 
644
+ MAX_INPUT_LENGTH_GEMINI = 900000
645
  if len(text) > MAX_INPUT_LENGTH_GEMINI:
646
  logger.warning(f"[Gemini Primary] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating.");
647
  text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)"
 
726
  "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
727
  "Here is the text to summarise:")
728
 
729
+ MAX_INPUT_LENGTH_OR = 100000
730
  if len(text) > MAX_INPUT_LENGTH_OR:
731
  logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}). Truncating.");
732
  text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
 

  try:
  # --- 1. Initial User Feedback ---
+ processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (primary method first; this might take a moment)..."
  if status_message_id:
  try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
  except Exception as e: logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending a new one."); status_message_id = None

  is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}")

  if is_youtube:
+ # --- YouTube Transcript Logic ---
  video_id = extract_youtube_id(url)
+ if video_id: content = await get_youtube_transcript(video_id, url)
  else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
  if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
  else:
  # --- Website Scraping Logic (NEW Order: Crawl4AI -> Direct+BS4 -> APIs -> Apify) ---
+ global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN, _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists, _crawl4ai_primary_scrape_enabled
 
 

+ # Method 0: Primary Scrape (Crawl4AI)
  logger.info(f"[Task {task_id}] Trying Web Scrape Method 0 (Crawl4AI)...")
  if _crawl4ai_primary_scrape_enabled:
  content = await get_website_content_via_crawl4ai(url)
  if not content: logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) failed.")
+ else: logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) skipped - library/driver unavailable.")

+ # Method 1: Fallback 1 (Direct Fetch + BS4)
  if not content:
  logger.warning(f"[Task {task_id}] Method 0 failed. Trying Method 1 (Direct Fetch + BS4)...")
  await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
  content = await get_website_content_direct_bs4(url)
  if not content: logger.warning(f"[Task {task_id}] Method 1 (Direct Fetch + BS4) failed.")

+ # Method 2: Fallback 2 (urltotext.com)
  if not content:
  logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 2 (urltotext.com)...")
  if _urltotext_key_exists:

  if not content: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) failed.")
  else: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.")

+ # Method 3: Fallback 3 (Scraper's Proxy via RapidAPI)
  if not content:
  logger.warning(f"[Task {task_id}] Method 2 failed. Trying Method 3 (Scraper's Proxy)...")
  if _rapidapi_key_exists:

  if not content: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) failed.")
  else: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) RapidAPI key unavailable. Skipping.")

+ # Method 4: Fallback 4 (AI Web Scraper via RapidAPI)
  if not content:
  logger.warning(f"[Task {task_id}] Method 3 failed. Trying Method 4 (AI Web Scraper)...")
  if _rapidapi_key_exists:

  if not content: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) failed.")
  else: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) RapidAPI key unavailable. Skipping.")

+ # Method 5: Fallback 5 (Apify Website Content Crawler)
  if not content:
  logger.warning(f"[Task {task_id}] Method 4 failed. Trying Method 5 (Apify Crawler)...")
  if _apify_token_exists:

  if not content: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) failed.")
  else: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) APIFY_API_TOKEN unavailable. Skipping.")

+ # Method 6: Fallback 6 (Apify Text Scraper Free)
  if not content:
  logger.warning(f"[Task {task_id}] Method 5 failed. Trying Method 6 (Apify Text Scraper)...")
  if _apify_token_exists:

  if not content: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) failed.")
  else: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) APIFY_API_TOKEN unavailable. Skipping.")

+ # Final check
  if not content and not user_feedback_message:
+ user_feedback_message = "Sorry, I couldn't fetch readable content from that website using multiple methods (blocked/dynamic content/empty?). Even the advanced crawler failed."
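# NOTE (editorial sketch, not the commit's implementation): the Method 0-6 cascade above could be
# driven by a data table so adding or reordering scrapers only touches one list. The names below reuse
# helpers already referenced in this file; the tuple-driven loop itself is an illustrative assumption:
#   scrapers = [
#       ("Crawl4AI",         _crawl4ai_primary_scrape_enabled, lambda: get_website_content_via_crawl4ai(url)),
#       ("Direct Fetch+BS4", True,                             lambda: get_website_content_direct_bs4(url)),
#   ]
#   for name, enabled, fetch in scrapers:
#       if content: break
#       if not enabled: logger.warning(f"[Task {task_id}] {name} skipped."); continue
#       content = await fetch()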
 
  # --- 3. Summarization ---
  if content:
+ logger.info(f"[Task {task_id}] Content fetched (len:{len(content)}). Generating summary.")

  try:
  status_update_msg_id = message_to_delete_later_id or status_message_id
  if status_update_msg_id:

  except Exception as edit_e: logger.warning(f"[Task {task_id}] Failed to edit status message before summary: {edit_e}")

  await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
+ final_summary = await generate_summary(content, summary_type)

  if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
+ user_feedback_message = final_summary
  logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
  else:
  max_length = 4096
  summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
  await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} )
  for part in summary_parts[1:]:
+ await asyncio.sleep(0.5)
  await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} )
  success = True
  logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
+ user_feedback_message = None
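# NOTE (editorial sketch): the plain slicing above can split a word (or a Markdown entity, if a
# parse_mode were used) across the 4096-character Telegram limit. A whitespace-aware splitter is a
# small refinement; the helper below is illustrative, not part of the commit:
#   def split_message(text: str, limit: int = 4096) -> list[str]:
#       parts = []
#       while len(text) > limit:
#           cut = text.rfind("\n", 0, limit)
#           if cut <= 0: cut = text.rfind(" ", 0, limit)
#           if cut <= 0: cut = limit
#           parts.append(text[:cut]); text = text[cut:].lstrip()
#       parts.append(text)
#       return parts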
 
  # --- 4. Handle Final Failure Feedback ---
+ if user_feedback_message:
  logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}")
  await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} )

  logger.info(f"[Task {task_id}] Task completed. Success: {success}")

+ # --- Telegram Handlers ---
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
  user = update.effective_user
  if not user or not update.message: return
  mention = user.mention_html()

  "2. I'll ask how you want it summarised (paragraph or points).\n"
  "3. Click the button for your choice.\n"
  "4. Wait while I fetch the content and generate the summary!\n\n"
+ "⚙️ I use multiple methods to get content, starting with an advanced crawler and falling back to simpler methods if needed.\n\n" # Updated help text
  "**Commands:**\n"
  "`/start` - Display the welcome message\n"
  "`/help` - Show this help message" )
 
  url = update.message.text.strip(); user = update.effective_user
  if not user: return
  url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE)
+ if not url_pattern.search(url): logger.debug(f"Ignoring non-URL message from {user.id}: {url[:100]}"); return
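# NOTE (editorial): the pattern requires an explicit http/https scheme, so a link embedded anywhere in
# the message is picked up while a bare domain is ignored. Quick check:
#   url_pattern.search("see https://example.com/page please")   # -> match
#   url_pattern.search("see example.com please")                # -> None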
 
 

  match = url_pattern.search(url)
  if match:

  try:
  await update.message.reply_text(
  f"Okay, I see this link:\n`{extracted_url}`\n\nHow would you like it summarised?",
+ reply_markup=reply_markup, disable_web_page_preview=True, parse_mode=ParseMode.MARKDOWN )
  except BadRequest as e:
  if "chat not found" in str(e).lower() or "bot was blocked by the user" in str(e).lower(): logger.warning(f"Could not reply to user {user.id} (chat not found or blocked).")
  else: logger.error(f"BadRequest replying to URL message from {user.id}: {e}")
  except Exception as e: logger.error(f"Error replying to URL message from {user.id}: {e}", exc_info=True)
+ else: logger.debug(f"Ignoring message from {user.id} - no URL found by regex: {url[:100]}")
 

  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:

  if not url:
  logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Old button?")
+ try: await query.edit_message_text(text="Sorry, I couldn't find the original URL for this request (it might be too old). Please send the link again.")
  except BadRequest as e:
  if "message is not modified" in str(e).lower() or "message to edit not found" in str(e).lower(): pass
  else: logger.warning(f"Failed to edit 'URL not found' message {message_id_to_edit} for user {user.id}: {e}")
+ except Exception as e: logger.warning(f"Error editing 'URL not found' message {message_id_to_edit} for user {user.id}: {e}")
  return

  context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}")

  global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
  if not TELEGRAM_TOKEN:
  logger.critical("TELEGRAM_TOKEN missing in callback!")
+ try: await query.edit_message_text(text="❌ Bot config error (Token Missing).")
+ except Exception: pass
  return
  if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
  logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid in callback!")
+ try: await query.edit_message_text(text="❌ AI configuration error: No summarization models available.")
+ except Exception: pass
  return
  elif not _gemini_primary_enabled: logger.warning("Primary AI (Gemini) unavailable, relying on fallback.")
  elif not _openrouter_fallback_enabled: logger.warning("Fallback AI (OpenRouter) unavailable, relying on primary.")
 
  logger.error("Exception while handling an update:", exc_info=context.error)

+ # --- Application Setup & Web Framework ---

  async def setup_bot_config() -> Application:
  global TELEGRAM_TOKEN; logger.info("Configuring Telegram Application...")
 
  await ptb_app.bot.set_webhook(**set_webhook_args)
  webhook_info = await ptb_app.bot.get_webhook_info()
  if webhook_info.url == full_webhook_url: logger.info(f"Webhook successfully set: URL='{webhook_info.url}', Secret={bool(WEBHOOK_SECRET)}")
+ else: logger.error(f"Webhook URL mismatch! Expected '{full_webhook_url}', got '{webhook_info.url}'.")
  await ptb_app.start()
  logger.info("PTB Application started in webhook mode.")
  except Exception as e: logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True); raise RuntimeError(f"Failed to set webhook: {e}") from e
  else: logger.critical("Could not construct webhook URL."); raise RuntimeError("Webhook URL could not be determined.")
  else: logger.critical("SPACE_HOST env var not found."); raise RuntimeError("SPACE_HOST env var missing.")

+ logger.info("ASGI Lifespan: Startup complete."); yield

  except Exception as startup_err:
  logger.critical(f"Application startup failed: {startup_err}", exc_info=True)

  async def health_check(request: Request) -> PlainTextResponse:
  """Simple health check endpoint."""
  global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled
  global _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY, _crawl4ai_primary_scrape_enabled
  bot_status = "Not Initialized"; bot_username = "N/A"

  except Exception as e: bot_status = f"Error checking: {type(e).__name__}"; logger.warning(f"Health check: Error getting bot info: {e}")
  else: bot_status = "Not Initialized"; bot_username = "N/A"
 
 
  return PlainTextResponse(
  f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
  f"--- Summarization ---\n"

  f"Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
  f"Fallback 2 (Apify Actor): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
  f"--- Website Scraping ---\n"
+ f"Primary (Crawl4AI): {'Enabled' if _crawl4ai_primary_scrape_enabled else 'DISABLED - Library/Driver Missing?'}\n"
  f"Fallback 1 (Direct+BS4): Enabled\n"
  f"Fallback 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
  f"Fallback 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"
 
  except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
  except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # OK to prevent TG retries
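# NOTE (editorial sketch): when WEBHOOK_SECRET is set via set_webhook, Telegram echoes it back in the
# X-Telegram-Bot-Api-Secret-Token header. If the (elided) handler body does not already verify it, a
# guard at the top of telegram_webhook would look roughly like this (assumption, not the commit's code):
#   if WEBHOOK_SECRET and request.headers.get("X-Telegram-Bot-Api-Secret-Token") != WEBHOOK_SECRET:
#       return PlainTextResponse("Forbidden", status_code=403)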
 
+ # --- Starlette App Definition ---
  app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )
  logger.info("Starlette ASGI application created with health check and webhook routes.")

+ # --- Development Server ---
  if __name__ == '__main__':
  import uvicorn
  logger.warning("Running in development mode using Uvicorn directly - NOT for production!")

  try:
  from playwright.async_api import async_playwright
  logger.info("Playwright library found.")
+ # Consider adding: asyncio.run(install_playwright_drivers()) here to ensure the browser drivers exist.
+ # async def install_playwright_drivers():
+ #     import sys
+ #     from playwright.__main__ import main
+ #     logger.info("Attempting to install Playwright browser drivers...")
+ #     try:
+ #         # Execute the playwright install command programmatically
+ #         sys.argv = ["playwright", "install", "--with-deps"]
+ #         main()
+ #         logger.info("Playwright install command finished.")
+ #     except Exception as install_err:
+ #         logger.error(f"Playwright install command failed: {install_err}")
+
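# NOTE (editorial sketch): an alternative that avoids touching sys.argv is to invoke the Playwright CLI
# through the current interpreter; this is an illustrative suggestion, not part of the commit:
#   import subprocess, sys
#   subprocess.run([sys.executable, "-m", "playwright", "install", "--with-deps"], check=False)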
  except ImportError:
  logger.critical("Playwright library not found. Crawl4AI will likely fail.")
  logger.critical("RUN 'pip install playwright && playwright install --with-deps' in your terminal.")