Update main.py

main.py CHANGED
@@ -1,4 +1,4 @@
- # main.py (Corrected
+ # main.py (Corrected PermissionError by setting base_directory for Crawl4AI)
  import os
  import re
  import logging
@@ -90,6 +90,18 @@ if not _crawl4ai_available: logger.warning("crawl4ai library not found. Primary
  # --- Global variable for PTB app ---
  ptb_app: Optional[Application] = None

+ # --- Define a writable base directory for Crawl4AI ---
+ # Use /app which is the WORKDIR in the Dockerfile
+ CRAWL4AI_BASE_DIR = "/app/.crawl4ai_cache"
+ if _crawl4ai_available:
+     try:
+         os.makedirs(CRAWL4AI_BASE_DIR, exist_ok=True)
+         logger.info(f"Ensured Crawl4AI base directory exists: {CRAWL4AI_BASE_DIR}")
+     except Exception as e:
+         logger.error(f"Could not create Crawl4AI base directory {CRAWL4AI_BASE_DIR}: {e}. Crawl4AI caching might fail.")
+         # Proceeding, but caching/DB features of Crawl4AI might not work.
+
+
  # --- Environment Variable Loading & Configuration ---
  logger.info("Attempting to load secrets and configuration...")
  def get_secret(secret_name):
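For orientation, here is a minimal, self-contained sketch of the pattern this hunk introduces: create a writable cache directory up front and hand it to Crawl4AI's AsyncWebCrawler, mirroring how the crawler is invoked later in this diff. The fetch_markdown helper and the example URL are illustrative and not part of main.py; exact crawl4ai option names may differ by version.

# Illustrative sketch only (not part of the commit).
import asyncio
import os
from typing import Optional

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

# Writable location; /app is assumed to be the container WORKDIR, as in the Dockerfile.
CRAWL4AI_BASE_DIR = "/app/.crawl4ai_cache"
os.makedirs(CRAWL4AI_BASE_DIR, exist_ok=True)

async def fetch_markdown(url: str) -> Optional[str]:
    # Enable caching now that Crawl4AI has a directory it is allowed to write to.
    run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED, page_timeout=60000, verbose=False)
    async with AsyncWebCrawler(base_directory=CRAWL4AI_BASE_DIR) as crawler:
        result = await crawler.arun(url=url, config=run_config)
        if not result.success:
            return None
        md = result.markdown
        # result.markdown may be a plain string or a MarkdownGenerationResult (v0.5+).
        return md if isinstance(md, str) else getattr(md, "raw_markdown", None)

if __name__ == "__main__":
    print(asyncio.run(fetch_markdown("https://example.com")))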
@@ -219,20 +231,11 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:

      sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
      params = {"token": api_token}
-
-     payload = {
-         "urls": [video_url],
-         "outputFormat": "singleStringText",
-         "maxRetries": 5,
-         "channelHandleBoolean": False,
-         "channelNameBoolean": False,
-         "datePublishedBoolean": False,
-         "relativeDateTextBoolean": False,
-     }
+     payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5, "channelHandleBoolean": False, "channelNameBoolean": False, "datePublishedBoolean": False, "relativeDateTextBoolean": False, }
      headers = {"Content-Type": "application/json"}

      try:
-         async with httpx.AsyncClient(timeout=120.0) as client:
+         async with httpx.AsyncClient(timeout=120.0) as client:
              logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}")
              response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
              logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}")
@@ -241,13 +244,11 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
              try:
                  results = response.json()
                  if isinstance(results, list) and len(results) > 0:
-                     item = results[0]
-                     content = None
-                     # Check common keys for transcript text
+                     item = results[0]; content = None
                      if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
                      elif "text" in item and isinstance(item["text"], str): content = item["text"]
                      elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"]
-                     elif "captions" in item and isinstance(item["captions"], list):
+                     elif "captions" in item and isinstance(item["captions"], list):
                          if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
                          elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])

@@ -266,11 +267,10 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
      except Exception as e: logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True); return None

  async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
-     global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
+     global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
      if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
      logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
      transcript_text = None
-     # Method 1: youtube-transcript-api (Primary)
      logger.info("[Primary YT] Attempting youtube-transcript-api...")
      try:
          transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
@@ -281,7 +281,6 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
      except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
      except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None

-     # Method 2: Supadata (Fallback 1)
      if transcript_text is None:
          logger.info("[Fallback YT 1] Trying Supadata API...")
          if SUPADATA_API_KEY:
@@ -290,16 +289,14 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
              else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
          else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")

-     # Method 3: Apify (Fallback 2 - Default YT Actor)
      if transcript_text is None:
          logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...")
-         if _apify_token_exists:
+         if _apify_token_exists:
              transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
              if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text
              else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.")
          else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")

-     # Final Result
      if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
      return transcript_text

@@ -308,7 +305,7 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
  # --- NEW Primary Method: Crawl4AI ---
  async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
      """Primary Web Method: Fetches and extracts content using Crawl4AI."""
-     global _crawl4ai_primary_scrape_enabled
+     global _crawl4ai_primary_scrape_enabled, CRAWL4AI_BASE_DIR # Use the defined base dir
      if not _crawl4ai_primary_scrape_enabled:
          logger.warning("[Crawl4AI Primary] Called but library is unavailable.")
          return None
@@ -316,52 +313,45 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
      logger.info(f"[Crawl4AI Primary] Attempting fetch and extraction for: {url}")

      run_config = CrawlerRunConfig(
-         cache_mode=CacheMode.
+         cache_mode=CacheMode.ENABLED, # Use cache now that base_dir is set
          page_timeout=60000, # 60 sec timeout
          verbose=False,
-         #
-         # Consider adding 'wait_for' or 'scan_full_page=True' if needed for dynamic sites
+         # Consider 'wait_for' or 'scan_full_page=True' for dynamic sites
          # Consider 'remove_overlay_elements=True' for cookie banners/popups
      )
-     #
+     # *** FIX: Pass base_directory to AsyncWebCrawler ***
+     # BrowserConfig defaults are usually fine (headless chromium)
      # browser_config = BrowserConfig(headless=True, verbose=False)

      extracted_text: Optional[str] = None
      try:
-         # Use context manager
-         async with AsyncWebCrawler() as crawler:
+         # Use context manager and provide base_directory
+         async with AsyncWebCrawler(base_directory=CRAWL4AI_BASE_DIR) as crawler:
+             # Pass browser_config if needed: AsyncWebCrawler(config=browser_config, base_directory=CRAWL4AI_BASE_DIR)
              logger.debug(f"[Crawl4AI Primary] Calling crawler.arun for {url}")
              result: CrawlResult = await crawler.arun(url=url, config=run_config)
              logger.debug(f"[Crawl4AI Primary] arun completed. Success: {result.success}, Status: {result.status_code}")

              if result.success:
                  if result.markdown and isinstance(result.markdown, MarkdownGenerationResult):
-                     # Prefer fit_markdown if it exists and has content
                      if result.markdown.fit_markdown and isinstance(result.markdown.fit_markdown, str) and len(result.markdown.fit_markdown.strip()) > 30:
                          extracted_text = result.markdown.fit_markdown.strip()
                          logger.debug(f"[Crawl4AI Primary] Using fit_markdown for {url}")
-                     # Fallback to raw_markdown
                      elif result.markdown.raw_markdown and isinstance(result.markdown.raw_markdown, str):
                          extracted_text = result.markdown.raw_markdown.strip()
                          logger.debug(f"[Crawl4AI Primary] Using raw_markdown (fit_markdown unavailable/short) for {url}")
-                     else:
-                         logger.warning(f"[Crawl4AI Primary] Markdown object present but no usable text content for {url}")
-                 # Legacy/Alternative checks (less likely with v0.5+)
+                     else: logger.warning(f"[Crawl4AI Primary] Markdown object present but no usable text content for {url}")
                  elif result.markdown and isinstance(result.markdown, str):
                      extracted_text = result.markdown.strip()
                      logger.debug(f"[Crawl4AI Primary] Using direct result.markdown string for {url}")
-                 elif result.cleaned_html:
+                 elif result.cleaned_html:
                      logger.warning(f"[Crawl4AI Primary] No markdown found, parsing cleaned_html with BS4 for {url}")
                      try:
                          soup = BeautifulSoup(result.cleaned_html, DEFAULT_PARSER)
                          extracted_text = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
-                     except Exception as bs_err:
-
-                         extracted_text = None
-                 else:
-                     logger.warning(f"[Crawl4AI Primary] Crawl success but no markdown or cleaned_html found for {url}")
+                     except Exception as bs_err: logger.error(f"[Crawl4AI Primary] Error parsing cleaned_html with BS4 for {url}: {bs_err}"); extracted_text = None
+                 else: logger.warning(f"[Crawl4AI Primary] Crawl success but no markdown or cleaned_html found for {url}")

-             # Final length check
              if extracted_text and len(extracted_text) > 30:
                  logger.info(f"[Crawl4AI Primary] Success via Crawl4AI for {url}. Length: {len(extracted_text)}")
                  return extracted_text
@@ -377,19 +367,17 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
      except asyncio.TimeoutError:
          logger.error(f"[Crawl4AI Primary] Timeout error during crawl for {url}")
          return None
-     except ImportError as ie:
+     except ImportError as ie:
          if "playwright" in str(ie).lower():
              logger.critical(f"[Crawl4AI Primary] Playwright drivers likely missing! Run 'playwright install' in your environment. Error: {ie}")
-             _crawl4ai_primary_scrape_enabled = False
-         else:
-             logger.error(f"[Crawl4AI Primary] Unexpected ImportError during Crawl4AI execution for {url}: {ie}", exc_info=True)
+             _crawl4ai_primary_scrape_enabled = False
+         else: logger.error(f"[Crawl4AI Primary] Unexpected ImportError during Crawl4AI execution for {url}: {ie}", exc_info=True)
          return None
      except Exception as e:
          logger.error(f"[Crawl4AI Primary] Unexpected error during Crawl4AI execution for {url}: {e}", exc_info=True)
-         # Check if it's a playwright installation issue
          if "playwright" in str(e).lower() and ("install" in str(e).lower() or "executable" in str(e).lower()):
              logger.critical("[Crawl4AI Primary] Playwright drivers likely missing! Run 'playwright install' in your environment.")
-             _crawl4ai_primary_scrape_enabled = False
+             _crawl4ai_primary_scrape_enabled = False
          return None

@@ -402,19 +390,14 @@ async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
              logger.debug(f"[Web Scrape Fallback 1] Sending GET request to {url}")
              response = await client.get(url)
              logger.debug(f"[Web Scrape Fallback 1] Received response {response.status_code} from {url}")
-             response.raise_for_status()
+             response.raise_for_status()
              content_type = response.headers.get('content-type', '').lower()
-             if 'html' not in content_type and 'xml' not in content_type:
+             if 'html' not in content_type and 'xml' not in content_type:
                  logger.warning(f"[Web Scrape Fallback 1] Non-HTML/XML content type received from {url}: {content_type}")
-                 if 'text/plain' in content_type:
-                     logger.info(f"[Web Scrape Fallback 1] Content type is text/plain for {url}, attempting to read.")
-                     return response.text # Return plain text directly
-                 return None # Skip other non-html types
-             try:
-                 return response.text # Attempt to decode text, handle potential errors
-             except Exception as e:
-                 logger.error(f"[Web Scrape Fallback 1] Error decoding response text for {url}: {e}")
+                 if 'text/plain' in content_type: logger.info(f"[Web Scrape Fallback 1] Content type is text/plain for {url}, reading."); return response.text
                  return None
+             try: return response.text
+             except Exception as e: logger.error(f"[Web Scrape Fallback 1] Error decoding response text for {url}: {e}"); return None
      except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Fallback 1] HTTP error {e.response.status_code} fetching {url}: {e}")
      except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 1] Timeout error fetching {url}")
      except httpx.TooManyRedirects: logger.error(f"[Web Scrape Fallback 1] Too many redirects fetching {url}")
@@ -427,51 +410,28 @@ async def get_website_content_direct_bs4(url: str) -> Optional[str]:
      if not url: logger.error("[Web Scrape Fallback 1] No URL provided"); return None
      logger.info(f"[Web Scrape Fallback 1] Attempting direct fetch and parse for: {url}")
      html_content = await fetch_url_content_for_scrape(url)
-     if not html_content:
-         logger.warning(f"[Web Scrape Fallback 1] Direct fetch failed for {url}.")
-         return None
+     if not html_content: logger.warning(f"[Web Scrape Fallback 1] Direct fetch failed for {url}."); return None
      try:
-         # --- Parsing logic (run in thread to avoid blocking) ---
          def parse_html(content: str) -> Optional[str]:
              try:
                  soup = BeautifulSoup(content, DEFAULT_PARSER)
-                 for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]):
-
-                 main_content = soup.find('main') or \
-                     soup.find('article') or \
-                     soup.find(role='main') or \
-                     soup.find(id=re.compile(r'content|main|body', re.I)) or \
-                     soup.find(class_=re.compile(r'content|main|body|article|post', re.I))
+                 for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]): element.extract()
+                 main_content = soup.find('main') or soup.find('article') or soup.find(role='main') or soup.find(id=re.compile(r'content|main|body', re.I)) or soup.find(class_=re.compile(r'content|main|body|article|post', re.I))
                  target_element = main_content if main_content else soup.body
                  if not target_element:
-                     logger.warning(f"[Web Scrape Fallback 1 Parse] Could not find body or main content
+                     logger.warning(f"[Web Scrape Fallback 1 Parse] Could not find body or main content for {url}")
                      text_from_root = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
-                     if text_from_root and len(text_from_root) > 50:
-                         logger.warning(f"[Web Scrape Fallback 1 Parse] Using text from root as fallback for {url}.")
-                         return text_from_root
+                     if text_from_root and len(text_from_root) > 50: logger.warning(f"[Web Scrape Fallback 1 Parse] Using text from root as fallback for {url}."); return text_from_root
                      return None
                  lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
                  text = " ".join(lines)
-                 if not text or len(text) < 50:
-                     logger.warning(f"[Web Scrape Fallback 1 Parse] Extracted text seems too short or empty after cleaning for {url}. Length: {len(text)}")
-                     return None
+                 if not text or len(text) < 50: logger.warning(f"[Web Scrape Fallback 1 Parse] Extracted text too short or empty for {url}. Length: {len(text)}"); return None
                  return text
-             except Exception as parse_e:
-                 logger.error(f"[Web Scrape Fallback 1 Parse] Error during BeautifulSoup parsing for {url}: {parse_e}", exc_info=False) # Keep log cleaner
-                 return None
-         # --- End parsing logic ---
-
+             except Exception as parse_e: logger.error(f"[Web Scrape Fallback 1 Parse] BS4 parsing error for {url}: {parse_e}", exc_info=False); return None
          text_content = await asyncio.to_thread(parse_html, html_content)
-
-
-
-         return text_content
-         else:
-             logger.warning(f"[Web Scrape Fallback 1] Parsing failed or yielded no content for {url}.")
-             return None
-     except Exception as e:
-         logger.error(f"[Web Scrape Fallback 1] Unexpected error during parsing process for {url}: {e}", exc_info=True)
-         return None
+         if text_content: logger.info(f"[Web Scrape Fallback 1] Success via direct fetch & parse for {url} (len: {len(text_content)})"); return text_content
+         else: logger.warning(f"[Web Scrape Fallback 1] Parsing failed or yielded no content for {url}."); return None
+     except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during parsing for {url}: {e}", exc_info=True); return None

  # --- Fallback 2: urltotext.com API ---
  async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
@@ -510,7 +470,7 @@ async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
      if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
      logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using Scraper's Proxy Parser API")
      api_host = "scrapers-proxy2.p.rapidapi.com"
-     encoded_url = urllib.parse.quote(url, safe='')
+     encoded_url = urllib.parse.quote(url, safe='')
      api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true"
      headers = { "x-rapidapi-host": api_host, "x-rapidapi-key": api_key, "accept-encoding": "gzip" }
      try:
@@ -524,16 +484,12 @@ async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
                  content = data.get("content"); title = data.get("title"); extracted_text = ""
                  if title and isinstance(title, str): extracted_text += title.strip() + ". "
                  if content and isinstance(content, str): extracted_text += content.strip()
-                 if extracted_text and len(extracted_text) > 30:
-
-                     return extracted_text
-                 else:
-                     logger.warning(f"[Web Scrape Fallback 3] Scraper's Proxy API success but content/title seems empty or too short for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}")
-                     return None
+                 if extracted_text and len(extracted_text) > 30: logger.info(f"[Web Scrape Fallback 3] Success via Scraper's Proxy API for {url}. Len: {len(extracted_text)}"); return extracted_text
+                 else: logger.warning(f"[Web Scrape Fallback 3] Scraper's Proxy API success but content/title too short/empty for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}"); return None
              except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 3] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
              except Exception as e: logger.error(f"[Web Scrape Fallback 3] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True); return None
          elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 3] Auth error (401) with {api_host}. Check RapidAPI key."); return None
-         elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check
+         elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check subscription/limits."); return None
          elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 3] Rate Limit (429) from {api_host}."); return None
          elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 3] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
          else: logger.error(f"[Web Scrape Fallback 3] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
@@ -568,11 +524,11 @@ async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]:
                  return None
              except json.JSONDecodeError:
                  raw_text = response.text
-                 if raw_text and len(raw_text) > 30: logger.warning(f"[Web Scrape Fallback 4] Failed JSON decode for AI Web Scraper, but found raw text
+                 if raw_text and len(raw_text) > 30: logger.warning(f"[Web Scrape Fallback 4] Failed JSON decode for AI Web Scraper, but found raw text. Status:{response.status_code}. Using raw text. Len: {len(raw_text)}"); return raw_text.strip()
                  else: logger.error(f"[Web Scrape Fallback 4] Failed JSON decode AI Web Scraper API for {url}. Status:{response.status_code}. Resp:{raw_text[:500]}"); return None
              except Exception as e: logger.error(f"[Web Scrape Fallback 4] Error processing AI Web Scraper API success response for {url}: {e}", exc_info=True); return None
          elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 4] Auth error (401) with {api_host}. Check RapidAPI key."); return None
-         elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 4] Forbidden (403) from {api_host}. Check
+         elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 4] Forbidden (403) from {api_host}. Check subscription/limits."); return None
          elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 4] Rate Limit (429) from {api_host}."); return None
          elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 4] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
          else: logger.error(f"[Web Scrape Fallback 4] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
@@ -608,7 +564,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
              elif "content" in item and isinstance(item["content"], str): content = item["content"]
              elif "markdown" in item and isinstance(item["markdown"], str): content = item["markdown"]
              elif "html" in item and isinstance(item["html"], str):
-                 logger.warning(f"[{actor_name} - FB{fallback_num}] No 'text' or 'markdown' found,
+                 logger.warning(f"[{actor_name} - FB{fallback_num}] No 'text' or 'markdown' found, parsing 'html'.")
                  soup = BeautifulSoup(item["html"], DEFAULT_PARSER)
                  content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())

@@ -685,7 +641,7 @@ async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Opt
          "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
          "Here is the text to summarise:")

-     MAX_INPUT_LENGTH_GEMINI = 900000
+     MAX_INPUT_LENGTH_GEMINI = 900000
      if len(text) > MAX_INPUT_LENGTH_GEMINI:
          logger.warning(f"[Gemini Primary] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating.");
          text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)"
@@ -770,7 +726,7 @@ async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str],
          "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
          "Here is the text to summarise:")

-     MAX_INPUT_LENGTH_OR = 100000
+     MAX_INPUT_LENGTH_OR = 100000
      if len(text) > MAX_INPUT_LENGTH_OR:
          logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}). Truncating.");
          text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
@@ -862,7 +818,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:

      try:
          # --- 1. Initial User Feedback ---
-         processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (using primary method... might take a moment)..."
+         processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (using primary method... might take a moment)..."
          if status_message_id:
              try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
              except Exception as e: logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new."); status_message_id = None
@@ -879,33 +835,30 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
          is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}")

          if is_youtube:
-             # --- YouTube Transcript Logic
+             # --- YouTube Transcript Logic ---
              video_id = extract_youtube_id(url)
-             if video_id: content = await get_youtube_transcript(video_id, url)
+             if video_id: content = await get_youtube_transcript(video_id, url)
              else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
              if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
          else:
              # --- Website Scraping Logic (NEW Order: Crawl4AI -> Direct+BS4 -> APIs -> Apify) ---
-             global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN
-             global _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists
-             global _crawl4ai_primary_scrape_enabled # Ensure global access
+             global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN, _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists, _crawl4ai_primary_scrape_enabled

-             # Method 0: Primary Scrape (Crawl4AI
+             # Method 0: Primary Scrape (Crawl4AI)
              logger.info(f"[Task {task_id}] Trying Web Scrape Method 0 (Crawl4AI)...")
              if _crawl4ai_primary_scrape_enabled:
                  content = await get_website_content_via_crawl4ai(url)
                  if not content: logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) failed.")
-             else:
-                 logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) skipped - library unavailable.")
+             else: logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) skipped - library/driver unavailable.")

-             # Method 1: Fallback 1 (Direct Fetch + BS4
+             # Method 1: Fallback 1 (Direct Fetch + BS4)
              if not content:
                  logger.warning(f"[Task {task_id}] Method 0 failed. Trying Method 1 (Direct Fetch + BS4)...")
                  await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
                  content = await get_website_content_direct_bs4(url)
                  if not content: logger.warning(f"[Task {task_id}] Method 1 (Direct Fetch + BS4) failed.")

-             # Method 2: Fallback 2 (urltotext.com
+             # Method 2: Fallback 2 (urltotext.com)
              if not content:
                  logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 2 (urltotext.com)...")
                  if _urltotext_key_exists:
@@ -914,7 +867,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
                      if not content: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) failed.")
                  else: logger.warning("[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.")

-             # Method 3: Fallback 3 (Scraper's Proxy via RapidAPI
+             # Method 3: Fallback 3 (Scraper's Proxy via RapidAPI)
              if not content:
                  logger.warning(f"[Task {task_id}] Method 2 failed. Trying Method 3 (Scraper's Proxy)...")
                  if _rapidapi_key_exists:
@@ -923,7 +876,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
                      if not content: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) failed.")
                  else: logger.warning("[Task {task_id}] Method 3 (Scraper's Proxy) RapidAPI key unavailable. Skipping.")

-             # Method 4: Fallback 4 (AI Web Scraper via RapidAPI
+             # Method 4: Fallback 4 (AI Web Scraper via RapidAPI)
              if not content:
                  logger.warning(f"[Task {task_id}] Method 3 failed. Trying Method 4 (AI Web Scraper)...")
                  if _rapidapi_key_exists:
@@ -932,7 +885,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
                      if not content: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) failed.")
                  else: logger.warning("[Task {task_id}] Method 4 (AI Web Scraper) RapidAPI key unavailable. Skipping.")

-             # Method 5: Fallback 5 (Apify Website Content Crawler
+             # Method 5: Fallback 5 (Apify Website Content Crawler)
              if not content:
                  logger.warning(f"[Task {task_id}] Method 4 failed. Trying Method 5 (Apify Crawler)...")
                  if _apify_token_exists:
@@ -941,7 +894,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
                      if not content: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) failed.")
                  else: logger.warning("[Task {task_id}] Method 5 (Apify Crawler) APIFY_API_TOKEN unavailable. Skipping.")

-             # Method 6: Fallback 6 (Apify Text Scraper Free
+             # Method 6: Fallback 6 (Apify Text Scraper Free)
              if not content:
                  logger.warning(f"[Task {task_id}] Method 5 failed. Trying Method 6 (Apify Text Scraper)...")
                  if _apify_token_exists:
@@ -950,14 +903,13 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
                      if not content: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) failed.")
                  else: logger.warning("[Task {task_id}] Method 6 (Apify Text Scraper) APIFY_API_TOKEN unavailable. Skipping.")

-             # Final check
              if not content and not user_feedback_message:
-                 user_feedback_message = "Sorry, I couldn't fetch readable content from that website using multiple methods (blocked/dynamic content/empty?). Even the advanced crawler failed."

          # --- 3. Summarization ---
          if content:
-             logger.info(f"[Task {task_id}] Content fetched
-             # Update status message before summarization
              try:
                  status_update_msg_id = message_to_delete_later_id or status_message_id
                  if status_update_msg_id:
@@ -965,24 +917,24 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
              except Exception as edit_e: logger.warning(f"[Task {task_id}] Failed to edit status message before summary: {edit_e}")

              await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
-             final_summary = await generate_summary(content, summary_type)

              if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
-                 user_feedback_message = final_summary
                  logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
              else:
                  max_length = 4096
                  summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
                  await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} )
                  for part in summary_parts[1:]:
-                     await asyncio.sleep(0.5)
                      await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} )
                  success = True
                  logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
-                 user_feedback_message = None

          # --- 4. Handle Final Failure Feedback ---
-         if user_feedback_message:
              logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}")
              await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} )

@@ -1009,7 +961,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
          logger.info(f"[Task {task_id}] Task completed. Success: {success}")


- # --- Telegram Handlers
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
      user = update.effective_user; mention = user.mention_html()
      if not user or not update.message: return
@@ -1025,7 +977,7 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
          "2. I'll ask how you want it summarised (paragraph or points).\n"
          "3. Click the button for your choice.\n"
          "4. Wait while I fetch the content and generate the summary!\n\n"
-         "⚙️ I
          "**Commands:**\n"
          "`/start` - Display the welcome message\n"
          "`/help` - Show this help message" )
@@ -1036,9 +988,7 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
      url = update.message.text.strip(); user = update.effective_user
      if not user: return
      url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE)
-     if not url_pattern.search(url):
-         logger.debug(f"Ignoring non-URL message from {user.id}: {url[:100]}")
-         return

      match = url_pattern.search(url)
      if match:
@@ -1051,15 +1001,12 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
          try:
              await update.message.reply_text(
                  f"Okay, I see this link:\n`{extracted_url}`\n\nHow would you like it summarised?",
-                 reply_markup=reply_markup,
-                 disable_web_page_preview=True,
-                 parse_mode=ParseMode.MARKDOWN
-             )
          except BadRequest as e:
              if "chat not found" in str(e).lower() or "bot was blocked by the user" in str(e).lower(): logger.warning(f"Could not reply to user {user.id} (chat not found or blocked).")
              else: logger.error(f"BadRequest replying to URL message from {user.id}: {e}")
          except Exception as e: logger.error(f"Error replying to URL message from {user.id}: {e}", exc_info=True)
-     else: logger.debug(f"Ignoring message from {user.id}


  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
@@ -1077,13 +1024,11 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:

      if not url:
          logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Old button?")
-         try:
-             await query.edit_message_text(text="Sorry, I couldn't find the original URL for this request (it might be too old). Please send the link again.")
          except BadRequest as e:
              if "message is not modified" in str(e).lower() or "message to edit not found" in str(e).lower(): pass
              else: logger.warning(f"Failed to edit 'URL not found' message {message_id_to_edit} for user {user.id}: {e}")
-         except Exception as e:
-             logger.warning(f"Error editing 'URL not found' message {message_id_to_edit} for user {user.id}: {e}")
          return

      context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}")
@@ -1091,19 +1036,13 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
      global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
      if not TELEGRAM_TOKEN:
          logger.critical("TELEGRAM_TOKEN missing in callback!")
-
-
-             await query.edit_message_text(text="❌ Bot config error (Token Missing).")
-         except Exception:
-             pass # Ignore if editing fails
          return
      if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
          logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid in callback!")
-
-
-             await query.edit_message_text(text="❌ AI configuration error: No summarization models available.")
-         except Exception:
-             pass # Ignore if editing fails
          return
      elif not _gemini_primary_enabled: logger.warning("Primary AI (Gemini) unavailable, relying on fallback.")
      elif not _openrouter_fallback_enabled: logger.warning("Fallback AI (OpenRouter) unavailable, relying on primary.")
@@ -1119,7 +1058,7 @@ async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
      logger.error("Exception while handling an update:", exc_info=context.error)


- # --- Application Setup & Web Framework

  async def setup_bot_config() -> Application:
      logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
@@ -1168,14 +1107,14 @@ async def lifespan(app: Starlette):
                  await ptb_app.bot.set_webhook(**set_webhook_args)
                  webhook_info = await ptb_app.bot.get_webhook_info()
                  if webhook_info.url == full_webhook_url: logger.info(f"Webhook successfully set: URL='{webhook_info.url}', Secret={bool(WEBHOOK_SECRET)}")
-                 else: logger.error(f"Webhook URL mismatch
                  await ptb_app.start()
                  logger.info("PTB Application started in webhook mode.")
              except Exception as e: logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True); raise RuntimeError(f"Failed to set webhook: {e}") from e
          else: logger.critical("Could not construct webhook URL."); raise RuntimeError("Webhook URL could not be determined.")
      else: logger.critical("SPACE_HOST env var not found."); raise RuntimeError("SPACE_HOST env var missing.")

-     logger.info("ASGI Lifespan: Startup complete."); yield

  except Exception as startup_err:
      logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
@@ -1200,7 +1139,6 @@ async def lifespan(app: Starlette):

  async def health_check(request: Request) -> PlainTextResponse:
      """Simple health check endpoint."""
-     # ADDED _crawl4ai_primary_scrape_enabled
      global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled
      global _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY, _crawl4ai_primary_scrape_enabled
      bot_status = "Not Initialized"; bot_username = "N/A"
@@ -1221,7 +1159,6 @@ async def health_check(request: Request) -> PlainTextResponse:
          except Exception as e: bot_status = f"Error checking: {type(e).__name__}"; logger.warning(f"Health check: Error getting bot info: {e}")
      else: bot_status = "Not Initialized"; bot_username = "N/A"

-     # Updated health check output
      return PlainTextResponse(
          f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
          f"--- Summarization ---\n"
@@ -1232,7 +1169,7 @@ async def health_check(request: Request) -> PlainTextResponse:
          f"Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
          f"Fallback 2 (Apify Actor): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
          f"--- Website Scraping ---\n"
-         f"Primary (Crawl4AI): {'Enabled' if _crawl4ai_primary_scrape_enabled else 'DISABLED - Library/Driver Missing?'}\n"
          f"Fallback 1 (Direct+BS4): Enabled\n"
          f"Fallback 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
          f"Fallback 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"
@@ -1259,20 +1196,30 @@ async def telegram_webhook(request: Request) -> Response:
      except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
      except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # OK to prevent TG retries

- # --- Starlette App Definition
  app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )
  logger.info("Starlette ASGI application created with health check and webhook routes.")

- # --- Development Server
  if __name__ == '__main__':
      import uvicorn
      logger.warning("Running in development mode using Uvicorn directly - NOT for production!")
-     # One-time check/reminder for Playwright install during local dev
      try:
          from playwright.async_api import async_playwright
          logger.info("Playwright library found.")
-         #
-         #
      except ImportError:
          logger.critical("Playwright library not found. Crawl4AI will likely fail.")
          logger.critical("RUN 'pip install playwright && playwright install --with-deps' in your terminal.")
|
|
1 |
+
# main.py (Corrected PermissionError by setting base_directory for Crawl4AI)
|
2 |
import os
|
3 |
import re
|
4 |
import logging
|
|
|
90 |
# --- Global variable for PTB app ---
|
91 |
ptb_app: Optional[Application] = None
|
92 |
|
93 |
+
# --- Define a writable base directory for Crawl4AI ---
|
94 |
+
# Use /app which is the WORKDIR in the Dockerfile
|
95 |
+
CRAWL4AI_BASE_DIR = "/app/.crawl4ai_cache"
|
96 |
+
if _crawl4ai_available:
|
97 |
+
try:
|
98 |
+
os.makedirs(CRAWL4AI_BASE_DIR, exist_ok=True)
|
99 |
+
logger.info(f"Ensured Crawl4AI base directory exists: {CRAWL4AI_BASE_DIR}")
|
100 |
+
except Exception as e:
|
101 |
+
logger.error(f"Could not create Crawl4AI base directory {CRAWL4AI_BASE_DIR}: {e}. Crawl4AI caching might fail.")
|
102 |
+
# Proceeding, but caching/DB features of Crawl4AI might not work.
|
103 |
+
|
104 |
+
|
105 |
# --- Environment Variable Loading & Configuration ---
|
106 |
logger.info("Attempting to load secrets and configuration...")
|
107 |
def get_secret(secret_name):
|
|
|
231 |
|
232 |
sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
|
233 |
params = {"token": api_token}
|
234 |
+
payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5, "channelHandleBoolean": False, "channelNameBoolean": False, "datePublishedBoolean": False, "relativeDateTextBoolean": False, }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
headers = {"Content-Type": "application/json"}
|
236 |
|
237 |
try:
|
238 |
+
async with httpx.AsyncClient(timeout=120.0) as client:
|
239 |
logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}")
|
240 |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
|
241 |
logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}")
|
|
|
244 |
try:
|
245 |
results = response.json()
|
246 |
if isinstance(results, list) and len(results) > 0:
|
247 |
+
item = results[0]; content = None
|
|
|
|
|
248 |
if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
|
249 |
elif "text" in item and isinstance(item["text"], str): content = item["text"]
|
250 |
elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"]
|
251 |
+
elif "captions" in item and isinstance(item["captions"], list):
|
252 |
if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
|
253 |
elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])
|
254 |
|
|
|
267 |
except Exception as e: logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True); return None
|
268 |
|
269 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
|
270 |
+
global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
|
271 |
if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
|
272 |
logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
|
273 |
transcript_text = None
|
|
|
274 |
logger.info("[Primary YT] Attempting youtube-transcript-api...")
|
275 |
try:
|
276 |
transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
|
|
|
281 |
except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
|
282 |
except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None
|
283 |
|
|
|
284 |
if transcript_text is None:
|
285 |
logger.info("[Fallback YT 1] Trying Supadata API...")
|
286 |
if SUPADATA_API_KEY:
|
|
|
289 |
else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
|
290 |
else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
|
291 |
|
|
|
292 |
if transcript_text is None:
|
293 |
logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...")
|
294 |
+
if _apify_token_exists:
|
295 |
transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
|
296 |
if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text
|
297 |
else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.")
|
298 |
else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
|
299 |
|
|
|
300 |
if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
|
301 |
return transcript_text
|
302 |
|
|
|
305 |
# --- NEW Primary Method: Crawl4AI ---
|
306 |
async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
|
307 |
"""Primary Web Method: Fetches and extracts content using Crawl4AI."""
|
308 |
+
global _crawl4ai_primary_scrape_enabled, CRAWL4AI_BASE_DIR # Use the defined base dir
|
309 |
if not _crawl4ai_primary_scrape_enabled:
|
310 |
logger.warning("[Crawl4AI Primary] Called but library is unavailable.")
|
311 |
return None
|
|
|
313 |
logger.info(f"[Crawl4AI Primary] Attempting fetch and extraction for: {url}")
|
314 |
|
315 |
run_config = CrawlerRunConfig(
|
316 |
+
cache_mode=CacheMode.ENABLED, # Use cache now that base_dir is set
|
317 |
page_timeout=60000, # 60 sec timeout
|
318 |
verbose=False,
|
319 |
+
# Consider 'wait_for' or 'scan_full_page=True' for dynamic sites
|
|
|
320 |
# Consider 'remove_overlay_elements=True' for cookie banners/popups
|
321 |
)
|
322 |
+
# *** FIX: Pass base_directory to AsyncWebCrawler ***
|
323 |
+
# BrowserConfig defaults are usually fine (headless chromium)
|
324 |
# browser_config = BrowserConfig(headless=True, verbose=False)
|
325 |
|
326 |
extracted_text: Optional[str] = None
|
327 |
try:
|
328 |
+
# Use context manager and provide base_directory
|
329 |
+
async with AsyncWebCrawler(base_directory=CRAWL4AI_BASE_DIR) as crawler:
|
330 |
+
# Pass browser_config if needed: AsyncWebCrawler(config=browser_config, base_directory=CRAWL4AI_BASE_DIR)
|
331 |
logger.debug(f"[Crawl4AI Primary] Calling crawler.arun for {url}")
|
332 |
result: CrawlResult = await crawler.arun(url=url, config=run_config)
|
333 |
logger.debug(f"[Crawl4AI Primary] arun completed. Success: {result.success}, Status: {result.status_code}")
|
334 |
|
335 |
if result.success:
|
336 |
if result.markdown and isinstance(result.markdown, MarkdownGenerationResult):
|
|
|
337 |
if result.markdown.fit_markdown and isinstance(result.markdown.fit_markdown, str) and len(result.markdown.fit_markdown.strip()) > 30:
|
338 |
extracted_text = result.markdown.fit_markdown.strip()
|
339 |
logger.debug(f"[Crawl4AI Primary] Using fit_markdown for {url}")
|
|
|
340 |
elif result.markdown.raw_markdown and isinstance(result.markdown.raw_markdown, str):
|
341 |
extracted_text = result.markdown.raw_markdown.strip()
|
342 |
logger.debug(f"[Crawl4AI Primary] Using raw_markdown (fit_markdown unavailable/short) for {url}")
|
343 |
+
else: logger.warning(f"[Crawl4AI Primary] Markdown object present but no usable text content for {url}")
|
|
|
|
|
344 |
elif result.markdown and isinstance(result.markdown, str):
|
345 |
extracted_text = result.markdown.strip()
|
346 |
logger.debug(f"[Crawl4AI Primary] Using direct result.markdown string for {url}")
|
347 |
+
elif result.cleaned_html:
|
348 |
logger.warning(f"[Crawl4AI Primary] No markdown found, parsing cleaned_html with BS4 for {url}")
|
349 |
try:
|
350 |
soup = BeautifulSoup(result.cleaned_html, DEFAULT_PARSER)
|
351 |
extracted_text = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
|
352 |
+
except Exception as bs_err: logger.error(f"[Crawl4AI Primary] Error parsing cleaned_html with BS4 for {url}: {bs_err}"); extracted_text = None
|
353 |
+
else: logger.warning(f"[Crawl4AI Primary] Crawl success but no markdown or cleaned_html found for {url}")
|
|
|
|
|
|
|
354 |
|
|
|
355 |
if extracted_text and len(extracted_text) > 30:
|
356 |
logger.info(f"[Crawl4AI Primary] Success via Crawl4AI for {url}. Length: {len(extracted_text)}")
|
357 |
return extracted_text
|
|
|
367 |
except asyncio.TimeoutError:
|
368 |
logger.error(f"[Crawl4AI Primary] Timeout error during crawl for {url}")
|
369 |
return None
|
370 |
+
except ImportError as ie:
|
371 |
if "playwright" in str(ie).lower():
|
372 |
logger.critical(f"[Crawl4AI Primary] Playwright drivers likely missing! Run 'playwright install' in your environment. Error: {ie}")
|
373 |
+
_crawl4ai_primary_scrape_enabled = False
|
374 |
+
else: logger.error(f"[Crawl4AI Primary] Unexpected ImportError during Crawl4AI execution for {url}: {ie}", exc_info=True)
|
|
|
375 |
return None
|
376 |
except Exception as e:
|
377 |
logger.error(f"[Crawl4AI Primary] Unexpected error during Crawl4AI execution for {url}: {e}", exc_info=True)
|
|
|
378 |
if "playwright" in str(e).lower() and ("install" in str(e).lower() or "executable" in str(e).lower()):
|
379 |
logger.critical("[Crawl4AI Primary] Playwright drivers likely missing! Run 'playwright install' in your environment.")
|
380 |
+
_crawl4ai_primary_scrape_enabled = False
|
381 |
return None
|
382 |
|
383 |
|
|
|
390 |
logger.debug(f"[Web Scrape Fallback 1] Sending GET request to {url}")
|
391 |
response = await client.get(url)
|
392 |
logger.debug(f"[Web Scrape Fallback 1] Received response {response.status_code} from {url}")
|
393 |
+
response.raise_for_status()
|
394 |
content_type = response.headers.get('content-type', '').lower()
|
395 |
+
if 'html' not in content_type and 'xml' not in content_type:
|
396 |
logger.warning(f"[Web Scrape Fallback 1] Non-HTML/XML content type received from {url}: {content_type}")
|
397 |
+
if 'text/plain' in content_type: logger.info(f"[Web Scrape Fallback 1] Content type is text/plain for {url}, reading."); return response.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
398 |
return None
|
399 |
+
try: return response.text
|
400 |
+
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Error decoding response text for {url}: {e}"); return None
|
401 |
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Fallback 1] HTTP error {e.response.status_code} fetching {url}: {e}")
|
402 |
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 1] Timeout error fetching {url}")
|
403 |
except httpx.TooManyRedirects: logger.error(f"[Web Scrape Fallback 1] Too many redirects fetching {url}")
|
|
|
410 |
if not url: logger.error("[Web Scrape Fallback 1] No URL provided"); return None
|
411 |
logger.info(f"[Web Scrape Fallback 1] Attempting direct fetch and parse for: {url}")
|
412 |
html_content = await fetch_url_content_for_scrape(url)
|
413 |
+
if not html_content: logger.warning(f"[Web Scrape Fallback 1] Direct fetch failed for {url}."); return None
|
|
|
|
|
414 |
try:
|
|
|
415 |
def parse_html(content: str) -> Optional[str]:
|
416 |
try:
|
417 |
soup = BeautifulSoup(content, DEFAULT_PARSER)
|
418 |
+
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]): element.extract()
|
419 |
+
main_content = soup.find('main') or soup.find('article') or soup.find(role='main') or soup.find(id=re.compile(r'content|main|body', re.I)) or soup.find(class_=re.compile(r'content|main|body|article|post', re.I))
|
|
|
|
|
|
|
|
|
|
|
420 |
target_element = main_content if main_content else soup.body
|
421 |
if not target_element:
|
422 |
+
logger.warning(f"[Web Scrape Fallback 1 Parse] Could not find body or main content for {url}")
|
423 |
text_from_root = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
|
424 |
+
if text_from_root and len(text_from_root) > 50: logger.warning(f"[Web Scrape Fallback 1 Parse] Using text from root as fallback for {url}."); return text_from_root
|
|
|
|
|
425 |
return None
|
426 |
lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
|
427 |
text = " ".join(lines)
|
428 |
+
if not text or len(text) < 50: logger.warning(f"[Web Scrape Fallback 1 Parse] Extracted text too short or empty for {url}. Length: {len(text)}"); return None
|
|
|
|
|
429 |
return text
|
430 |
+
except Exception as parse_e: logger.error(f"[Web Scrape Fallback 1 Parse] BS4 parsing error for {url}: {parse_e}", exc_info=False); return None
|
|
|
|
|
|
|
|
|
431 |
text_content = await asyncio.to_thread(parse_html, html_content)
|
432 |
+
if text_content: logger.info(f"[Web Scrape Fallback 1] Success via direct fetch & parse for {url} (len: {len(text_content)})"); return text_content
|
433 |
+
else: logger.warning(f"[Web Scrape Fallback 1] Parsing failed or yielded no content for {url}."); return None
|
434 |
+
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during parsing for {url}: {e}", exc_info=True); return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
435 |
|
436 |
# --- Fallback 2: urltotext.com API ---
|
437 |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
|
|
470 |
if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
|
471 |
logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using Scraper's Proxy Parser API")
|
472 |
api_host = "scrapers-proxy2.p.rapidapi.com"
|
473 |
+
encoded_url = urllib.parse.quote(url, safe='')
|
474 |
api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true"
|
475 |
headers = { "x-rapidapi-host": api_host, "x-rapidapi-key": api_key, "accept-encoding": "gzip" }
|
476 |
try:
|
|
|
484 |
content = data.get("content"); title = data.get("title"); extracted_text = ""
|
485 |
if title and isinstance(title, str): extracted_text += title.strip() + ". "
|
486 |
if content and isinstance(content, str): extracted_text += content.strip()
|
487 |
+
if extracted_text and len(extracted_text) > 30: logger.info(f"[Web Scrape Fallback 3] Success via Scraper's Proxy API for {url}. Len: {len(extracted_text)}"); return extracted_text
|
488 |
+
else: logger.warning(f"[Web Scrape Fallback 3] Scraper's Proxy API success but content/title too short/empty for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}"); return None
|
|
|
|
|
|
|
|
|
489 |
except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 3] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
|
490 |
except Exception as e: logger.error(f"[Web Scrape Fallback 3] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True); return None
|
491 |
elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 3] Auth error (401) with {api_host}. Check RapidAPI key."); return None
|
492 |
+
elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check subscription/limits."); return None
|
493 |
elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 3] Rate Limit (429) from {api_host}."); return None
|
494 |
elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 3] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
|
495 |
else: logger.error(f"[Web Scrape Fallback 3] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
|
|
|
524 |
return None
|
525 |
except json.JSONDecodeError:
|
526 |
raw_text = response.text
|
527 |
+
if raw_text and len(raw_text) > 30: logger.warning(f"[Web Scrape Fallback 4] Failed JSON decode for AI Web Scraper, but found raw text. Status:{response.status_code}. Using raw text. Len: {len(raw_text)}"); return raw_text.strip()
|
528 |
else: logger.error(f"[Web Scrape Fallback 4] Failed JSON decode AI Web Scraper API for {url}. Status:{response.status_code}. Resp:{raw_text[:500]}"); return None
|
529 |
except Exception as e: logger.error(f"[Web Scrape Fallback 4] Error processing AI Web Scraper API success response for {url}: {e}", exc_info=True); return None
|
530 |
elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 4] Auth error (401) with {api_host}. Check RapidAPI key."); return None
|
531 |
+
elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 4] Forbidden (403) from {api_host}. Check subscription/limits."); return None
|
532 |
elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 4] Rate Limit (429) from {api_host}."); return None
|
533 |
elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 4] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
|
534 |
else: logger.error(f"[Web Scrape Fallback 4] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
|
|
|
564 |
elif "content" in item and isinstance(item["content"], str): content = item["content"]
|
565 |
elif "markdown" in item and isinstance(item["markdown"], str): content = item["markdown"]
|
566 |
elif "html" in item and isinstance(item["html"], str):
|
567 |
+
logger.warning(f"[{actor_name} - FB{fallback_num}] No 'text' or 'markdown' found, parsing 'html'.")
|
568 |
soup = BeautifulSoup(item["html"], DEFAULT_PARSER)
|
569 |
content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
|
570 |
|
|
|
641 |
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
|
642 |
"Here is the text to summarise:")
|
643 |
|
644 |
+
MAX_INPUT_LENGTH_GEMINI = 900000
|
645 |
if len(text) > MAX_INPUT_LENGTH_GEMINI:
|
646 |
logger.warning(f"[Gemini Primary] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating.");
|
647 |
text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)"
|
|
|
726 |
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
|
727 |
"Here is the text to summarise:")
|
728 |
|
729 |
+
MAX_INPUT_LENGTH_OR = 100000
|
730 |
if len(text) > MAX_INPUT_LENGTH_OR:
|
731 |
logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}). Truncating.");
|
732 |
text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
|
|
|
818 |
|
819 |
try:
|
820 |
# --- 1. Initial User Feedback ---
|
821 |
+
processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (using primary method... might take a moment)..."
|
822 |
if status_message_id:
|
823 |
try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
|
824 |
except Exception as e: logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new."); status_message_id = None
    ...

     is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}")

     if is_youtube:
+        # --- YouTube Transcript Logic ---
         video_id = extract_youtube_id(url)
+        if video_id: content = await get_youtube_transcript(video_id, url)
         else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
         if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
     else:
         # --- Website Scraping Logic (NEW Order: Crawl4AI -> Direct+BS4 -> APIs -> Apify) ---
+        global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN, _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists, _crawl4ai_primary_scrape_enabled

+        # Method 0: Primary Scrape (Crawl4AI)
         logger.info(f"[Task {task_id}] Trying Web Scrape Method 0 (Crawl4AI)...")
         if _crawl4ai_primary_scrape_enabled:
             content = await get_website_content_via_crawl4ai(url)
             if not content: logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) failed.")
+        else: logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) skipped - library/driver unavailable.")

+        # Method 1: Fallback 1 (Direct Fetch + BS4)
         if not content:
             logger.warning(f"[Task {task_id}] Method 0 failed. Trying Method 1 (Direct Fetch + BS4)...")
             await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
             content = await get_website_content_direct_bs4(url)
             if not content: logger.warning(f"[Task {task_id}] Method 1 (Direct Fetch + BS4) failed.")

+        # Method 2: Fallback 2 (urltotext.com)
         if not content:
             logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 2 (urltotext.com)...")
             if _urltotext_key_exists:
                ...
                 if not content: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) failed.")
             else: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.")

+        # Method 3: Fallback 3 (Scraper's Proxy via RapidAPI)
         if not content:
             logger.warning(f"[Task {task_id}] Method 2 failed. Trying Method 3 (Scraper's Proxy)...")
             if _rapidapi_key_exists:
                ...
                 if not content: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) failed.")
             else: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) RapidAPI key unavailable. Skipping.")

+        # Method 4: Fallback 4 (AI Web Scraper via RapidAPI)
         if not content:
             logger.warning(f"[Task {task_id}] Method 3 failed. Trying Method 4 (AI Web Scraper)...")
             if _rapidapi_key_exists:
                ...
                 if not content: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) failed.")
             else: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) RapidAPI key unavailable. Skipping.")

+        # Method 5: Fallback 5 (Apify Website Content Crawler)
         if not content:
             logger.warning(f"[Task {task_id}] Method 4 failed. Trying Method 5 (Apify Crawler)...")
             if _apify_token_exists:
                ...
                 if not content: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) failed.")
             else: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) APIFY_API_TOKEN unavailable. Skipping.")

+        # Method 6: Fallback 6 (Apify Text Scraper Free)
         if not content:
             logger.warning(f"[Task {task_id}] Method 5 failed. Trying Method 6 (Apify Text Scraper)...")
             if _apify_token_exists:
                ...
                 if not content: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) failed.")
             else: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) APIFY_API_TOKEN unavailable. Skipping.")

+        # Final check
         if not content and not user_feedback_message:
+            user_feedback_message = "Sorry, I couldn't fetch readable content from that website using multiple methods (blocked/dynamic content/empty?). Even the advanced crawler failed."
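The numbered methods above (0 through 6) all follow the same pattern: skip when the required key or library is missing, try the scraper, log on failure, and fall through to the next one. The chain could also be driven by a list of entries. A minimal sketch, in which `run_scrape_chain` and the tuple layout are illustrative and the scraper coroutines are assumed to have the same signatures as the ones called above:

    import logging
    from typing import Awaitable, Callable, Optional

    logger = logging.getLogger(__name__)

    async def run_scrape_chain(
        url: str,
        scrapers: list[tuple[str, bool, Callable[[str], Awaitable[Optional[str]]]]],
    ) -> Optional[str]:
        """Try each enabled scraper in order and return the first non-empty result."""
        for name, enabled, scraper in scrapers:
            if not enabled:
                logger.warning(f"{name} skipped - key/library unavailable.")
                continue
            content = await scraper(url)
            if content:
                return content
            logger.warning(f"{name} failed.")
        return None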

     # --- 3. Summarization ---
     if content:
+        logger.info(f"[Task {task_id}] Content fetched (len:{len(content)}). Generating summary.")
         try:
             status_update_msg_id = message_to_delete_later_id or status_message_id
             if status_update_msg_id:
                ...
         except Exception as edit_e: logger.warning(f"[Task {task_id}] Failed to edit status message before summary: {edit_e}")

         await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
+        final_summary = await generate_summary(content, summary_type)

         if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
+            user_feedback_message = final_summary
             logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
         else:
             max_length = 4096
             summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
             await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} )
             for part in summary_parts[1:]:
+                await asyncio.sleep(0.5)
                 await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} )
             success = True
             logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
+            user_feedback_message = None
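The hard slice at 4096 characters respects Telegram's message-length cap but can cut a sentence mid-word. A splitter that prefers newline boundaries is one alternative; a minimal sketch, with `split_message` as an illustrative helper rather than anything defined in main.py:

    def split_message(text: str, max_length: int = 4096) -> list[str]:
        """Split text into chunks <= max_length, breaking on the last newline when possible."""
        parts: list[str] = []
        while len(text) > max_length:
            cut = text.rfind("\n", 0, max_length)
            if cut <= 0:
                cut = max_length  # no usable newline, fall back to a hard cut
            parts.append(text[:cut])
            text = text[cut:].lstrip("\n")
        if text:
            parts.append(text)
        return parts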

     # --- 4. Handle Final Failure Feedback ---
+    if user_feedback_message:
         logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}")
         await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} )

    ...
     logger.info(f"[Task {task_id}] Task completed. Success: {success}")

+# --- Telegram Handlers ---
 async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
     user = update.effective_user
     if not user or not update.message: return
     mention = user.mention_html()
    ...
         "2. I'll ask how you want it summarised (paragraph or points).\n"
         "3. Click the button for your choice.\n"
         "4. Wait while I fetch the content and generate the summary!\n\n"
+        "⚙️ I use multiple methods to get content, starting with an advanced crawler and falling back to simpler methods if needed.\n\n"
         "**Commands:**\n"
         "`/start` - Display the welcome message\n"
         "`/help` - Show this help message" )
...

     url = update.message.text.strip(); user = update.effective_user
     if not user: return
     url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE)
+    if not url_pattern.search(url): logger.debug(f"Ignoring non-URL message from {user.id}: {url[:100]}"); return

     match = url_pattern.search(url)
     if match:
        ...
         try:
             await update.message.reply_text(
                 f"Okay, I see this link:\n`{extracted_url}`\n\nHow would you like it summarised?",
+                reply_markup=reply_markup, disable_web_page_preview=True, parse_mode=ParseMode.MARKDOWN )
         except BadRequest as e:
             if "chat not found" in str(e).lower() or "bot was blocked by the user" in str(e).lower(): logger.warning(f"Could not reply to user {user.id} (chat not found or blocked).")
             else: logger.error(f"BadRequest replying to URL message from {user.id}: {e}")
         except Exception as e: logger.error(f"Error replying to URL message from {user.id}: {e}", exc_info=True)
+    else: logger.debug(f"Ignoring message from {user.id} - no URL found by regex: {url[:100]}")

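The regex above is deliberately loose: it accepts any http(s) scheme followed by a plausible host and then consumes up to the next whitespace. A small illustrative check of how it behaves (the sample strings are invented):

    import re

    url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE)

    for text in ("please summarise https://example.com/article?id=1 thanks", "no link here"):
        match = url_pattern.search(text)
        print(match.group(0) if match else "no URL found")
    # -> https://example.com/article?id=1
    # -> no URL found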
 async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    ...

     if not url:
         logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Old button?")
+        try: await query.edit_message_text(text="Sorry, I couldn't find the original URL for this request (it might be too old). Please send the link again.")
         except BadRequest as e:
             if "message is not modified" in str(e).lower() or "message to edit not found" in str(e).lower(): pass
             else: logger.warning(f"Failed to edit 'URL not found' message {message_id_to_edit} for user {user.id}: {e}")
+        except Exception as e: logger.warning(f"Error editing 'URL not found' message {message_id_to_edit} for user {user.id}: {e}")
         return

     context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}")
    ...
     global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
     if not TELEGRAM_TOKEN:
         logger.critical("TELEGRAM_TOKEN missing in callback!")
+        try: await query.edit_message_text(text="❌ Bot config error (Token Missing).")
+        except Exception: pass
         return
     if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
         logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid in callback!")
+        try: await query.edit_message_text(text="❌ AI configuration error: No summarization models available.")
+        except Exception: pass
         return
     elif not _gemini_primary_enabled: logger.warning("Primary AI (Gemini) unavailable, relying on fallback.")
     elif not _openrouter_fallback_enabled: logger.warning("Fallback AI (OpenRouter) unavailable, relying on primary.")

...
     logger.error("Exception while handling an update:", exc_info=context.error)

+# --- Application Setup & Web Framework ---

 async def setup_bot_config() -> Application:
     logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
...

                 await ptb_app.bot.set_webhook(**set_webhook_args)
                 webhook_info = await ptb_app.bot.get_webhook_info()
                 if webhook_info.url == full_webhook_url: logger.info(f"Webhook successfully set: URL='{webhook_info.url}', Secret={bool(WEBHOOK_SECRET)}")
+                else: logger.error(f"Webhook URL mismatch! Expected '{full_webhook_url}', Got '{webhook_info.url}'.")
                 await ptb_app.start()
                 logger.info("PTB Application started in webhook mode.")
             except Exception as e: logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True); raise RuntimeError(f"Failed to set webhook: {e}") from e
         else: logger.critical("Could not construct webhook URL."); raise RuntimeError("Webhook URL could not be determined.")
     else: logger.critical("SPACE_HOST env var not found."); raise RuntimeError("SPACE_HOST env var missing.")

+    logger.info("ASGI Lifespan: Startup complete."); yield

 except Exception as startup_err:
     logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
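When a secret is configured with set_webhook, Telegram echoes it back on every webhook call in the X-Telegram-Bot-Api-Secret-Token header, so the receiving endpoint can reject spoofed requests. A minimal sketch of such a check for a Starlette endpoint; the helper name and the WEBHOOK_SECRET lookup here are illustrative and not necessarily how main.py wires it:

    import os
    from typing import Optional
    from starlette.requests import Request
    from starlette.responses import PlainTextResponse

    WEBHOOK_SECRET = os.environ.get("WEBHOOK_SECRET", "")

    async def verify_telegram_secret(request: Request) -> Optional[PlainTextResponse]:
        """Return a 403 response if the secret header does not match, else None."""
        if WEBHOOK_SECRET and request.headers.get("X-Telegram-Bot-Api-Secret-Token") != WEBHOOK_SECRET:
            return PlainTextResponse("Forbidden: invalid secret token", status_code=403)
        return None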
...

 async def health_check(request: Request) -> PlainTextResponse:
     """Simple health check endpoint."""
     global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled
     global _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY, _crawl4ai_primary_scrape_enabled
     bot_status = "Not Initialized"; bot_username = "N/A"
    ...
     except Exception as e: bot_status = f"Error checking: {type(e).__name__}"; logger.warning(f"Health check: Error getting bot info: {e}")
     else: bot_status = "Not Initialized"; bot_username = "N/A"

     return PlainTextResponse(
         f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
         f"--- Summarization ---\n"
        ...
         f"Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
         f"Fallback 2 (Apify Actor): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
         f"--- Website Scraping ---\n"
+        f"Primary (Crawl4AI): {'Enabled' if _crawl4ai_primary_scrape_enabled else 'DISABLED - Library/Driver Missing?'}\n"
         f"Fallback 1 (Direct+BS4): Enabled\n"
         f"Fallback 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
         f"Fallback 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"
...

     except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
     except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # OK to prevent TG retries

+# --- Starlette App Definition ---
 app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )
 logger.info("Starlette ASGI application created with health check and webhook routes.")

+# --- Development Server ---
 if __name__ == '__main__':
     import uvicorn
     logger.warning("Running in development mode using Uvicorn directly - NOT for production!")
     try:
         from playwright.async_api import async_playwright
         logger.info("Playwright library found.")
+        # Consider adding: asyncio.run(install_playwright_drivers()) to ensure they exist
+        # async def install_playwright_drivers():
+        #     import sys
+        #     from playwright.__main__ import main
+        #     logger.info("Attempting to install Playwright browser drivers...")
+        #     try:
+        #         # Execute the playwright install command programmatically
+        #         sys.argv = ["playwright", "install", "--with-deps"]
+        #         main()
+        #         logger.info("Playwright install command finished.")
+        #     except Exception as install_err:
+        #         logger.error(f"Playwright install command failed: {install_err}")
+
     except ImportError:
         logger.critical("Playwright library not found. Crawl4AI will likely fail.")
         logger.critical("RUN 'pip install playwright && playwright install --with-deps' in your terminal.")