Update main.py

main.py CHANGED
@@ -1,4 +1,4 @@
- # main.py (Corrected
+ # main.py (Corrected PermissionError by setting base_directory for Crawl4AI)
  import os
  import re
  import logging
@@ -90,6 +90,18 @@ if not _crawl4ai_available: logger.warning("crawl4ai library not found. Primary
  # --- Global variable for PTB app ---
  ptb_app: Optional[Application] = None

+ # --- Define a writable base directory for Crawl4AI ---
+ # Use /app which is the WORKDIR in the Dockerfile
+ CRAWL4AI_BASE_DIR = "/app/.crawl4ai_cache"
+ if _crawl4ai_available:
+     try:
+         os.makedirs(CRAWL4AI_BASE_DIR, exist_ok=True)
+         logger.info(f"Ensured Crawl4AI base directory exists: {CRAWL4AI_BASE_DIR}")
+     except Exception as e:
+         logger.error(f"Could not create Crawl4AI base directory {CRAWL4AI_BASE_DIR}: {e}. Crawl4AI caching might fail.")
+         # Proceeding, but caching/DB features of Crawl4AI might not work.
+
+
  # --- Environment Variable Loading & Configuration ---
  logger.info("Attempting to load secrets and configuration...")
  def get_secret(secret_name):
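For orientation, here is a minimal, self-contained sketch of the pattern this hunk introduces: create a writable cache directory up front and hand it to Crawl4AI's AsyncWebCrawler, mirroring how the crawler is invoked later in this diff. The fetch_markdown helper and the example URL are illustrative and not part of main.py; exact crawl4ai option names may differ by version.

# Illustrative sketch only (not part of the commit).
import asyncio
import os
from typing import Optional

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

# Writable location; /app is assumed to be the container WORKDIR, as in the Dockerfile.
CRAWL4AI_BASE_DIR = "/app/.crawl4ai_cache"
os.makedirs(CRAWL4AI_BASE_DIR, exist_ok=True)

async def fetch_markdown(url: str) -> Optional[str]:
    # Enable caching now that Crawl4AI has a directory it is allowed to write to.
    run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED, page_timeout=60000, verbose=False)
    async with AsyncWebCrawler(base_directory=CRAWL4AI_BASE_DIR) as crawler:
        result = await crawler.arun(url=url, config=run_config)
        if not result.success:
            return None
        md = result.markdown
        # result.markdown may be a plain string or a MarkdownGenerationResult (v0.5+).
        return md if isinstance(md, str) else getattr(md, "raw_markdown", None)

if __name__ == "__main__":
    print(asyncio.run(fetch_markdown("https://example.com")))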
@@ -219,20 +231,11 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:

      sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
      params = {"token": api_token}
-
-     payload = {
-         "urls": [video_url],
-         "outputFormat": "singleStringText",
-         "maxRetries": 5,
-         "channelHandleBoolean": False,
-         "channelNameBoolean": False,
-         "datePublishedBoolean": False,
-         "relativeDateTextBoolean": False,
-     }
+     payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5, "channelHandleBoolean": False, "channelNameBoolean": False, "datePublishedBoolean": False, "relativeDateTextBoolean": False, }
      headers = {"Content-Type": "application/json"}

      try:
-         async with httpx.AsyncClient(timeout=120.0) as client:
+         async with httpx.AsyncClient(timeout=120.0) as client:
              logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}")
              response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
              logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}")
@@ -241,13 +244,11 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
              try:
                  results = response.json()
                  if isinstance(results, list) and len(results) > 0:
-                     item = results[0]
-                     content = None
-                     # Check common keys for transcript text
+                     item = results[0]; content = None
                      if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
                      elif "text" in item and isinstance(item["text"], str): content = item["text"]
                      elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"]
-                     elif "captions" in item and isinstance(item["captions"], list):
+                     elif "captions" in item and isinstance(item["captions"], list):
                          if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
                          elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])

@@ -266,11 +267,10 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
      except Exception as e: logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True); return None

  async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
-     global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
+     global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
      if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
      logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
      transcript_text = None
-     # Method 1: youtube-transcript-api (Primary)
      logger.info("[Primary YT] Attempting youtube-transcript-api...")
      try:
          transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
@@ -281,7 +281,6 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
      except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
      except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None

-     # Method 2: Supadata (Fallback 1)
      if transcript_text is None:
          logger.info("[Fallback YT 1] Trying Supadata API...")
          if SUPADATA_API_KEY:
@@ -290,16 +289,14 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
              else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
          else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")

-     # Method 3: Apify (Fallback 2 - Default YT Actor)
      if transcript_text is None:
          logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...")
-         if _apify_token_exists:
+         if _apify_token_exists:
              transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
              if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text
              else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.")
          else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")

-     # Final Result
      if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
      return transcript_text

@@ -308,7 +305,7 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
  # --- NEW Primary Method: Crawl4AI ---
  async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
      """Primary Web Method: Fetches and extracts content using Crawl4AI."""
-     global _crawl4ai_primary_scrape_enabled
+     global _crawl4ai_primary_scrape_enabled, CRAWL4AI_BASE_DIR # Use the defined base dir
      if not _crawl4ai_primary_scrape_enabled:
          logger.warning("[Crawl4AI Primary] Called but library is unavailable.")
          return None
@@ -316,52 +313,45 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
      logger.info(f"[Crawl4AI Primary] Attempting fetch and extraction for: {url}")

      run_config = CrawlerRunConfig(
-         cache_mode=CacheMode.
+         cache_mode=CacheMode.ENABLED, # Use cache now that base_dir is set
          page_timeout=60000, # 60 sec timeout
          verbose=False,
-         #
-         # Consider adding 'wait_for' or 'scan_full_page=True' if needed for dynamic sites
+         # Consider 'wait_for' or 'scan_full_page=True' for dynamic sites
          # Consider 'remove_overlay_elements=True' for cookie banners/popups
      )
-     #
+     # *** FIX: Pass base_directory to AsyncWebCrawler ***
+     # BrowserConfig defaults are usually fine (headless chromium)
      # browser_config = BrowserConfig(headless=True, verbose=False)

      extracted_text: Optional[str] = None
      try:
-         # Use context manager
-         async with AsyncWebCrawler() as crawler:
+         # Use context manager and provide base_directory
+         async with AsyncWebCrawler(base_directory=CRAWL4AI_BASE_DIR) as crawler:
+             # Pass browser_config if needed: AsyncWebCrawler(config=browser_config, base_directory=CRAWL4AI_BASE_DIR)
              logger.debug(f"[Crawl4AI Primary] Calling crawler.arun for {url}")
              result: CrawlResult = await crawler.arun(url=url, config=run_config)
              logger.debug(f"[Crawl4AI Primary] arun completed. Success: {result.success}, Status: {result.status_code}")

              if result.success:
                  if result.markdown and isinstance(result.markdown, MarkdownGenerationResult):
-                     # Prefer fit_markdown if it exists and has content
                      if result.markdown.fit_markdown and isinstance(result.markdown.fit_markdown, str) and len(result.markdown.fit_markdown.strip()) > 30:
                          extracted_text = result.markdown.fit_markdown.strip()
                          logger.debug(f"[Crawl4AI Primary] Using fit_markdown for {url}")
-                     # Fallback to raw_markdown
                      elif result.markdown.raw_markdown and isinstance(result.markdown.raw_markdown, str):
                          extracted_text = result.markdown.raw_markdown.strip()
                          logger.debug(f"[Crawl4AI Primary] Using raw_markdown (fit_markdown unavailable/short) for {url}")
-                     else:
-                         logger.warning(f"[Crawl4AI Primary] Markdown object present but no usable text content for {url}")
-                 # Legacy/Alternative checks (less likely with v0.5+)
+                     else: logger.warning(f"[Crawl4AI Primary] Markdown object present but no usable text content for {url}")
                  elif result.markdown and isinstance(result.markdown, str):
                      extracted_text = result.markdown.strip()
                      logger.debug(f"[Crawl4AI Primary] Using direct result.markdown string for {url}")
-                 elif result.cleaned_html:
+                 elif result.cleaned_html:
                      logger.warning(f"[Crawl4AI Primary] No markdown found, parsing cleaned_html with BS4 for {url}")
                      try:
                          soup = BeautifulSoup(result.cleaned_html, DEFAULT_PARSER)
                          extracted_text = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
-                     except Exception as bs_err:
-
-                         extracted_text = None
-                 else:
-                     logger.warning(f"[Crawl4AI Primary] Crawl success but no markdown or cleaned_html found for {url}")
+                     except Exception as bs_err: logger.error(f"[Crawl4AI Primary] Error parsing cleaned_html with BS4 for {url}: {bs_err}"); extracted_text = None
+                 else: logger.warning(f"[Crawl4AI Primary] Crawl success but no markdown or cleaned_html found for {url}")

-             # Final length check
              if extracted_text and len(extracted_text) > 30:
                  logger.info(f"[Crawl4AI Primary] Success via Crawl4AI for {url}. Length: {len(extracted_text)}")
                  return extracted_text
@@ -377,19 +367,17 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
      except asyncio.TimeoutError:
          logger.error(f"[Crawl4AI Primary] Timeout error during crawl for {url}")
          return None
-     except ImportError as ie:
+     except ImportError as ie:
          if "playwright" in str(ie).lower():
              logger.critical(f"[Crawl4AI Primary] Playwright drivers likely missing! Run 'playwright install' in your environment. Error: {ie}")
-             _crawl4ai_primary_scrape_enabled = False
-         else:
-             logger.error(f"[Crawl4AI Primary] Unexpected ImportError during Crawl4AI execution for {url}: {ie}", exc_info=True)
+             _crawl4ai_primary_scrape_enabled = False
+         else: logger.error(f"[Crawl4AI Primary] Unexpected ImportError during Crawl4AI execution for {url}: {ie}", exc_info=True)
          return None
      except Exception as e:
          logger.error(f"[Crawl4AI Primary] Unexpected error during Crawl4AI execution for {url}: {e}", exc_info=True)
-         # Check if it's a playwright installation issue
          if "playwright" in str(e).lower() and ("install" in str(e).lower() or "executable" in str(e).lower()):
              logger.critical("[Crawl4AI Primary] Playwright drivers likely missing! Run 'playwright install' in your environment.")
-             _crawl4ai_primary_scrape_enabled = False
+             _crawl4ai_primary_scrape_enabled = False
          return None

@@ -402,19 +390,14 @@ async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
              logger.debug(f"[Web Scrape Fallback 1] Sending GET request to {url}")
              response = await client.get(url)
              logger.debug(f"[Web Scrape Fallback 1] Received response {response.status_code} from {url}")
-             response.raise_for_status()
+             response.raise_for_status()
              content_type = response.headers.get('content-type', '').lower()
-             if 'html' not in content_type and 'xml' not in content_type:
+             if 'html' not in content_type and 'xml' not in content_type:
                  logger.warning(f"[Web Scrape Fallback 1] Non-HTML/XML content type received from {url}: {content_type}")
-                 if 'text/plain' in content_type:
-                     logger.info(f"[Web Scrape Fallback 1] Content type is text/plain for {url}, attempting to read.")
-                     return response.text # Return plain text directly
-                 return None # Skip other non-html types
-             try:
-                 return response.text # Attempt to decode text, handle potential errors
-             except Exception as e:
-                 logger.error(f"[Web Scrape Fallback 1] Error decoding response text for {url}: {e}")
+                 if 'text/plain' in content_type: logger.info(f"[Web Scrape Fallback 1] Content type is text/plain for {url}, reading."); return response.text
                  return None
+             try: return response.text
+             except Exception as e: logger.error(f"[Web Scrape Fallback 1] Error decoding response text for {url}: {e}"); return None
      except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Fallback 1] HTTP error {e.response.status_code} fetching {url}: {e}")
      except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 1] Timeout error fetching {url}")
      except httpx.TooManyRedirects: logger.error(f"[Web Scrape Fallback 1] Too many redirects fetching {url}")
@@ -427,51 +410,28 @@ async def get_website_content_direct_bs4(url: str) -> Optional[str]:
      if not url: logger.error("[Web Scrape Fallback 1] No URL provided"); return None
      logger.info(f"[Web Scrape Fallback 1] Attempting direct fetch and parse for: {url}")
      html_content = await fetch_url_content_for_scrape(url)
-     if not html_content:
-         logger.warning(f"[Web Scrape Fallback 1] Direct fetch failed for {url}.")
-         return None
+     if not html_content: logger.warning(f"[Web Scrape Fallback 1] Direct fetch failed for {url}."); return None
      try:
-         # --- Parsing logic (run in thread to avoid blocking) ---
          def parse_html(content: str) -> Optional[str]:
              try:
                  soup = BeautifulSoup(content, DEFAULT_PARSER)
-                 for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]):
-
-                 main_content = soup.find('main') or \
-                     soup.find('article') or \
-                     soup.find(role='main') or \
-                     soup.find(id=re.compile(r'content|main|body', re.I)) or \
-                     soup.find(class_=re.compile(r'content|main|body|article|post', re.I))
+                 for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]): element.extract()
+                 main_content = soup.find('main') or soup.find('article') or soup.find(role='main') or soup.find(id=re.compile(r'content|main|body', re.I)) or soup.find(class_=re.compile(r'content|main|body|article|post', re.I))
                  target_element = main_content if main_content else soup.body
                  if not target_element:
-                     logger.warning(f"[Web Scrape Fallback 1 Parse] Could not find body or main content
+                     logger.warning(f"[Web Scrape Fallback 1 Parse] Could not find body or main content for {url}")
                      text_from_root = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
-                     if text_from_root and len(text_from_root) > 50:
-                         logger.warning(f"[Web Scrape Fallback 1 Parse] Using text from root as fallback for {url}.")
-                         return text_from_root
+                     if text_from_root and len(text_from_root) > 50: logger.warning(f"[Web Scrape Fallback 1 Parse] Using text from root as fallback for {url}."); return text_from_root
                      return None
                  lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
                  text = " ".join(lines)
-                 if not text or len(text) < 50:
-                     logger.warning(f"[Web Scrape Fallback 1 Parse] Extracted text seems too short or empty after cleaning for {url}. Length: {len(text)}")
-                     return None
+                 if not text or len(text) < 50: logger.warning(f"[Web Scrape Fallback 1 Parse] Extracted text too short or empty for {url}. Length: {len(text)}"); return None
                  return text
-             except Exception as parse_e:
-                 logger.error(f"[Web Scrape Fallback 1 Parse] Error during BeautifulSoup parsing for {url}: {parse_e}", exc_info=False) # Keep log cleaner
-                 return None
-         # --- End parsing logic ---
-
+             except Exception as parse_e: logger.error(f"[Web Scrape Fallback 1 Parse] BS4 parsing error for {url}: {parse_e}", exc_info=False); return None
          text_content = await asyncio.to_thread(parse_html, html_content)
-
-
-
-         return text_content
-         else:
-             logger.warning(f"[Web Scrape Fallback 1] Parsing failed or yielded no content for {url}.")
-             return None
-     except Exception as e:
-         logger.error(f"[Web Scrape Fallback 1] Unexpected error during parsing process for {url}: {e}", exc_info=True)
-         return None
+         if text_content: logger.info(f"[Web Scrape Fallback 1] Success via direct fetch & parse for {url} (len: {len(text_content)})"); return text_content
+         else: logger.warning(f"[Web Scrape Fallback 1] Parsing failed or yielded no content for {url}."); return None
+     except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during parsing for {url}: {e}", exc_info=True); return None

  # --- Fallback 2: urltotext.com API ---
  async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
@@ -510,7 +470,7 @@ async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
      if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
      logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using Scraper's Proxy Parser API")
      api_host = "scrapers-proxy2.p.rapidapi.com"
-     encoded_url = urllib.parse.quote(url, safe='')
+     encoded_url = urllib.parse.quote(url, safe='')
      api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true"
      headers = { "x-rapidapi-host": api_host, "x-rapidapi-key": api_key, "accept-encoding": "gzip" }
      try:
@@ -524,16 +484,12 @@ async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
                  content = data.get("content"); title = data.get("title"); extracted_text = ""
                  if title and isinstance(title, str): extracted_text += title.strip() + ". "
                  if content and isinstance(content, str): extracted_text += content.strip()
-                 if extracted_text and len(extracted_text) > 30:
-
-                     return extracted_text
-                 else:
-                     logger.warning(f"[Web Scrape Fallback 3] Scraper's Proxy API success but content/title seems empty or too short for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}")
-                     return None
+                 if extracted_text and len(extracted_text) > 30: logger.info(f"[Web Scrape Fallback 3] Success via Scraper's Proxy API for {url}. Len: {len(extracted_text)}"); return extracted_text
+                 else: logger.warning(f"[Web Scrape Fallback 3] Scraper's Proxy API success but content/title too short/empty for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}"); return None
              except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 3] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
              except Exception as e: logger.error(f"[Web Scrape Fallback 3] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True); return None
          elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 3] Auth error (401) with {api_host}. Check RapidAPI key."); return None
-         elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check
+         elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check subscription/limits."); return None
          elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 3] Rate Limit (429) from {api_host}."); return None
          elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 3] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
          else: logger.error(f"[Web Scrape Fallback 3] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
@@ -568,11 +524,11 @@ async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]:
                  return None
              except json.JSONDecodeError:
                  raw_text = response.text
-                 if raw_text and len(raw_text) > 30: logger.warning(f"[Web Scrape Fallback 4] Failed JSON decode for AI Web Scraper, but found raw text
+                 if raw_text and len(raw_text) > 30: logger.warning(f"[Web Scrape Fallback 4] Failed JSON decode for AI Web Scraper, but found raw text. Status:{response.status_code}. Using raw text. Len: {len(raw_text)}"); return raw_text.strip()
                  else: logger.error(f"[Web Scrape Fallback 4] Failed JSON decode AI Web Scraper API for {url}. Status:{response.status_code}. Resp:{raw_text[:500]}"); return None
              except Exception as e: logger.error(f"[Web Scrape Fallback 4] Error processing AI Web Scraper API success response for {url}: {e}", exc_info=True); return None
          elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 4] Auth error (401) with {api_host}. Check RapidAPI key."); return None
-         elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 4] Forbidden (403) from {api_host}. Check
+         elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 4] Forbidden (403) from {api_host}. Check subscription/limits."); return None
          elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 4] Rate Limit (429) from {api_host}."); return None
          elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 4] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
          else: logger.error(f"[Web Scrape Fallback 4] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
@@ -608,7 +564,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
              elif "content" in item and isinstance(item["content"], str): content = item["content"]
              elif "markdown" in item and isinstance(item["markdown"], str): content = item["markdown"]
              elif "html" in item and isinstance(item["html"], str):
-                 logger.warning(f"[{actor_name} - FB{fallback_num}] No 'text' or 'markdown' found,
+                 logger.warning(f"[{actor_name} - FB{fallback_num}] No 'text' or 'markdown' found, parsing 'html'.")
                  soup = BeautifulSoup(item["html"], DEFAULT_PARSER)
                  content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())

@@ -685,7 +641,7 @@ async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Opt
          "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
          "Here is the text to summarise:")

-     MAX_INPUT_LENGTH_GEMINI = 900000
+     MAX_INPUT_LENGTH_GEMINI = 900000
      if len(text) > MAX_INPUT_LENGTH_GEMINI:
          logger.warning(f"[Gemini Primary] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating.");
          text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)"
@@ -770,7 +726,7 @@ async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str],
          "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
          "Here is the text to summarise:")

-     MAX_INPUT_LENGTH_OR = 100000
+     MAX_INPUT_LENGTH_OR = 100000
      if len(text) > MAX_INPUT_LENGTH_OR:
          logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}). Truncating.");
          text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
@@ -862,7 +818,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:

      try:
          # --- 1. Initial User Feedback ---
-         processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (using primary method... might take a moment)..."
+         processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (using primary method... might take a moment)..."
          if status_message_id:
              try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
              except Exception as e: logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new."); status_message_id = None
@@ -879,33 +835,30 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
          is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}")

          if is_youtube:
-             # --- YouTube Transcript Logic
+             # --- YouTube Transcript Logic ---
              video_id = extract_youtube_id(url)
-             if video_id: content = await get_youtube_transcript(video_id, url)
+             if video_id: content = await get_youtube_transcript(video_id, url)
              else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
              if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
          else:
              # --- Website Scraping Logic (NEW Order: Crawl4AI -> Direct+BS4 -> APIs -> Apify) ---
-             global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN
-             global _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists
-             global _crawl4ai_primary_scrape_enabled # Ensure global access
+             global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN, _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists, _crawl4ai_primary_scrape_enabled

-             # Method 0: Primary Scrape (Crawl4AI
+             # Method 0: Primary Scrape (Crawl4AI)
              logger.info(f"[Task {task_id}] Trying Web Scrape Method 0 (Crawl4AI)...")
              if _crawl4ai_primary_scrape_enabled:
                  content = await get_website_content_via_crawl4ai(url)
                  if not content: logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) failed.")
-             else:
-                 logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) skipped - library unavailable.")
+             else: logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) skipped - library/driver unavailable.")

-             # Method 1: Fallback 1 (Direct Fetch + BS4
+             # Method 1: Fallback 1 (Direct Fetch + BS4)
              if not content:
                  logger.warning(f"[Task {task_id}] Method 0 failed. Trying Method 1 (Direct Fetch + BS4)...")
                  await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
                  content = await get_website_content_direct_bs4(url)
                  if not content: logger.warning(f"[Task {task_id}] Method 1 (Direct Fetch + BS4) failed.")

-             # Method 2: Fallback 2 (urltotext.com
+             # Method 2: Fallback 2 (urltotext.com)
              if not content:
                  logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 2 (urltotext.com)...")
                  if _urltotext_key_exists:
@@ -914,7 +867,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
                      if not content: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) failed.")
                  else: logger.warning("[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.")

-             # Method 3: Fallback 3 (Scraper's Proxy via RapidAPI
+             # Method 3: Fallback 3 (Scraper's Proxy via RapidAPI)
              if not content:
                  logger.warning(f"[Task {task_id}] Method 2 failed. Trying Method 3 (Scraper's Proxy)...")
                  if _rapidapi_key_exists:
@@ -923,7 +876,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
                      if not content: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) failed.")
                  else: logger.warning("[Task {task_id}] Method 3 (Scraper's Proxy) RapidAPI key unavailable. Skipping.")

-             # Method 4: Fallback 4 (AI Web Scraper via RapidAPI
+             # Method 4: Fallback 4 (AI Web Scraper via RapidAPI)
              if not content:
                  logger.warning(f"[Task {task_id}] Method 3 failed. Trying Method 4 (AI Web Scraper)...")
                  if _rapidapi_key_exists:
@@ -932,7 +885,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
                      if not content: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) failed.")
                  else: logger.warning("[Task {task_id}] Method 4 (AI Web Scraper) RapidAPI key unavailable. Skipping.")

-             # Method 5: Fallback 5 (Apify Website Content Crawler
+             # Method 5: Fallback 5 (Apify Website Content Crawler)
              if not content:
                  logger.warning(f"[Task {task_id}] Method 4 failed. Trying Method 5 (Apify Crawler)...")
                  if _apify_token_exists:
@@ -941,7 +894,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
                      if not content: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) failed.")
                  else: logger.warning("[Task {task_id}] Method 5 (Apify Crawler) APIFY_API_TOKEN unavailable. Skipping.")

-             # Method 6: Fallback 6 (Apify Text Scraper Free
+             # Method 6: Fallback 6 (Apify Text Scraper Free)
              if not content:
                  logger.warning(f"[Task {task_id}] Method 5 failed. Trying Method 6 (Apify Text Scraper)...")
                  if _apify_token_exists:
@@ -950,14 +903,13 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
                      if not content: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) failed.")
                  else: logger.warning("[Task {task_id}] Method 6 (Apify Text Scraper) APIFY_API_TOKEN unavailable. Skipping.")

-             # Final check
              if not content and not user_feedback_message:
-                 user_feedback_message = "Sorry, I couldn't fetch readable content from that website using multiple methods (blocked/dynamic content/empty?). Even the advanced crawler failed."

          # --- 3. Summarization ---
          if content:
-             logger.info(f"[Task {task_id}] Content fetched
-             # Update status message before summarization
              try:
                  status_update_msg_id = message_to_delete_later_id or status_message_id
                  if status_update_msg_id:
@@ -965,24 +917,24 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
              except Exception as edit_e: logger.warning(f"[Task {task_id}] Failed to edit status message before summary: {edit_e}")

              await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
-             final_summary = await generate_summary(content, summary_type)

              if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
-                 user_feedback_message = final_summary
                  logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
              else:
                  max_length = 4096
                  summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
                  await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} )
                  for part in summary_parts[1:]:
-                     await asyncio.sleep(0.5)
                      await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} )
                  success = True
                  logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
-                 user_feedback_message = None

          # --- 4. Handle Final Failure Feedback ---
-         if user_feedback_message:
              logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}")
              await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} )

@@ -1009,7 +961,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
          logger.info(f"[Task {task_id}] Task completed. Success: {success}")


- # --- Telegram Handlers
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
      user = update.effective_user; mention = user.mention_html()
      if not user or not update.message: return
@@ -1025,7 +977,7 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
          "2. I'll ask how you want it summarised (paragraph or points).\n"
          "3. Click the button for your choice.\n"
          "4. Wait while I fetch the content and generate the summary!\n\n"
-         "⚙️ I
          "**Commands:**\n"
          "`/start` - Display the welcome message\n"
          "`/help` - Show this help message" )
@@ -1036,9 +988,7 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
      url = update.message.text.strip(); user = update.effective_user
      if not user: return
      url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE)
-     if not url_pattern.search(url):
-         logger.debug(f"Ignoring non-URL message from {user.id}: {url[:100]}")
-         return

      match = url_pattern.search(url)
      if match:
@@ -1051,15 +1001,12 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
          try:
              await update.message.reply_text(
                  f"Okay, I see this link:\n`{extracted_url}`\n\nHow would you like it summarised?",
-                 reply_markup=reply_markup,
-                 disable_web_page_preview=True,
-                 parse_mode=ParseMode.MARKDOWN
-             )
          except BadRequest as e:
              if "chat not found" in str(e).lower() or "bot was blocked by the user" in str(e).lower(): logger.warning(f"Could not reply to user {user.id} (chat not found or blocked).")
              else: logger.error(f"BadRequest replying to URL message from {user.id}: {e}")
          except Exception as e: logger.error(f"Error replying to URL message from {user.id}: {e}", exc_info=True)
-     else: logger.debug(f"Ignoring message from {user.id}


  async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
@@ -1077,13 +1024,11 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:

      if not url:
          logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Old button?")
-         try:
-             await query.edit_message_text(text="Sorry, I couldn't find the original URL for this request (it might be too old). Please send the link again.")
          except BadRequest as e:
              if "message is not modified" in str(e).lower() or "message to edit not found" in str(e).lower(): pass
              else: logger.warning(f"Failed to edit 'URL not found' message {message_id_to_edit} for user {user.id}: {e}")
-         except Exception as e:
-             logger.warning(f"Error editing 'URL not found' message {message_id_to_edit} for user {user.id}: {e}")
          return

      context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}")
@@ -1091,19 +1036,13 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
      global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
      if not TELEGRAM_TOKEN:
          logger.critical("TELEGRAM_TOKEN missing in callback!")
-
-
-             await query.edit_message_text(text="❌ Bot config error (Token Missing).")
-         except Exception:
-             pass # Ignore if editing fails
          return
      if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
          logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid in callback!")
-
-
-             await query.edit_message_text(text="❌ AI configuration error: No summarization models available.")
-         except Exception:
-             pass # Ignore if editing fails
          return
      elif not _gemini_primary_enabled: logger.warning("Primary AI (Gemini) unavailable, relying on fallback.")
      elif not _openrouter_fallback_enabled: logger.warning("Fallback AI (OpenRouter) unavailable, relying on primary.")
@@ -1119,7 +1058,7 @@ async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
      logger.error("Exception while handling an update:", exc_info=context.error)


- # --- Application Setup & Web Framework

  async def setup_bot_config() -> Application:
      logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
@@ -1168,14 +1107,14 @@ async def lifespan(app: Starlette):
                  await ptb_app.bot.set_webhook(**set_webhook_args)
                  webhook_info = await ptb_app.bot.get_webhook_info()
                  if webhook_info.url == full_webhook_url: logger.info(f"Webhook successfully set: URL='{webhook_info.url}', Secret={bool(WEBHOOK_SECRET)}")
-                 else: logger.error(f"Webhook URL mismatch
                  await ptb_app.start()
                  logger.info("PTB Application started in webhook mode.")
              except Exception as e: logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True); raise RuntimeError(f"Failed to set webhook: {e}") from e
          else: logger.critical("Could not construct webhook URL."); raise RuntimeError("Webhook URL could not be determined.")
      else: logger.critical("SPACE_HOST env var not found."); raise RuntimeError("SPACE_HOST env var missing.")

-     logger.info("ASGI Lifespan: Startup complete."); yield

  except Exception as startup_err:
      logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
@@ -1200,7 +1139,6 @@ async def lifespan(app: Starlette):

  async def health_check(request: Request) -> PlainTextResponse:
      """Simple health check endpoint."""
-     # ADDED _crawl4ai_primary_scrape_enabled
      global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled
      global _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY, _crawl4ai_primary_scrape_enabled
      bot_status = "Not Initialized"; bot_username = "N/A"
@@ -1221,7 +1159,6 @@ async def health_check(request: Request) -> PlainTextResponse:
          except Exception as e: bot_status = f"Error checking: {type(e).__name__}"; logger.warning(f"Health check: Error getting bot info: {e}")
      else: bot_status = "Not Initialized"; bot_username = "N/A"

-     # Updated health check output
      return PlainTextResponse(
          f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
          f"--- Summarization ---\n"
@@ -1232,7 +1169,7 @@ async def health_check(request: Request) -> PlainTextResponse:
          f"Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
          f"Fallback 2 (Apify Actor): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
          f"--- Website Scraping ---\n"
-         f"Primary (Crawl4AI): {'Enabled' if _crawl4ai_primary_scrape_enabled else 'DISABLED - Library/Driver Missing?'}\n"
          f"Fallback 1 (Direct+BS4): Enabled\n"
          f"Fallback 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
          f"Fallback 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"
@@ -1259,20 +1196,30 @@ async def telegram_webhook(request: Request) -> Response:
      except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
      except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # OK to prevent TG retries

- # --- Starlette App Definition
  app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )
  logger.info("Starlette ASGI application created with health check and webhook routes.")

- # --- Development Server
  if __name__ == '__main__':
      import uvicorn
      logger.warning("Running in development mode using Uvicorn directly - NOT for production!")
-     # One-time check/reminder for Playwright install during local dev
      try:
          from playwright.async_api import async_playwright
          logger.info("Playwright library found.")
-         #
-         #
      except ImportError:
          logger.critical("Playwright library not found. Crawl4AI will likely fail.")
          logger.critical("RUN 'pip install playwright && playwright install --with-deps' in your terminal.")
|
|
1 |
+
# main.py (Corrected PermissionError by setting base_directory for Crawl4AI)
|
2 |
import os
|
3 |
import re
|
4 |
import logging
|
|
|
90 |
# --- Global variable for PTB app ---
|
91 |
ptb_app: Optional[Application] = None
|
92 |
|
93 |
+
# --- Define a writable base directory for Crawl4AI ---
|
94 |
+
# Use /app which is the WORKDIR in the Dockerfile
|
95 |
+
CRAWL4AI_BASE_DIR = "/app/.crawl4ai_cache"
|
96 |
+
if _crawl4ai_available:
|
97 |
+
try:
|
98 |
+
os.makedirs(CRAWL4AI_BASE_DIR, exist_ok=True)
|
99 |
+
logger.info(f"Ensured Crawl4AI base directory exists: {CRAWL4AI_BASE_DIR}")
|
100 |
+
except Exception as e:
|
101 |
+
logger.error(f"Could not create Crawl4AI base directory {CRAWL4AI_BASE_DIR}: {e}. Crawl4AI caching might fail.")
|
102 |
+
# Proceeding, but caching/DB features of Crawl4AI might not work.
|
103 |
+
|
104 |
+
|
105 |
# --- Environment Variable Loading & Configuration ---
|
106 |
logger.info("Attempting to load secrets and configuration...")
|
107 |
def get_secret(secret_name):
|
|
|
231 |
|
232 |
sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
|
233 |
params = {"token": api_token}
|
234 |
+
payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5, "channelHandleBoolean": False, "channelNameBoolean": False, "datePublishedBoolean": False, "relativeDateTextBoolean": False, }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
headers = {"Content-Type": "application/json"}
|
236 |
|
237 |
try:
|
238 |
+
async with httpx.AsyncClient(timeout=120.0) as client:
|
239 |
logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}")
|
240 |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
|
241 |
logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}")
|
|
|
244 |
try:
|
245 |
results = response.json()
|
246 |
if isinstance(results, list) and len(results) > 0:
|
247 |
+
item = results[0]; content = None
|
|
|
|
|
248 |
if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
|
249 |
elif "text" in item and isinstance(item["text"], str): content = item["text"]
|
250 |
elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"]
|
251 |
+
elif "captions" in item and isinstance(item["captions"], list):
|
252 |
if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
|
253 |
elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])
|
254 |
|
|
|
267 |
except Exception as e: logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True); return None
|
268 |
|
269 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
|
270 |
+
global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
|
271 |
if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
|
272 |
logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
|
273 |
transcript_text = None
|
|
|
274 |
logger.info("[Primary YT] Attempting youtube-transcript-api...")
|
275 |
try:
|
276 |
transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
|
|
|
281 |
except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
|
282 |
except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None
|
283 |
|
|
|
284 |
if transcript_text is None:
|
285 |
logger.info("[Fallback YT 1] Trying Supadata API...")
|
286 |
if SUPADATA_API_KEY:
|
|
|
289 |
else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
|
290 |
else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
|
291 |
|
|
|
292 |
if transcript_text is None:
|
293 |
logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...")
|
294 |
+
if _apify_token_exists:
|
295 |
transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
|
296 |
if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text
|
297 |
else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.")
|
298 |
else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
|
299 |
|
|
|
300 |
if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
|
301 |
return transcript_text
|
302 |
|
|
|
305 |
# --- NEW Primary Method: Crawl4AI ---
|
306 |
async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
|
307 |
"""Primary Web Method: Fetches and extracts content using Crawl4AI."""
|
308 |
+
global _crawl4ai_primary_scrape_enabled, CRAWL4AI_BASE_DIR # Use the defined base dir
|
309 |
if not _crawl4ai_primary_scrape_enabled:
|
310 |
logger.warning("[Crawl4AI Primary] Called but library is unavailable.")
|
311 |
return None
|
|
|
313 |
logger.info(f"[Crawl4AI Primary] Attempting fetch and extraction for: {url}")
|
314 |
|
315 |
run_config = CrawlerRunConfig(
|
316 |
+
cache_mode=CacheMode.ENABLED, # Use cache now that base_dir is set
|
317 |
page_timeout=60000, # 60 sec timeout
|
318 |
verbose=False,
|
319 |
+
# Consider 'wait_for' or 'scan_full_page=True' for dynamic sites
|
|
|
320 |
# Consider 'remove_overlay_elements=True' for cookie banners/popups
|
321 |
)
|
322 |
+
# *** FIX: Pass base_directory to AsyncWebCrawler ***
|
323 |
+
# BrowserConfig defaults are usually fine (headless chromium)
|
324 |
# browser_config = BrowserConfig(headless=True, verbose=False)
|
325 |
|
326 |
extracted_text: Optional[str] = None
|
327 |
try:
|
328 |
+
# Use context manager and provide base_directory
|
329 |
+
async with AsyncWebCrawler(base_directory=CRAWL4AI_BASE_DIR) as crawler:
|
330 |
+
# Pass browser_config if needed: AsyncWebCrawler(config=browser_config, base_directory=CRAWL4AI_BASE_DIR)
|
331 |
logger.debug(f"[Crawl4AI Primary] Calling crawler.arun for {url}")
|
332 |
result: CrawlResult = await crawler.arun(url=url, config=run_config)
|
333 |
logger.debug(f"[Crawl4AI Primary] arun completed. Success: {result.success}, Status: {result.status_code}")
|
334 |
|
335 |
if result.success:
|
336 |
if result.markdown and isinstance(result.markdown, MarkdownGenerationResult):
|
|
|
337 |
if result.markdown.fit_markdown and isinstance(result.markdown.fit_markdown, str) and len(result.markdown.fit_markdown.strip()) > 30:
|
338 |
extracted_text = result.markdown.fit_markdown.strip()
|
339 |
logger.debug(f"[Crawl4AI Primary] Using fit_markdown for {url}")
|
|
|
340 |
elif result.markdown.raw_markdown and isinstance(result.markdown.raw_markdown, str):
|
341 |
extracted_text = result.markdown.raw_markdown.strip()
|
342 |
logger.debug(f"[Crawl4AI Primary] Using raw_markdown (fit_markdown unavailable/short) for {url}")
|
343 |
+
else: logger.warning(f"[Crawl4AI Primary] Markdown object present but no usable text content for {url}")
|
|
|
|
|
344 |
elif result.markdown and isinstance(result.markdown, str):
|
345 |
extracted_text = result.markdown.strip()
|
346 |
logger.debug(f"[Crawl4AI Primary] Using direct result.markdown string for {url}")
|
347 |
+
elif result.cleaned_html:
|
348 |
logger.warning(f"[Crawl4AI Primary] No markdown found, parsing cleaned_html with BS4 for {url}")
|
349 |
try:
|
350 |
soup = BeautifulSoup(result.cleaned_html, DEFAULT_PARSER)
|
351 |
extracted_text = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
|
352 |
+
except Exception as bs_err: logger.error(f"[Crawl4AI Primary] Error parsing cleaned_html with BS4 for {url}: {bs_err}"); extracted_text = None
|
353 |
+
else: logger.warning(f"[Crawl4AI Primary] Crawl success but no markdown or cleaned_html found for {url}")
|
|
|
|
|
|
|
354 |
|
|
|
355 |
if extracted_text and len(extracted_text) > 30:
|
356 |
logger.info(f"[Crawl4AI Primary] Success via Crawl4AI for {url}. Length: {len(extracted_text)}")
|
357 |
return extracted_text
|
|
|
367 |
except asyncio.TimeoutError:
|
368 |
logger.error(f"[Crawl4AI Primary] Timeout error during crawl for {url}")
|
369 |
return None
|
370 |
+
except ImportError as ie:
|
371 |
if "playwright" in str(ie).lower():
|
372 |
logger.critical(f"[Crawl4AI Primary] Playwright drivers likely missing! Run 'playwright install' in your environment. Error: {ie}")
|
373 |
+
_crawl4ai_primary_scrape_enabled = False
|
374 |
+
else: logger.error(f"[Crawl4AI Primary] Unexpected ImportError during Crawl4AI execution for {url}: {ie}", exc_info=True)
|
|
|
375 |
return None
|
376 |
except Exception as e:
|
377 |
logger.error(f"[Crawl4AI Primary] Unexpected error during Crawl4AI execution for {url}: {e}", exc_info=True)
|
|
|
378 |
if "playwright" in str(e).lower() and ("install" in str(e).lower() or "executable" in str(e).lower()):
|
379 |
logger.critical("[Crawl4AI Primary] Playwright drivers likely missing! Run 'playwright install' in your environment.")
|
380 |
+
_crawl4ai_primary_scrape_enabled = False
|
381 |
return None
|
382 |
|
383 |
|
|
|
390 |
logger.debug(f"[Web Scrape Fallback 1] Sending GET request to {url}")
|
391 |
response = await client.get(url)
|
392 |
logger.debug(f"[Web Scrape Fallback 1] Received response {response.status_code} from {url}")
|
393 |
+
response.raise_for_status()
|
394 |
content_type = response.headers.get('content-type', '').lower()
|
395 |
+
if 'html' not in content_type and 'xml' not in content_type:
|
396 |
logger.warning(f"[Web Scrape Fallback 1] Non-HTML/XML content type received from {url}: {content_type}")
|
397 |
+
if 'text/plain' in content_type: logger.info(f"[Web Scrape Fallback 1] Content type is text/plain for {url}, reading."); return response.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
398 |
return None
|
399 |
+
try: return response.text
|
400 |
+
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Error decoding response text for {url}: {e}"); return None
|
401 |
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Fallback 1] HTTP error {e.response.status_code} fetching {url}: {e}")
|
402 |
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 1] Timeout error fetching {url}")
|
403 |
except httpx.TooManyRedirects: logger.error(f"[Web Scrape Fallback 1] Too many redirects fetching {url}")
|
|
|
410 |
if not url: logger.error("[Web Scrape Fallback 1] No URL provided"); return None
|
411 |
logger.info(f"[Web Scrape Fallback 1] Attempting direct fetch and parse for: {url}")
|
412 |
html_content = await fetch_url_content_for_scrape(url)
|
413 |
+
if not html_content: logger.warning(f"[Web Scrape Fallback 1] Direct fetch failed for {url}."); return None
|
|
|
|
|
414 |
try:
|
|
|
415 |
def parse_html(content: str) -> Optional[str]:
|
416 |
try:
|
417 |
soup = BeautifulSoup(content, DEFAULT_PARSER)
|
418 |
+
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]): element.extract()
|
419 |
+
main_content = soup.find('main') or soup.find('article') or soup.find(role='main') or soup.find(id=re.compile(r'content|main|body', re.I)) or soup.find(class_=re.compile(r'content|main|body|article|post', re.I))
|
|
|
|
|
|
|
|
|
|
|
420 |
target_element = main_content if main_content else soup.body
|
421 |
if not target_element:
|
422 |
+
logger.warning(f"[Web Scrape Fallback 1 Parse] Could not find body or main content for {url}")
|
423 |
text_from_root = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
|
424 |
+
if text_from_root and len(text_from_root) > 50: logger.warning(f"[Web Scrape Fallback 1 Parse] Using text from root as fallback for {url}."); return text_from_root
|
|
|
|
|
425 |
return None
|
426 |
lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
|
427 |
text = " ".join(lines)
|
428 |
+
if not text or len(text) < 50: logger.warning(f"[Web Scrape Fallback 1 Parse] Extracted text too short or empty for {url}. Length: {len(text)}"); return None
|
|
|
|
|
429 |
return text
|
430 |
+
except Exception as parse_e: logger.error(f"[Web Scrape Fallback 1 Parse] BS4 parsing error for {url}: {parse_e}", exc_info=False); return None
|
|
|
|
|
|
|
|
|
431 |
text_content = await asyncio.to_thread(parse_html, html_content)
|
432 |
+
if text_content: logger.info(f"[Web Scrape Fallback 1] Success via direct fetch & parse for {url} (len: {len(text_content)})"); return text_content
|
433 |
+
else: logger.warning(f"[Web Scrape Fallback 1] Parsing failed or yielded no content for {url}."); return None
|
434 |
+
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during parsing for {url}: {e}", exc_info=True); return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
435 |
|
436 |
# --- Fallback 2: urltotext.com API ---
|
437 |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
|
|
470 |
if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
|
471 |
logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using Scraper's Proxy Parser API")
|
472 |
api_host = "scrapers-proxy2.p.rapidapi.com"
|
473 |
+
encoded_url = urllib.parse.quote(url, safe='')
|
474 |
api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true"
|
475 |
headers = { "x-rapidapi-host": api_host, "x-rapidapi-key": api_key, "accept-encoding": "gzip" }
|
476 |
try:
|
|
|
484 |
content = data.get("content"); title = data.get("title"); extracted_text = ""
|
485 |
if title and isinstance(title, str): extracted_text += title.strip() + ". "
|
486 |
if content and isinstance(content, str): extracted_text += content.strip()
|
487 |
+
if extracted_text and len(extracted_text) > 30: logger.info(f"[Web Scrape Fallback 3] Success via Scraper's Proxy API for {url}. Len: {len(extracted_text)}"); return extracted_text
|
488 |
+
else: logger.warning(f"[Web Scrape Fallback 3] Scraper's Proxy API success but content/title too short/empty for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}"); return None
|
|
|
|
|
|
|
|
|
489 |
except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 3] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
|
490 |
except Exception as e: logger.error(f"[Web Scrape Fallback 3] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True); return None
|
491 |
elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 3] Auth error (401) with {api_host}. Check RapidAPI key."); return None
|
492 |
+
elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check subscription/limits."); return None
|
493 |
elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 3] Rate Limit (429) from {api_host}."); return None
|
494 |
elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 3] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
|
495 |
else: logger.error(f"[Web Scrape Fallback 3] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
|
|
|
524 |
return None
|
525 |
except json.JSONDecodeError:
|
526 |
raw_text = response.text
|
527 |
+
if raw_text and len(raw_text) > 30: logger.warning(f"[Web Scrape Fallback 4] Failed JSON decode for AI Web Scraper, but found raw text. Status:{response.status_code}. Using raw text. Len: {len(raw_text)}"); return raw_text.strip()
|
528 |
else: logger.error(f"[Web Scrape Fallback 4] Failed JSON decode AI Web Scraper API for {url}. Status:{response.status_code}. Resp:{raw_text[:500]}"); return None
|
529 |
except Exception as e: logger.error(f"[Web Scrape Fallback 4] Error processing AI Web Scraper API success response for {url}: {e}", exc_info=True); return None
|
530 |
elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 4] Auth error (401) with {api_host}. Check RapidAPI key."); return None
|
531 |
+
elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 4] Forbidden (403) from {api_host}. Check subscription/limits."); return None
|
532 |
elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 4] Rate Limit (429) from {api_host}."); return None
|
533 |
elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 4] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
|
534 |
else: logger.error(f"[Web Scrape Fallback 4] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
|
|
|
564 |
elif "content" in item and isinstance(item["content"], str): content = item["content"]
|
565 |
elif "markdown" in item and isinstance(item["markdown"], str): content = item["markdown"]
|
566 |
elif "html" in item and isinstance(item["html"], str):
|
567 |
+
logger.warning(f"[{actor_name} - FB{fallback_num}] No 'text' or 'markdown' found, parsing 'html'.")
|
568 |
soup = BeautifulSoup(item["html"], DEFAULT_PARSER)
|
569 |
content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
|
570 |
|
|
|
641 |
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
|
642 |
"Here is the text to summarise:")
|
643 |
|
644 |
+
MAX_INPUT_LENGTH_GEMINI = 900000
|
645 |
if len(text) > MAX_INPUT_LENGTH_GEMINI:
|
646 |
logger.warning(f"[Gemini Primary] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating.");
|
647 |
text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)"
|
|
|
726 |
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
|
727 |
"Here is the text to summarise:")
|
728 |
|
729 |
+
MAX_INPUT_LENGTH_OR = 100000
|
730 |
if len(text) > MAX_INPUT_LENGTH_OR:
|
731 |
logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}). Truncating.");
|
732 |
text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
|
|
|
818 |
|
819 |
try:
|
820 |
# --- 1. Initial User Feedback ---
|
821 |
+
processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (using primary method... might take a moment)..."
|
822 |
if status_message_id:
|
823 |
try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
|
824 |
except Exception as e: logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new."); status_message_id = None
    ...

     is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}")

     if is_youtube:
+        # --- YouTube Transcript Logic ---
         video_id = extract_youtube_id(url)
+        if video_id: content = await get_youtube_transcript(video_id, url)
         else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
         if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
     else:
         # --- Website Scraping Logic (NEW Order: Crawl4AI -> Direct+BS4 -> APIs -> Apify) ---
+        global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN, _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists, _crawl4ai_primary_scrape_enabled

+        # Method 0: Primary Scrape (Crawl4AI)
         logger.info(f"[Task {task_id}] Trying Web Scrape Method 0 (Crawl4AI)...")
         if _crawl4ai_primary_scrape_enabled:
             content = await get_website_content_via_crawl4ai(url)
             if not content: logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) failed.")
+        else: logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) skipped - library/driver unavailable.")

+        # Method 1: Fallback 1 (Direct Fetch + BS4)
         if not content:
             logger.warning(f"[Task {task_id}] Method 0 failed. Trying Method 1 (Direct Fetch + BS4)...")
             await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
             content = await get_website_content_direct_bs4(url)
             if not content: logger.warning(f"[Task {task_id}] Method 1 (Direct Fetch + BS4) failed.")

+        # Method 2: Fallback 2 (urltotext.com)
         if not content:
             logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 2 (urltotext.com)...")
             if _urltotext_key_exists:
                ...
                 if not content: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) failed.")
             else: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.")

+        # Method 3: Fallback 3 (Scraper's Proxy via RapidAPI)
         if not content:
             logger.warning(f"[Task {task_id}] Method 2 failed. Trying Method 3 (Scraper's Proxy)...")
             if _rapidapi_key_exists:
                ...
                 if not content: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) failed.")
             else: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) RapidAPI key unavailable. Skipping.")

+        # Method 4: Fallback 4 (AI Web Scraper via RapidAPI)
         if not content:
             logger.warning(f"[Task {task_id}] Method 3 failed. Trying Method 4 (AI Web Scraper)...")
             if _rapidapi_key_exists:
                ...
                 if not content: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) failed.")
             else: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) RapidAPI key unavailable. Skipping.")

+        # Method 5: Fallback 5 (Apify Website Content Crawler)
         if not content:
             logger.warning(f"[Task {task_id}] Method 4 failed. Trying Method 5 (Apify Crawler)...")
             if _apify_token_exists:
                ...
                 if not content: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) failed.")
             else: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) APIFY_API_TOKEN unavailable. Skipping.")

+        # Method 6: Fallback 6 (Apify Text Scraper Free)
         if not content:
             logger.warning(f"[Task {task_id}] Method 5 failed. Trying Method 6 (Apify Text Scraper)...")
             if _apify_token_exists:
                ...
                 if not content: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) failed.")
             else: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) APIFY_API_TOKEN unavailable. Skipping.")

+        # Final check
         if not content and not user_feedback_message:
+            user_feedback_message = "Sorry, I couldn't fetch readable content from that website using multiple methods (blocked/dynamic content/empty?). Even the advanced crawler failed."
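The numbered methods above (0 through 6) all follow the same pattern: skip when the required key or library is missing, try the scraper, log on failure, and fall through to the next one. The chain could also be driven by a list of entries. A minimal sketch, in which `run_scrape_chain` and the tuple layout are illustrative and the scraper coroutines are assumed to have the same signatures as the ones called above:

    import logging
    from typing import Awaitable, Callable, Optional

    logger = logging.getLogger(__name__)

    async def run_scrape_chain(
        url: str,
        scrapers: list[tuple[str, bool, Callable[[str], Awaitable[Optional[str]]]]],
    ) -> Optional[str]:
        """Try each enabled scraper in order and return the first non-empty result."""
        for name, enabled, scraper in scrapers:
            if not enabled:
                logger.warning(f"{name} skipped - key/library unavailable.")
                continue
            content = await scraper(url)
            if content:
                return content
            logger.warning(f"{name} failed.")
        return None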

     # --- 3. Summarization ---
     if content:
+        logger.info(f"[Task {task_id}] Content fetched (len:{len(content)}). Generating summary.")
         try:
             status_update_msg_id = message_to_delete_later_id or status_message_id
             if status_update_msg_id:
                ...
         except Exception as edit_e: logger.warning(f"[Task {task_id}] Failed to edit status message before summary: {edit_e}")

         await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
+        final_summary = await generate_summary(content, summary_type)

         if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
+            user_feedback_message = final_summary
             logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
         else:
             max_length = 4096
             summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
             await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} )
             for part in summary_parts[1:]:
+                await asyncio.sleep(0.5)
                 await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} )
             success = True
             logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
+            user_feedback_message = None
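The hard slice at 4096 characters respects Telegram's message-length cap but can cut a sentence mid-word. A splitter that prefers newline boundaries is one alternative; a minimal sketch, with `split_message` as an illustrative helper rather than anything defined in main.py:

    def split_message(text: str, max_length: int = 4096) -> list[str]:
        """Split text into chunks <= max_length, breaking on the last newline when possible."""
        parts: list[str] = []
        while len(text) > max_length:
            cut = text.rfind("\n", 0, max_length)
            if cut <= 0:
                cut = max_length  # no usable newline, fall back to a hard cut
            parts.append(text[:cut])
            text = text[cut:].lstrip("\n")
        if text:
            parts.append(text)
        return parts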

     # --- 4. Handle Final Failure Feedback ---
+    if user_feedback_message:
         logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}")
         await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} )

    ...
     logger.info(f"[Task {task_id}] Task completed. Success: {success}")

+# --- Telegram Handlers ---
 async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
     user = update.effective_user
     if not user or not update.message: return
     mention = user.mention_html()
    ...
         "2. I'll ask how you want it summarised (paragraph or points).\n"
         "3. Click the button for your choice.\n"
         "4. Wait while I fetch the content and generate the summary!\n\n"
+        "⚙️ I use multiple methods to get content, starting with an advanced crawler and falling back to simpler methods if needed.\n\n"
         "**Commands:**\n"
         "`/start` - Display the welcome message\n"
         "`/help` - Show this help message" )
...

     url = update.message.text.strip(); user = update.effective_user
     if not user: return
     url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE)
+    if not url_pattern.search(url): logger.debug(f"Ignoring non-URL message from {user.id}: {url[:100]}"); return

     match = url_pattern.search(url)
     if match:
        ...
         try:
             await update.message.reply_text(
                 f"Okay, I see this link:\n`{extracted_url}`\n\nHow would you like it summarised?",
+                reply_markup=reply_markup, disable_web_page_preview=True, parse_mode=ParseMode.MARKDOWN )
         except BadRequest as e:
             if "chat not found" in str(e).lower() or "bot was blocked by the user" in str(e).lower(): logger.warning(f"Could not reply to user {user.id} (chat not found or blocked).")
             else: logger.error(f"BadRequest replying to URL message from {user.id}: {e}")
         except Exception as e: logger.error(f"Error replying to URL message from {user.id}: {e}", exc_info=True)
+    else: logger.debug(f"Ignoring message from {user.id} - no URL found by regex: {url[:100]}")

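The regex above is deliberately loose: it accepts any http(s) scheme followed by a plausible host and then consumes up to the next whitespace. A small illustrative check of how it behaves (the sample strings are invented):

    import re

    url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE)

    for text in ("please summarise https://example.com/article?id=1 thanks", "no link here"):
        match = url_pattern.search(text)
        print(match.group(0) if match else "no URL found")
    # -> https://example.com/article?id=1
    # -> no URL found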
 async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    ...

     if not url:
         logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Old button?")
+        try: await query.edit_message_text(text="Sorry, I couldn't find the original URL for this request (it might be too old). Please send the link again.")
         except BadRequest as e:
             if "message is not modified" in str(e).lower() or "message to edit not found" in str(e).lower(): pass
             else: logger.warning(f"Failed to edit 'URL not found' message {message_id_to_edit} for user {user.id}: {e}")
+        except Exception as e: logger.warning(f"Error editing 'URL not found' message {message_id_to_edit} for user {user.id}: {e}")
         return

     context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}")
    ...
     global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
     if not TELEGRAM_TOKEN:
         logger.critical("TELEGRAM_TOKEN missing in callback!")
+        try: await query.edit_message_text(text="❌ Bot config error (Token Missing).")
+        except Exception: pass
         return
     if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
         logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid in callback!")
+        try: await query.edit_message_text(text="❌ AI configuration error: No summarization models available.")
+        except Exception: pass
         return
     elif not _gemini_primary_enabled: logger.warning("Primary AI (Gemini) unavailable, relying on fallback.")
     elif not _openrouter_fallback_enabled: logger.warning("Fallback AI (OpenRouter) unavailable, relying on primary.")

...
     logger.error("Exception while handling an update:", exc_info=context.error)

+# --- Application Setup & Web Framework ---

 async def setup_bot_config() -> Application:
     logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
...

                 await ptb_app.bot.set_webhook(**set_webhook_args)
                 webhook_info = await ptb_app.bot.get_webhook_info()
                 if webhook_info.url == full_webhook_url: logger.info(f"Webhook successfully set: URL='{webhook_info.url}', Secret={bool(WEBHOOK_SECRET)}")
+                else: logger.error(f"Webhook URL mismatch! Expected '{full_webhook_url}', Got '{webhook_info.url}'.")
                 await ptb_app.start()
                 logger.info("PTB Application started in webhook mode.")
             except Exception as e: logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True); raise RuntimeError(f"Failed to set webhook: {e}") from e
         else: logger.critical("Could not construct webhook URL."); raise RuntimeError("Webhook URL could not be determined.")
     else: logger.critical("SPACE_HOST env var not found."); raise RuntimeError("SPACE_HOST env var missing.")

+    logger.info("ASGI Lifespan: Startup complete."); yield

 except Exception as startup_err:
     logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
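When a secret is configured with set_webhook, Telegram echoes it back on every webhook call in the X-Telegram-Bot-Api-Secret-Token header, so the receiving endpoint can reject spoofed requests. A minimal sketch of such a check for a Starlette endpoint; the helper name and the WEBHOOK_SECRET lookup here are illustrative and not necessarily how main.py wires it:

    import os
    from typing import Optional
    from starlette.requests import Request
    from starlette.responses import PlainTextResponse

    WEBHOOK_SECRET = os.environ.get("WEBHOOK_SECRET", "")

    async def verify_telegram_secret(request: Request) -> Optional[PlainTextResponse]:
        """Return a 403 response if the secret header does not match, else None."""
        if WEBHOOK_SECRET and request.headers.get("X-Telegram-Bot-Api-Secret-Token") != WEBHOOK_SECRET:
            return PlainTextResponse("Forbidden: invalid secret token", status_code=403)
        return None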
...

 async def health_check(request: Request) -> PlainTextResponse:
     """Simple health check endpoint."""
     global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled
     global _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY, _crawl4ai_primary_scrape_enabled
     bot_status = "Not Initialized"; bot_username = "N/A"
    ...
     except Exception as e: bot_status = f"Error checking: {type(e).__name__}"; logger.warning(f"Health check: Error getting bot info: {e}")
     else: bot_status = "Not Initialized"; bot_username = "N/A"

     return PlainTextResponse(
         f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
         f"--- Summarization ---\n"
        ...
         f"Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
         f"Fallback 2 (Apify Actor): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n"
         f"--- Website Scraping ---\n"
+        f"Primary (Crawl4AI): {'Enabled' if _crawl4ai_primary_scrape_enabled else 'DISABLED - Library/Driver Missing?'}\n"
         f"Fallback 1 (Direct+BS4): Enabled\n"
         f"Fallback 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
         f"Fallback 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n"
...

     except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
     except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # OK to prevent TG retries

+# --- Starlette App Definition ---
 app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )
 logger.info("Starlette ASGI application created with health check and webhook routes.")

+# --- Development Server ---
 if __name__ == '__main__':
     import uvicorn
     logger.warning("Running in development mode using Uvicorn directly - NOT for production!")
     try:
         from playwright.async_api import async_playwright
         logger.info("Playwright library found.")
+        # Consider adding: asyncio.run(install_playwright_drivers()) to ensure they exist
+        # async def install_playwright_drivers():
+        #     import sys
+        #     from playwright.__main__ import main
+        #     logger.info("Attempting to install Playwright browser drivers...")
+        #     try:
+        #         # Execute the playwright install command programmatically
+        #         sys.argv = ["playwright", "install", "--with-deps"]
+        #         main()
+        #         logger.info("Playwright install command finished.")
+        #     except Exception as install_err:
+        #         logger.error(f"Playwright install command failed: {install_err}")
+
     except ImportError:
         logger.critical("Playwright library not found. Crawl4AI will likely fail.")
         logger.critical("RUN 'pip install playwright && playwright install --with-deps' in your terminal.")