# main.py (Corrected PermissionError and Integrated Crawl4AI as Primary) | |
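#
# Content pipeline implemented in this file:
#   * Web pages:  Crawl4AI (primary) -> direct fetch + BeautifulSoup -> urltotext.com
#                 -> Scraper's Proxy (RapidAPI) -> AI Web Scraper (RapidAPI)
#                 -> Apify Website Content Crawler -> Apify Text Scraper
#   * YouTube:    youtube-transcript-api (primary) -> Supadata -> Apify actor
#   * Summaries:  Google Gemini (primary) -> OpenRouter (fallback)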
import os | |
import re | |
import logging | |
import asyncio | |
import json | |
import html | |
import contextlib | |
import traceback | |
import urllib.parse # Added for URL encoding | |
from typing import Optional, Dict, Any, Tuple, Union # Added Union | |
# --- Frameworks --- | |
from starlette.applications import Starlette | |
from starlette.routing import Route | |
from starlette.responses import PlainTextResponse, JSONResponse, Response | |
from starlette.requests import Request | |
# --- Telegram Bot --- | |
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot | |
from telegram.ext import ( | |
Application, | |
CommandHandler, | |
MessageHandler, | |
filters, | |
ContextTypes, | |
CallbackQueryHandler, | |
) | |
from telegram.constants import ParseMode | |
from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest, TelegramError | |
from telegram.request import HTTPXRequest, BaseRequest | |
# --- Other Libraries --- | |
import httpx | |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound | |
from bs4 import BeautifulSoup | |
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, before_sleep_log | |
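# Prefer lxml for faster, more forgiving HTML parsing when it is installed; otherwise fall back to the stdlib parser.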
try: | |
import lxml | |
DEFAULT_PARSER = 'lxml' | |
except ImportError: | |
DEFAULT_PARSER = 'html.parser' | |
# --- Google Gemini --- | |
try: | |
import google.generativeai as genai | |
from google.generativeai.types import HarmCategory, HarmBlockThreshold | |
_gemini_available = True | |
except ImportError: | |
genai = None | |
HarmCategory = None | |
HarmBlockThreshold = None | |
_gemini_available = False | |
# logger will be defined later, log warning after logger setup | |
# --- Crawl4AI (NEW Primary Scraper) --- | |
try: | |
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig, CacheMode, CrawlResult | |
from crawl4ai.models import MarkdownGenerationResult # Specific import for type hint | |
_crawl4ai_available = True | |
except ImportError: | |
AsyncWebCrawler = None | |
CrawlerRunConfig = None | |
BrowserConfig = None | |
CacheMode = None | |
CrawlResult = None | |
MarkdownGenerationResult = None # Corrected typo | |
_crawl4ai_available = False | |
# logger will be defined later, log warning after logger setup | |
# --- Logging Setup --- | |
logging.basicConfig( format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO ) | |
logging.getLogger("httpx").setLevel(logging.WARNING) | |
logging.getLogger("telegram.ext").setLevel(logging.INFO) | |
logging.getLogger('telegram.bot').setLevel(logging.INFO) | |
logging.getLogger("urllib3").setLevel(logging.INFO) | |
logging.getLogger('gunicorn.error').setLevel(logging.INFO) | |
logging.getLogger('uvicorn').setLevel(logging.INFO) | |
logging.getLogger('starlette').setLevel(logging.INFO) | |
if _gemini_available: logging.getLogger("google.ai.generativelanguage").setLevel(logging.WARNING) | |
# Keep C4AI logs less verbose unless debugging | |
if _crawl4ai_available: logging.getLogger("crawl4ai").setLevel(logging.WARNING) | |
logger = logging.getLogger(__name__) | |
logger.info(f"Logging configured. Using BS4 parser: {DEFAULT_PARSER}") | |
if not _gemini_available: logger.warning("google-generativeai library not found. Gemini functionality disabled.") | |
if not _crawl4ai_available: logger.warning("crawl4ai library not found. Primary Web Scraping (Crawl4AI) disabled.") | |
# --- Global variable for PTB app --- | |
ptb_app: Optional[Application] = None | |
# --- Define a writable base directory for Crawl4AI --- | |
# Use /app which is the WORKDIR in the Dockerfile and is generally writable | |
CRAWL4AI_BASE_DIR = "/app/.crawl4ai_cache" | |
if _crawl4ai_available: | |
try: | |
os.makedirs(CRAWL4AI_BASE_DIR, exist_ok=True) | |
logger.info(f"Ensured Crawl4AI base directory exists and is writable: {CRAWL4AI_BASE_DIR}") | |
except Exception as e: | |
# Log the error but proceed; Crawl4AI may still work without cache/DB features.
logger.error(f"Could not create Crawl4AI base directory {CRAWL4AI_BASE_DIR}: {e}. Crawl4AI caching/DB features might fail.") | |
# --- Environment Variable Loading & Configuration --- | |
logger.info("Attempting to load secrets and configuration...") | |
def get_secret(secret_name): | |
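    """Reads a secret from the environment and logs whether it was found (only the first few characters of the value are logged)."""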
value = os.environ.get(secret_name) | |
if value: status = "Found"; log_length = min(len(value), 8); value_start = value[:log_length]; logger.info(f"Secret '{secret_name}': {status} (Value starts with: {value_start}...)") | |
else: status = "Not Found"; logger.warning(f"Secret '{secret_name}': {status}") | |
return value | |
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN') | |
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY') # Summarizer Fallback | |
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY') # Scrape Fallback 2 (WAS 1) | |
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY') # YT Fallback 1 | |
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN') # YT Fallback 2 + Scrape Fallbacks 5 & 6 (WAS 4 & 5) | |
RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY') # Scrape Fallbacks 3 & 4 (WAS 2 & 3) | |
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET') | |
GEMINI_API_KEY = get_secret('GEMINI_API_KEY') # Primary Summarizer | |
# Models (User can still configure via env vars) | |
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-coder-33b-instruct") # Fallback Model | |
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts") # Default YT Actor | |
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-1.5-flash-latest") # Primary Model | |
# Specific Actor IDs for Website Scraping Fallbacks | |
APIFY_CRAWLER_ACTOR_ID = "apify/website-content-crawler" # Fallback 5 (WAS 4) | |
APIFY_TEXT_SCRAPER_ACTOR_ID = "karamelo/text-scraper-free" # Fallback 6 (WAS 5) | |
if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.") | |
if not GEMINI_API_KEY: logger.error("❌ ERROR: GEMINI_API_KEY not found. Primary summarization (Gemini) will fail.") | |
if not OPENROUTER_API_KEY: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback summarization will fail.") | |
if not RAPIDAPI_KEY: logger.warning("⚠️ WARNING: RAPIDAPI_KEY not found. RapidAPI scraping fallbacks (3 & 4) will be unavailable.") # Updated numbers | |
if not APIFY_API_TOKEN: logger.warning("⚠️ WARNING: APIFY_API_TOKEN not found. YT transcript fallback (2) and Website scraping fallbacks (5 & 6) will be unavailable.") # Updated numbers | |
_gemini_primary_enabled = _gemini_available and bool(GEMINI_API_KEY) | |
if not _gemini_available: logger.warning("⚠️ WARNING: google-generativeai library missing. Gemini disabled.") | |
elif not GEMINI_API_KEY: logger.warning("⚠️ WARNING: GEMINI_API_KEY not found or empty. Gemini disabled.") | |
_openrouter_fallback_enabled = bool(OPENROUTER_API_KEY) | |
if not _openrouter_fallback_enabled: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback disabled.") | |
_crawl4ai_primary_scrape_enabled = _crawl4ai_available # Check if library loaded | |
if not _crawl4ai_available: logger.error("❌ ERROR: crawl4ai library missing. Primary web scraping disabled. Will attempt fallbacks immediately.") | |
if not URLTOTEXT_API_KEY: logger.warning("Optional secret 'URLTOTEXT_API_KEY' not found. Web scraping fallback 2 unavailable.") # Updated number | |
if not SUPADATA_API_KEY: logger.warning("Optional secret 'SUPADATA_API_KEY' not found. YT transcript fallback 1 unavailable.") | |
# APIFY_API_TOKEN warning handled above | |
# RAPIDAPI_KEY warning handled above | |
if not WEBHOOK_SECRET: logger.info("Optional secret 'WEBHOOK_SECRET' not found. Webhook security disabled.") | |
logger.info("Secret loading and configuration check finished.") | |
logger.info(f"Primary Web Scraper (Crawl4AI): {'ENABLED' if _crawl4ai_primary_scrape_enabled else 'DISABLED - Check Logs for Details'}") | |
logger.info(f"Using Gemini Model (Primary Summarizer): {GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'}") | |
logger.info(f"Using OpenRouter Model (Fallback Summarizer): {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'}") | |
logger.info(f"Using Apify Actor (YT Default): {APIFY_ACTOR_ID}") | |
logger.info(f"Using Apify Actor (Web Scrape Fallback 5): {APIFY_CRAWLER_ACTOR_ID}") | |
logger.info(f"Using Apify Actor (Web Scrape Fallback 6): {APIFY_TEXT_SCRAPER_ACTOR_ID}") | |
_apify_token_exists = bool(APIFY_API_TOKEN) | |
_urltotext_key_exists = bool(URLTOTEXT_API_KEY) | |
_rapidapi_key_exists = bool(RAPIDAPI_KEY) | |
if _gemini_primary_enabled: | |
try: genai.configure(api_key=GEMINI_API_KEY); logger.info("Google GenAI client configured successfully.") | |
except Exception as e: logger.error(f"Failed to configure Google GenAI client: {e}"); _gemini_primary_enabled = False | |
# --- Retry Decorator (Unchanged) --- | |
async def retry_bot_operation(func, *args, **kwargs): | |
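    """Awaits a bot API call, returning None for a known set of benign BadRequest errors; all other errors are logged and re-raised."""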
try: return await func(*args, **kwargs) | |
except BadRequest as e: | |
ignore_errors = [ "message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user", ] | |
if any(err in str(e).lower() for err in ignore_errors): logger.warning(f"Ignoring non-critical BadRequest: {e}"); return None | |
logger.error(f"Potentially critical BadRequest: {e}"); raise | |
except TelegramError as e: logger.warning(f"TelegramError (will retry if applicable): {e}"); raise | |
except Exception as e: logger.error(f"Unexpected error during bot operation: {e}", exc_info=True); raise | |
# --- Helper Functions (Unchanged) --- | |
def is_youtube_url(url): | |
youtube_regex = re.compile( r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/' r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE) | |
match = youtube_regex.search(url); logger.debug(f"is_youtube_url '{url}': {bool(match)}"); return bool(match) | |
def extract_youtube_id(url): | |
youtube_regex = re.compile( r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/' r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE) | |
match = youtube_regex.search(url) | |
if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id | |
else: logger.warning(f"Could not extract YT ID from {url}"); return None | |
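# Illustrative example (hypothetical URL): extract_youtube_id("https://youtu.be/dQw4w9WgXcQ") -> "dQw4w9WgXcQ"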
# --- Content Fetching Functions --- | |
# --- YouTube Transcript Fetching (Unchanged) --- | |
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]: | |
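    """Fallback YT 1: Fetches a YouTube transcript via the Supadata API."""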
if not video_id: logger.error("[Supadata] No video_id provided"); return None | |
if not api_key: logger.error("[Supadata] API key missing."); return None | |
logger.info(f"[YT Fallback 1] Attempting fetch for video ID: {video_id} via Supadata") | |
api_endpoint = "https://api.supadata.ai/v1/youtube/transcript" | |
params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key} | |
try: | |
async with httpx.AsyncClient(timeout=30.0) as client: | |
response = await client.get(api_endpoint, headers=headers, params=params) | |
logger.debug(f"[Supadata] Status code {response.status_code} for {video_id}") | |
if response.status_code == 200: | |
try: | |
data = response.json() if response.text else None # Check if text exists before json decode | |
content = None | |
if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data") | |
if not content and response.text: content = response.text # Fallback to raw text if json parse fails or content key missing | |
if content and isinstance(content, str): logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}"); return content.strip() | |
else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None | |
except json.JSONDecodeError: logger.warning(f"[Supadata] Received 200 but failed JSON decode for {video_id}. Using raw text if available. Response: {response.text[:200]}"); return response.text.strip() if response.text else None | |
except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None | |
elif response.status_code in [401, 403]: logger.error(f"[Supadata] Auth error ({response.status_code}). Check API key."); return None | |
elif response.status_code == 404: logger.warning(f"[Supadata] Not found (404) for {video_id}."); return None | |
else: logger.error(f"[Supadata] Unexpected status {response.status_code} for {video_id}. Resp: {response.text[:200]}"); return None | |
except httpx.TimeoutException: logger.error(f"[Supadata] Timeout connecting for {video_id}"); return None | |
except httpx.RequestError as e: | |
if "CERTIFICATE_VERIFY_FAILED" in str(e): logger.error(f"[Supadata] SSL Cert Verify Failed for {video_id}: {e}") | |
else: logger.error(f"[Supadata] Request error for {video_id}: {e}") | |
return None | |
except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None | |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]: | |
"""Fallback YT 2: Fetches YouTube transcript using default Apify Actor.""" | |
global APIFY_ACTOR_ID # Uses the default YT actor ID | |
if not video_url: logger.error("[Apify YT] No video_url provided"); return None | |
if not api_token: logger.error("[Apify YT] API token missing."); return None | |
logger.info(f"[YT Fallback 2] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})") | |
sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items" | |
params = {"token": api_token} | |
payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5, "channelHandleBoolean": False, "channelNameBoolean": False, "datePublishedBoolean": False, "relativeDateTextBoolean": False, } | |
headers = {"Content-Type": "application/json"} | |
try: | |
async with httpx.AsyncClient(timeout=120.0) as client: | |
logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}") | |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload) | |
logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}") | |
if response.status_code == 200: | |
try: | |
results = response.json() | |
if isinstance(results, list) and len(results) > 0: | |
item = results[0]; content = None | |
if "captions" in item and isinstance(item["captions"], str): content = item["captions"] | |
elif "text" in item and isinstance(item["text"], str): content = item["text"] | |
elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"] | |
elif "captions" in item and isinstance(item["captions"], list): | |
if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text")) | |
elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"]) | |
if content and isinstance(content, str): logger.info(f"[Apify YT] Success via REST for {video_url}. Length: {len(content)}"); return content.strip() | |
else: logger.warning(f"[Apify YT] Dataset item parsed but transcript content empty/invalid format for {video_url}. Item keys: {list(item.keys())}"); return None | |
else: logger.warning(f"[Apify YT] Actor success but dataset was empty for {video_url}. Response: {results}"); return None | |
except json.JSONDecodeError: logger.error(f"[Apify YT] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}"); return None | |
except Exception as e: logger.error(f"[Apify YT] Error processing success response for {video_url}: {e}", exc_info=True); return None | |
elif response.status_code == 400: logger.error(f"[Apify YT] Bad Request (400) for {video_url}. Check payload. Resp:{response.text[:200]}"); return None | |
elif response.status_code == 401: logger.error("[Apify YT] Auth error (401). Check token."); return None | |
elif response.status_code == 404: logger.error(f"[Apify YT] Endpoint/Actor Not Found (404). Actor: {APIFY_ACTOR_ID} Resp:{response.text[:200]}"); return None | |
else: logger.error(f"[Apify YT] Unexpected status {response.status_code} for {video_url}. Resp:{response.text[:200]}"); return None | |
except httpx.TimeoutException as e: logger.error(f"[Apify YT] Timeout during API interaction for {video_url}: {e}"); return None | |
except httpx.HTTPStatusError as e: logger.error(f"[Apify YT] HTTP Status Error during API interaction for {video_url}: {e}"); return None | |
except httpx.RequestError as e: logger.error(f"[Apify YT] Request error during API interaction for {video_url}: {e}"); return None | |
except Exception as e: logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True); return None | |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]: | |
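    """Fetches a YouTube transcript, trying youtube-transcript-api first, then Supadata, then the default Apify actor."""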
global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists | |
if not video_id: logger.error("get_youtube_transcript: No video_id"); return None | |
logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})") | |
transcript_text = None | |
logger.info("[Primary YT] Attempting youtube-transcript-api...") | |
try: | |
transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] ) | |
if transcript_list: transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item]) | |
if transcript_text: logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})"); return transcript_text | |
else: logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}"); transcript_text = None | |
except NoTranscriptFound: logger.warning(f"[Primary YT] No transcript found via lib for {video_id}.") | |
except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.") | |
except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None | |
if transcript_text is None: | |
logger.info("[Fallback YT 1] Trying Supadata API...") | |
if SUPADATA_API_KEY: | |
transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY) | |
if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text | |
else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.") | |
else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.") | |
if transcript_text is None: | |
logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...") | |
if _apify_token_exists: | |
transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN) | |
if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text | |
else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.") | |
else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.") | |
if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None | |
return transcript_text | |
# --- Website Content Fetching (MODIFIED SECTION) --- | |
# --- Method 0: Primary Web Scrape (Crawl4AI) --- | |
async def get_website_content_via_crawl4ai(url: str) -> Optional[str]: | |
"""Primary Web Method: Fetches and extracts content using Crawl4AI.""" | |
global _crawl4ai_primary_scrape_enabled, CRAWL4AI_BASE_DIR # Use the defined base dir | |
if not _crawl4ai_primary_scrape_enabled: | |
logger.warning("[Web Scrape Primary] Crawl4AI called but library/driver is unavailable.") | |
return None | |
if not url: logger.error("[Web Scrape Primary] Crawl4AI: No URL provided"); return None | |
logger.info(f"[Web Scrape Primary] Attempting fetch and extraction via Crawl4AI for: {url}") | |
# Configure the crawl run - enable cache now | |
run_config = CrawlerRunConfig( | |
cache_mode=CacheMode.ENABLED, # Use cache now that base_dir is set | |
page_timeout=60000, # 60 sec timeout | |
verbose=False, # Keep logs cleaner | |
scan_full_page=True, # Try to load dynamic content by scrolling | |
remove_overlay_elements=True, # Try to remove cookie banners/popups | |
# Consider adding markdown generation strategy if needed later | |
# from crawl4ai.content_filter_strategy import PruningContentFilter | |
# from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator | |
# md_generator = DefaultMarkdownGenerator(content_filter=PruningContentFilter()) | |
# markdown_generator=md_generator, | |
) | |
# BrowserConfig defaults are usually okay (headless chromium) | |
# browser_config = BrowserConfig(headless=True, verbose=False) | |
extracted_text: Optional[str] = None | |
try: | |
# Use context manager and provide base_directory to fix PermissionError | |
# Pass browser_config if needed: AsyncWebCrawler(config=browser_config, base_directory=CRAWL4AI_BASE_DIR) | |
async with AsyncWebCrawler(base_directory=CRAWL4AI_BASE_DIR) as crawler: | |
logger.debug(f"[Web Scrape Primary] Calling Crawl4AI crawler.arun for {url}") | |
result: CrawlResult = await crawler.arun(url=url, config=run_config) | |
logger.debug(f"[Web Scrape Primary] Crawl4AI arun completed. Success: {result.success}, Status: {result.status_code}") | |
if result.success: | |
# Check for markdown generation result first (preferred) | |
if result.markdown and isinstance(result.markdown, MarkdownGenerationResult): | |
# Prioritize 'fit_markdown' if available and substantial | |
if result.markdown.fit_markdown and isinstance(result.markdown.fit_markdown, str) and len(result.markdown.fit_markdown.strip()) > 50: | |
extracted_text = result.markdown.fit_markdown.strip() | |
logger.debug(f"[Web Scrape Primary] Using 'fit_markdown' from MarkdownGenerationResult for {url}") | |
# Fallback to 'raw_markdown' if 'fit_markdown' is missing/short | |
elif result.markdown.raw_markdown and isinstance(result.markdown.raw_markdown, str): | |
extracted_text = result.markdown.raw_markdown.strip() | |
logger.debug(f"[Web Scrape Primary] Using 'raw_markdown' (fit_markdown unavailable/short) for {url}") | |
else: | |
logger.warning(f"[Web Scrape Primary] Markdown object present but no usable text content (fit/raw) for {url}. Trying cleaned_html.") | |
# Fall through to cleaned_html parsing if markdown is unusable | |
# Handle if result.markdown is just a string (older version compatibility?) | |
elif result.markdown and isinstance(result.markdown, str): | |
extracted_text = result.markdown.strip() | |
logger.debug(f"[Web Scrape Primary] Using direct result.markdown string for {url}") | |
# If no markdown or unusable markdown, try parsing cleaned_html | |
if not extracted_text and result.cleaned_html: | |
logger.warning(f"[Web Scrape Primary] No usable markdown found, parsing cleaned_html with BS4 for {url}") | |
try: | |
# Use a simple BS4 parse as a fallback within Crawl4AI's result | |
soup = BeautifulSoup(result.cleaned_html, DEFAULT_PARSER) | |
extracted_text = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip()) | |
except Exception as bs_err: | |
logger.error(f"[Web Scrape Primary] Error parsing Crawl4AI's cleaned_html with BS4 for {url}: {bs_err}") | |
extracted_text = None # Ensure it's None if parsing fails | |
# Final check on extracted text length | |
if extracted_text and len(extracted_text) > 50: # Check for meaningful content length | |
logger.info(f"[Web Scrape Primary] Success via Crawl4AI for {url}. Length: {len(extracted_text)}") | |
return extracted_text | |
else: | |
content_len = len(extracted_text) if extracted_text else 0 | |
logger.warning(f"[Web Scrape Primary] Crawl4AI success but extracted text too short or empty for {url}. Length: {content_len}. Will try fallbacks.") | |
return None # Return None to trigger fallbacks | |
else: | |
error_msg = result.error_message or f"Crawl failed (status code: {result.status_code})" | |
logger.error(f"[Web Scrape Primary] Crawl4AI failed for {url}. Error: {error_msg}. Will try fallbacks.") | |
return None # Return None to trigger fallbacks | |
except asyncio.TimeoutError: | |
logger.error(f"[Web Scrape Primary] Timeout error during Crawl4AI crawl for {url}. Will try fallbacks.") | |
return None | |
except ImportError as ie: | |
if "playwright" in str(ie).lower(): | |
logger.critical(f"[Web Scrape Primary] Playwright library missing or drivers not installed! Run 'pip install playwright && playwright install --with-deps'. Error: {ie}") | |
_crawl4ai_primary_scrape_enabled = False # Disable future attempts | |
else: | |
logger.error(f"[Web Scrape Primary] Unexpected ImportError during Crawl4AI execution for {url}: {ie}", exc_info=True) | |
return None # Return None to trigger fallbacks | |
except Exception as e: | |
# Catch potential Playwright errors about missing executables explicitly | |
if "playwright" in str(e).lower() and ("install" in str(e).lower() or "executable" in str(e).lower() or "path" in str(e).lower()): | |
logger.critical("[Web Scrape Primary] Playwright drivers likely missing! Run 'playwright install --with-deps' in your environment. Disabling Crawl4AI.") | |
_crawl4ai_primary_scrape_enabled = False # Disable future attempts | |
else: | |
logger.error(f"[Web Scrape Primary] Unexpected error during Crawl4AI execution for {url}: {e}", exc_info=True) | |
return None # Return None to trigger fallbacks | |
# --- Fallback 1: Direct Fetch + BS4 (Previously Primary) --- | |
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]: | |
"""Directly fetches URL content using httpx. (Fallback Web Method 1 - Fetching part)""" | |
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' } | |
try: | |
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client: | |
logger.debug(f"[Web Scrape Fallback 1] Sending GET request to {url}") | |
response = await client.get(url) | |
logger.debug(f"[Web Scrape Fallback 1] Received response {response.status_code} from {url}") | |
response.raise_for_status() | |
content_type = response.headers.get('content-type', '').lower() | |
if 'html' not in content_type and 'xml' not in content_type: | |
logger.warning(f"[Web Scrape Fallback 1] Non-HTML/XML content type received from {url}: {content_type}") | |
if 'text/plain' in content_type: logger.info(f"[Web Scrape Fallback 1] Content type is text/plain for {url}, reading."); return response.text | |
return None | |
try: return response.text | |
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Error decoding response text for {url}: {e}"); return None | |
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Fallback 1] HTTP error {e.response.status_code} fetching {url}: {e}") | |
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 1] Timeout error fetching {url}") | |
except httpx.TooManyRedirects: logger.error(f"[Web Scrape Fallback 1] Too many redirects fetching {url}") | |
except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 1] Request error fetching {url}: {e}") | |
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error fetching {url}: {e}", exc_info=True) | |
return None | |
async def get_website_content_direct_bs4(url: str) -> Optional[str]: | |
"""Fallback 1: Fetches HTML directly and parses with BeautifulSoup.""" | |
if not url: logger.error("[Web Scrape Fallback 1] No URL provided"); return None | |
logger.info(f"[Web Scrape Fallback 1] Attempting direct fetch and parse for: {url}") | |
html_content = await fetch_url_content_for_scrape(url) | |
if not html_content: logger.warning(f"[Web Scrape Fallback 1] Direct fetch failed for {url}."); return None | |
try: | |
def parse_html(content: str) -> Optional[str]: | |
try: | |
soup = BeautifulSoup(content, DEFAULT_PARSER) | |
# More aggressive removal of potentially noisy tags | |
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area", "details", "dialog"]): | |
element.extract() | |
# Try common main content containers | |
main_content = soup.find('main') or soup.find('article') or soup.find(role='main') or soup.find(id=re.compile(r'content|main|body|post', re.I)) or soup.find(class_=re.compile(r'content|main|body|article|post|entry', re.I)) | |
target_element = main_content if main_content else soup.body | |
if not target_element: | |
logger.warning(f"[Web Scrape Fallback 1 Parse] Could not find body or main content candidates for {url}") | |
# Fallback: Get text from the whole soup if no specific container found | |
text_from_root = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip()) | |
if text_from_root and len(text_from_root) > 50: | |
logger.warning(f"[Web Scrape Fallback 1 Parse] Using text from root as fallback for {url}. Length: {len(text_from_root)}") | |
return text_from_root | |
return None # Really couldn't find anything useful | |
# Extract text from the chosen element (main_content or body) | |
lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()] | |
text = " ".join(lines) | |
# Check if the extracted text is meaningful | |
if not text or len(text) < 50: # Increased threshold slightly | |
logger.warning(f"[Web Scrape Fallback 1 Parse] Extracted text from target element too short or empty for {url}. Length: {len(text)}") | |
# As a final attempt, try getting text from the entire soup again | |
text_from_root_final = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip()) | |
if text_from_root_final and len(text_from_root_final) > 50: | |
logger.warning(f"[Web Scrape Fallback 1 Parse] Reverting to text from root as final attempt for {url}. Length: {len(text_from_root_final)}") | |
return text_from_root_final | |
return None # Give up if even root text is too short | |
return text # Return the text from the target element | |
except Exception as parse_e: | |
logger.error(f"[Web Scrape Fallback 1 Parse] BS4 parsing error for {url}: {parse_e}", exc_info=False) | |
return None | |
# Run parsing in a separate thread to avoid blocking asyncio loop | |
text_content = await asyncio.to_thread(parse_html, html_content) | |
if text_content: | |
logger.info(f"[Web Scrape Fallback 1] Success via direct fetch & parse for {url} (len: {len(text_content)})") | |
return text_content | |
else: | |
logger.warning(f"[Web Scrape Fallback 1] Parsing failed or yielded no meaningful content for {url}.") | |
return None | |
except Exception as e: | |
logger.error(f"[Web Scrape Fallback 1] Unexpected error during parsing phase for {url}: {e}", exc_info=True) | |
return None | |
# --- Fallback 2: urltotext.com API --- | |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]: | |
"""Fallback 2: Fetches website content using urltotext.com API.""" | |
if not url: logger.error("[Web Scrape Fallback 2] No URL"); return None | |
if not api_key: logger.error("[Web Scrape Fallback 2] urltotext.com API key missing."); return None | |
logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using urltotext.com API") | |
api_endpoint = "https://urltotext.com/api/v1/urltotext/" | |
payload = { "url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False } | |
headers = { "Authorization": f"Token {api_key}", "Content-Type": "application/json" } | |
try: | |
async with httpx.AsyncClient(timeout=45.0) as client: | |
logger.debug(f"[Web Scrape Fallback 2] Sending request to urltotext.com API for {url}") | |
response = await client.post(api_endpoint, headers=headers, json=payload) | |
logger.debug(f"[Web Scrape Fallback 2] Received status {response.status_code} from urltotext.com API for {url}") | |
if response.status_code == 200: | |
try: | |
data = response.json() | |
content = data.get("data", {}).get("content"); credits = data.get("credits_used", "N/A"); warning = data.get("data", {}).get("warning") | |
if warning: logger.warning(f"[Web Scrape Fallback 2] urltotext.com API Warning for {url}: {warning}") | |
if content and isinstance(content, str) and len(content.strip()) > 30: # Check length after stripping | |
logger.info(f"[Web Scrape Fallback 2] Success via urltotext.com API for {url}. Len: {len(content.strip())}. Credits: {credits}") | |
return content.strip() | |
else: | |
content_len = len(content.strip()) if content and isinstance(content, str) else 0 | |
logger.warning(f"[Web Scrape Fallback 2] urltotext.com API success but content empty/short for {url}. Len: {content_len}. Resp: {data}"); return None | |
except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 2] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None | |
except Exception as e: logger.error(f"[Web Scrape Fallback 2] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None | |
elif response.status_code in [400, 401, 402, 403, 422, 500]: logger.error(f"[Web Scrape Fallback 2] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None | |
else: logger.error(f"[Web Scrape Fallback 2] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None | |
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 2] Timeout connecting to urltotext.com API for {url}"); return None | |
except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 2] Request error connecting to urltotext.com API for {url}: {e}"); return None | |
except Exception as e: logger.error(f"[Web Scrape Fallback 2] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None | |
# --- Fallback 3: Scraper's Proxy Parser via RapidAPI --- | |
async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]: | |
"""Fallback 3: Fetches website content using Scraper's Proxy Parser via RapidAPI.""" | |
if not url: logger.error("[Web Scrape Fallback 3] No URL provided"); return None | |
if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None | |
logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using Scraper's Proxy Parser API") | |
api_host = "scrapers-proxy2.p.rapidapi.com" | |
encoded_url = urllib.parse.quote(url, safe='') | |
api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true" | |
headers = { "x-rapidapi-host": api_host, "x-rapidapi-key": api_key, "accept-encoding": "gzip" } | |
try: | |
async with httpx.AsyncClient(timeout=40.0) as client: | |
logger.debug(f"[Web Scrape Fallback 3] Sending GET request to {api_host} for {url}") | |
response = await client.get(api_endpoint, headers=headers) | |
logger.debug(f"[Web Scrape Fallback 3] Received status {response.status_code} from {api_host} for {url}") | |
if response.status_code == 200: | |
try: | |
data = response.json() | |
content = data.get("content"); title = data.get("title"); extracted_text = "" | |
if title and isinstance(title, str): extracted_text += title.strip() + ". " | |
if content and isinstance(content, str): extracted_text += content.strip() | |
extracted_text = extracted_text.strip() # Strip final result | |
if extracted_text and len(extracted_text) > 30: | |
logger.info(f"[Web Scrape Fallback 3] Success via Scraper's Proxy API for {url}. Len: {len(extracted_text)}") | |
return extracted_text | |
else: | |
logger.warning(f"[Web Scrape Fallback 3] Scraper's Proxy API success but content/title too short/empty for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}") | |
return None | |
except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 3] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None | |
except Exception as e: logger.error(f"[Web Scrape Fallback 3] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True); return None | |
elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 3] Auth error (401) with {api_host}. Check RapidAPI key."); return None | |
elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check subscription/limits."); return None | |
elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 3] Rate Limit (429) from {api_host}."); return None | |
elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 3] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None | |
else: logger.error(f"[Web Scrape Fallback 3] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None | |
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 3] Timeout connecting to {api_host} API for {url}"); return None | |
except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 3] Request error connecting to {api_host} API for {url}: {e}"); return None | |
except Exception as e: logger.error(f"[Web Scrape Fallback 3] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None | |
# --- Fallback 4: AI Web Scraper via RapidAPI --- | |
async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]: | |
"""Fallback 4: Fetches website content using AI Web Scraper via RapidAPI.""" | |
if not url: logger.error("[Web Scrape Fallback 4] No URL provided"); return None | |
if not api_key: logger.error("[Web Scrape Fallback 4] RapidAPI key missing."); return None | |
logger.info(f"[Web Scrape Fallback 4] Attempting fetch for: {url} using AI Web Scraper API") | |
api_host = "ai-web-scraper.p.rapidapi.com"; api_endpoint = f"https://{api_host}/extract_content/v1" | |
headers = { 'Content-Type': 'application/x-www-form-urlencoded', 'x-rapidapi-host': api_host, 'x-rapidapi-key': api_key } | |
payload = {'url': url} | |
try: | |
async with httpx.AsyncClient(timeout=45.0) as client: | |
logger.debug(f"[Web Scrape Fallback 4] Sending POST request to {api_host} for {url}") | |
response = await client.post(api_endpoint, headers=headers, data=payload) | |
logger.debug(f"[Web Scrape Fallback 4] Received status {response.status_code} from {api_host} for {url}") | |
if response.status_code == 200: | |
try: | |
data = response.json(); content = None | |
if isinstance(data, dict): content = data.get("content") or data.get("text") or data.get("extracted_text") or data.get("result") | |
elif isinstance(data, str): content = data | |
if content and isinstance(content, str): | |
content_stripped = content.strip() | |
if len(content_stripped) > 30: | |
logger.info(f"[Web Scrape Fallback 4] Success via AI Web Scraper API for {url}. Len: {len(content_stripped)}") | |
return content_stripped | |
else: | |
logger.warning(f"[Web Scrape Fallback 4] AI Web Scraper API success but content too short after stripping for {url}. Len: {len(content_stripped)}") | |
return None | |
else: | |
keys_info = f"Keys: {list(data.keys())}" if isinstance(data, dict) else f"Type: {type(data)}" | |
logger.warning(f"[Web Scrape Fallback 4] AI Web Scraper API success but content empty/invalid format for {url}. {keys_info}") | |
return None | |
except json.JSONDecodeError: | |
raw_text = response.text.strip() | |
if raw_text and len(raw_text) > 30: | |
logger.warning(f"[Web Scrape Fallback 4] Failed JSON decode for AI Web Scraper, but found raw text. Status:{response.status_code}. Using raw text. Len: {len(raw_text)}") | |
return raw_text | |
else: | |
logger.error(f"[Web Scrape Fallback 4] Failed JSON decode AI Web Scraper API for {url}. Status:{response.status_code}. Resp empty/short:{raw_text[:500]}") | |
return None | |
except Exception as e: logger.error(f"[Web Scrape Fallback 4] Error processing AI Web Scraper API success response for {url}: {e}", exc_info=True); return None | |
elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 4] Auth error (401) with {api_host}. Check RapidAPI key."); return None | |
elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 4] Forbidden (403) from {api_host}. Check subscription/limits."); return None | |
elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 4] Rate Limit (429) from {api_host}."); return None | |
elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 4] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None | |
else: logger.error(f"[Web Scrape Fallback 4] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None | |
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 4] Timeout connecting to {api_host} API for {url}"); return None | |
except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 4] Request error connecting to {api_host} API for {url}: {e}"); return None | |
except Exception as e: logger.error(f"[Web Scrape Fallback 4] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None | |
# --- Fallback 5 & 6: Apify Website Scraping --- | |
async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str, fallback_num: int) -> Optional[str]: | |
"""Generic function to run an Apify actor and get text content.""" | |
if not url: logger.error(f"[{actor_name} - FB{fallback_num}] No URL provided"); return None | |
if not api_token: logger.error(f"[{actor_name} - FB{fallback_num}] API token missing."); return None | |
logger.info(f"[{actor_name} - FB{fallback_num}] Attempting fetch for URL: {url} (Actor: {actor_id})") | |
sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"; params = {"token": api_token} | |
# Define different inputs based on actor | |
run_input: Dict[str, Any] | |
if actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID: | |
# Input for Text Scraper Free | |
run_input = { "urls": [url] } | |
logger.debug(f"[{actor_name} - FB{fallback_num}] Using simplified input for Text Scraper: {run_input}") | |
elif actor_id == APIFY_CRAWLER_ACTOR_ID: | |
# Input for Website Content Crawler (limit crawl depth) | |
run_input = { | |
"startUrls": [{"url": url}], | |
"maxCrawlPages": 1, # Only crawl the start URL | |
"maxCrawlDepth": 0, # Do not follow links | |
"crawlerType": "playwright:firefox", # Or chromium | |
"maxResults": 1, | |
# You might need to add parameters to extract specific content if default fails | |
# e.g., "pageFunction": "async function pageFunction(context) { return { text: document.body.innerText }; }" | |
} | |
logger.debug(f"[{actor_name} - FB{fallback_num}] Using input for Website Content Crawler: {run_input}") | |
else: | |
logger.error(f"[{actor_name} - FB{fallback_num}] Unknown Apify actor ID: {actor_id}. Cannot determine input format.") | |
return None | |
headers = {"Content-Type": "application/json"} | |
try: | |
async with httpx.AsyncClient(timeout=180.0) as client: # Increased timeout for Apify actors | |
logger.debug(f"[{actor_name} - FB{fallback_num}] POST Request to {sync_items_endpoint} for {url}") | |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input) | |
logger.debug(f"[{actor_name} - FB{fallback_num}] Received status code {response.status_code} for {url}") | |
if response.status_code == 200: | |
try: | |
results = response.json() | |
if isinstance(results, list) and len(results) > 0: | |
item = results[0]; content = None | |
# Prioritize 'text', then 'content', then 'markdown' | |
if "text" in item and isinstance(item["text"], str): content = item["text"] | |
elif "content" in item and isinstance(item["content"], str): content = item["content"] | |
elif "markdown" in item and isinstance(item["markdown"], str): content = item["markdown"] | |
# Fallback: Parse 'html' if other fields are missing | |
elif "html" in item and isinstance(item["html"], str): | |
logger.warning(f"[{actor_name} - FB{fallback_num}] No 'text', 'content', or 'markdown' found, parsing 'html'.") | |
try: | |
soup = BeautifulSoup(item["html"], DEFAULT_PARSER) | |
content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip()) | |
except Exception as bs_err: | |
logger.error(f"[{actor_name} - FB{fallback_num}] Error parsing Apify HTML with BS4: {bs_err}") | |
content = None # Ensure content is None if parsing fails | |
if content and isinstance(content, str): | |
content_stripped = content.strip() | |
if len(content_stripped) > 50: # Increased length check | |
logger.info(f"[{actor_name} - FB{fallback_num}] Success via REST for {url}. Length: {len(content_stripped)}") | |
return content_stripped | |
else: | |
logger.warning(f"[{actor_name} - FB{fallback_num}] Dataset item parsed but text content too short after stripping for {url}. Length: {len(content_stripped)}") | |
return None | |
else: | |
logger.warning(f"[{actor_name} - FB{fallback_num}] Dataset item parsed but text content empty or invalid format for {url}. Item keys: {list(item.keys())}") | |
return None | |
else: logger.warning(f"[{actor_name} - FB{fallback_num}] Actor success but dataset was empty for {url}. Response: {results}"); return None | |
except json.JSONDecodeError: logger.error(f"[{actor_name} - FB{fallback_num}] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}"); return None | |
except Exception as e: logger.error(f"[{actor_name} - FB{fallback_num}] Error processing success response for {url}: {e}", exc_info=True); return None | |
elif response.status_code == 400: logger.error(f"[{actor_name} - FB{fallback_num}] Bad Request (400) for {url}. Check run_input. Resp:{response.text[:200]}"); return None | |
elif response.status_code == 401: logger.error(f"[{actor_name} - FB{fallback_num}] Auth error (401). Check token."); return None | |
elif response.status_code == 404: logger.error(f"[{actor_name} - FB{fallback_num}] Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None | |
else: logger.error(f"[{actor_name} - FB{fallback_num}] Unexpected status {response.status_code} for {url}. Resp:{response.text[:200]}"); return None | |
except httpx.TimeoutException as e: logger.error(f"[{actor_name} - FB{fallback_num}] Timeout during API interaction for {url}: {e}"); return None | |
except httpx.HTTPStatusError as e: logger.error(f"[{actor_name} - FB{fallback_num}] HTTP Status Error during API interaction for {url}: {e}"); return None | |
except httpx.RequestError as e: logger.error(f"[{actor_name} - FB{fallback_num}] Request error during API interaction for {url}: {e}"); return None | |
except Exception as e: logger.error(f"[{actor_name} - FB{fallback_num}] Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None | |
async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]: | |
"""Fallback 5: Fetches website content using Apify Website Content Crawler.""" | |
return await _run_apify_actor_for_web_content( | |
url=url, api_token=api_token, actor_id=APIFY_CRAWLER_ACTOR_ID, | |
actor_name="Apify Crawler", fallback_num=5 | |
) | |
async def get_website_content_via_apify_text_scraper(url: str, api_token: str) -> Optional[str]: | |
"""Fallback 6: Fetches website content using Apify Text Scraper Free.""" | |
return await _run_apify_actor_for_web_content( | |
url=url, api_token=api_token, actor_id=APIFY_TEXT_SCRAPER_ACTOR_ID, | |
actor_name="Apify Text Scraper", fallback_num=6 | |
) | |
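# Every scraper above returns None on failure so the caller can simply move on to the next method in the chain.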
# --- Summarization Functions (Unchanged) --- | |
async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]: | |
"""Internal function to call Gemini API. Returns (summary, error_message).""" | |
global GEMINI_MODEL, _gemini_primary_enabled | |
if not _gemini_primary_enabled: | |
logger.error("[Gemini Primary] Called but is disabled."); | |
return None, "Error: Primary AI service (Gemini) not configured/available." | |
logger.info(f"[Gemini Primary] Generating {summary_type} summary using {GEMINI_MODEL}. Input length: {len(text)}") | |
if summary_type == "paragraph": | |
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n" | |
"• Clear and simple language suitable for someone unfamiliar with the topic.\n" | |
"• Uses British English spellings throughout.\n" | |
"• Straightforward and understandable vocabulary; avoid complex terms.\n" | |
"• Presented as ONE SINGLE PARAGRAPH.\n" | |
"• No more than 85 words maximum; but does not have to be exactly 85.\n" | |
"• Considers the entire text content equally.\n" | |
"• Uses semicolons (;) instead of em dashes (– or —).\n" | |
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" | |
"Here is the text to summarise:") | |
else: # points summary | |
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this format:\n\n" | |
"• For each distinct topic or section identified in the text, create a heading.\n" | |
"• Each heading MUST be plain text without any formatting (e.g., Section Title).\n" | |
"• Immediately following each heading, list the key points as a bulleted list.\n" | |
"• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n" | |
"• The text within each bullet point should NOT contain any bold formatting.\n" | |
"• IMPORTANT: Never use bold formatting (double asterisks) within the text of the bullet points themselves.\n" | |
"• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n" | |
"• Use British English spellings throughout.\n" | |
"• Avoid overly complex or advanced vocabulary.\n" | |
"• Keep bullet points concise.\n" | |
"• Ensure the entire summary takes no more than two minutes to read.\n" | |
"• Consider the entire text's content, not just the beginning or a few topics.\n" | |
"• Use semicolons (;) instead of em dashes (– or —).\n" | |
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" | |
"Here is the text to summarise:") | |
# Gemini 1.5 Flash context window is large, but let's keep a reasonable practical limit | |
MAX_INPUT_LENGTH_GEMINI = 900000 # ~900k characters; stays well within Gemini 1.5 Flash's 1M-token context window
if len(text) > MAX_INPUT_LENGTH_GEMINI: | |
logger.warning(f"[Gemini Primary] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating."); | |
text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)" | |
full_prompt = f"{prompt}\n\n{text}" | |
safety_settings = { HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, } | |
# Check if HARM_CATEGORY_CIVIC_INTEGRITY exists before adding (might vary by SDK version/region) | |
# if hasattr(HarmCategory, 'HARM_CATEGORY_CIVIC_INTEGRITY'): | |
# safety_settings[HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY] = HarmBlockThreshold.BLOCK_NONE | |
logger.debug(f"[Gemini Primary] Using safety settings: { {k.name: v.name for k, v in safety_settings.items()} }") | |
try: | |
logger.debug(f"[Gemini Primary] Initializing model {GEMINI_MODEL}") | |
model = genai.GenerativeModel(GEMINI_MODEL) | |
logger.info(f"[Gemini Primary] Sending request to Gemini ({GEMINI_MODEL})...") | |
request_options = {"timeout": 120} # 120 seconds timeout | |
response = await model.generate_content_async( | |
full_prompt, | |
generation_config=genai.types.GenerationConfig(), # Use default generation config | |
safety_settings=safety_settings, | |
request_options=request_options | |
) | |
logger.info("[Gemini Primary] Received response from Gemini.") | |
# Check for blocking based on prompt feedback first | |
if response.prompt_feedback and response.prompt_feedback.block_reason: | |
block_reason_str = getattr(response.prompt_feedback.block_reason, 'name', str(response.prompt_feedback.block_reason)) | |
logger.warning(f"[Gemini Primary] Request blocked by API based on prompt feedback. Reason: {block_reason_str}"); | |
return None, f"Sorry, the primary AI model ({GEMINI_MODEL}) blocked the request (Reason: {block_reason_str})." | |
# If not blocked by prompt, check candidate content and finish reason | |
summary = None | |
finish_reason_str = 'UNKNOWN' | |
safety_block_reason = None | |
if response.candidates: | |
candidate = response.candidates[0] | |
finish_reason_enum = getattr(candidate, 'finish_reason', None) | |
finish_reason_str = getattr(finish_reason_enum, 'name', 'N/A') if finish_reason_enum else 'N/A' | |
if finish_reason_str == 'SAFETY': | |
safety_ratings_str = "N/A" | |
if hasattr(candidate, 'safety_ratings'): | |
safety_ratings_str = ", ".join([f"{rating.category.name}: {rating.probability.name}" for rating in candidate.safety_ratings]) | |
safety_block_reason = f"SAFETY (Ratings: [{safety_ratings_str}])" | |
logger.warning(f"[Gemini Primary] Candidate blocked due to SAFETY. Finish Reason: {finish_reason_str}. {safety_block_reason}") | |
# Don't return yet, check if response.text fallback works | |
elif finish_reason_str not in ['STOP', 'MAX_TOKENS', 'N/A', None]: # Log unusual reasons | |
logger.warning(f"[Gemini Primary] Candidate finished with non-standard reason: {finish_reason_str}") | |
# Try extracting text from the candidate parts | |
if candidate.content and candidate.content.parts: | |
summary = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text')) | |
# Fallback to response.text if candidate parsing failed or was blocked (but prompt wasn't) | |
if summary is None: | |
try: | |
# This might raise ValueError if the response was fully blocked (e.g., safety) | |
summary = response.text | |
if safety_block_reason: # If we got here despite a safety block, log it | |
logger.warning(f"[Gemini Primary] Got text via response.text despite SAFETY block reason: {safety_block_reason}") | |
except ValueError as e: | |
logger.warning(f"[Gemini Primary] Error accessing response.text (likely blocked response): {e}. Final Finish Reason: {finish_reason_str}") | |
summary = None # Ensure summary is None if .text fails | |
# Final check and return | |
if summary: | |
logger.info(f"[Gemini Primary] Success generating summary. Finish Reason: {finish_reason_str}. Output len: {len(summary)}"); | |
return summary.strip(), None | |
else: | |
# Provide a more specific error if safety was the likely cause | |
error_msg = f"Sorry, the primary AI model ({GEMINI_MODEL}) did not provide a summary (Finish Reason: {finish_reason_str})." | |
if safety_block_reason: | |
error_msg = f"Sorry, the primary AI model ({GEMINI_MODEL}) blocked the response due to safety filters ({finish_reason_str})." | |
logger.warning(f"[Gemini Primary] Gemini returned empty summary or content was blocked. Final Finish Reason: {finish_reason_str}. Safety Block Reason: {safety_block_reason}"); | |
return None, error_msg | |
except AttributeError as ae: | |
# This might happen if the SDK response structure changes | |
logger.error(f"[Gemini Primary] AttributeError during Gemini response processing: {ae}. SDK might be incompatible or response structure unexpected.", exc_info=True) | |
return None, f"Sorry, error processing response from the primary AI ({GEMINI_MODEL})." | |
except Exception as e: | |
# Catch potential network errors, timeouts, etc. | |
logger.error(f"[Gemini Primary] Unexpected error during Gemini API call: {e}", exc_info=True) | |
return None, f"Sorry, unexpected error using primary AI ({GEMINI_MODEL})." | |
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]: | |
"""Internal function to call OpenRouter API (Fallback). Returns (summary, error_message).""" | |
global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled | |
if not _openrouter_fallback_enabled: | |
logger.error("[OpenRouter Fallback] Called but is disabled."); | |
return None, "Error: Fallback AI service (OpenRouter) not configured/available." | |
logger.info(f"[OpenRouter Fallback] Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}") | |
if summary_type == "paragraph": | |
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n" | |
"• Clear and simple language suitable for someone unfamiliar with the topic.\n" | |
"• Uses British English spellings throughout.\n" | |
"• Straightforward and understandable vocabulary; avoid complex terms.\n" | |
"• Presented as ONE SINGLE PARAGRAPH.\n" | |
"• No more than 85 words maximum; but does not have to be exactly 85.\n" | |
"• Considers the entire text content equally.\n" | |
"• Uses semicolons (;) instead of em dashes (– or —).\n" | |
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" | |
"Here is the text to summarise:") | |
else: # points summary | |
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this format:\n\n" | |
"• For each distinct topic or section identified in the text, create a heading.\n" | |
"• Each heading MUST be plain text without any formatting (e.g., Section Title).\n" | |
"• Immediately following each heading, list the key points as a bulleted list.\n" | |
"• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n" | |
"• The text within each bullet point should NOT contain any bold formatting.\n" | |
"• IMPORTANT: Never use bold formatting (double asterisks) within the text of the bullet points themselves.\n" | |
"• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n" | |
"• Use British English spellings throughout.\n" | |
"• Avoid overly complex or advanced vocabulary.\n" | |
"• Keep bullet points concise.\n" | |
"• Ensure the entire summary takes no more than two minutes to read.\n" | |
"• Consider the entire text's content, not just the beginning or a few topics.\n" | |
"• Use semicolons (;) instead of em dashes (– or —).\n" | |
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" | |
"Here is the text to summarise:") | |
    # Check the model's context window if known; otherwise use a conservative character limit.
    # Deepseek Coder 33B has a 16k-token context, so stay below it to leave room for the prompt and output.
    MAX_INPUT_LENGTH_OR = 60000  # roughly 15k tokens at ~4 characters per token
if len(text) > MAX_INPUT_LENGTH_OR: | |
logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}) for {OPENROUTER_MODEL}. Truncating."); | |
text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)" | |
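    # e.g. a 100,000-character article is cut to its first 60,000 characters plus the
    # "... (Content truncated)" marker before being appended to the prompt below.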
full_prompt = f"{prompt}\n\n{text}" | |
headers = { | |
"Authorization": f"Bearer {OPENROUTER_API_KEY}", | |
"Content-Type": "application/json", | |
# Optional, but good practice for OpenRouter identification | |
"HTTP-Referer": "https://github.com/your-repo-or-app-name", # Replace with your repo/app URL | |
"X-Title": "TelegramSummariserBot" # Replace with your app name | |
} | |
payload = { | |
"model": OPENROUTER_MODEL, | |
"messages": [{"role": "user", "content": full_prompt}] | |
# Add optional parameters like temperature, max_tokens if needed | |
# "temperature": 0.7, | |
# "max_tokens": 1024, | |
} | |
openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions" | |
# Increased read timeout as some models can take time | |
api_timeouts = httpx.Timeout(connect=10.0, read=90.0, write=10.0, pool=60.0); | |
response = None | |
try: | |
async with httpx.AsyncClient(timeout=api_timeouts) as client: | |
logger.info(f"[OpenRouter Fallback] Sending request to OpenRouter ({OPENROUTER_MODEL})...") | |
response = await client.post(openrouter_api_endpoint, headers=headers, json=payload) | |
logger.info(f"[OpenRouter Fallback] Received response. Status: {response.status_code}") | |
if response.status_code == 200: | |
try: | |
data = response.json() | |
if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0: | |
choice = data["choices"][0] | |
message = choice.get("message") | |
finish_reason = choice.get("finish_reason", "N/A") | |
if message and isinstance(message, dict): | |
summary = message.get("content") | |
if summary: | |
logger.info(f"[OpenRouter Fallback] Success. Finish: {finish_reason}. Output len: {len(summary)}") | |
return summary.strip(), None | |
else: | |
# Model might return empty content successfully | |
logger.warning(f"[OpenRouter Fallback] Success but content empty. Finish: {finish_reason}. Resp: {data}") | |
return None, f"Fallback AI ({OPENROUTER_MODEL}) returned empty summary (Finish: {finish_reason})." | |
else: | |
logger.error(f"[OpenRouter Fallback] Unexpected message structure: {message}. Finish: {finish_reason}. Full: {data}") | |
return None, "Could not parse fallback AI response (message format)." | |
else: | |
# Check for specific OpenRouter errors in the response body | |
error_details = data.get("error", {}) | |
error_msg = error_details.get("message", "Unknown error in response structure") | |
logger.error(f"[OpenRouter Fallback] Unexpected choices structure or error in response. Error: {error_msg}. Full: {data}") | |
return None, f"Fallback AI response error: {error_msg}." | |
except json.JSONDecodeError: | |
logger.error(f"[OpenRouter Fallback] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:500]}") | |
return None, "Failed to understand fallback AI response." | |
except Exception as e: | |
logger.error(f"[OpenRouter Fallback] Error processing success response: {e}", exc_info=True) | |
return None, "Error processing fallback AI response." | |
# Handle specific HTTP error codes | |
elif response.status_code == 401: | |
logger.error("[OpenRouter Fallback] API key invalid or missing (401).") | |
return None, "Fallback AI authentication failed (check key)." | |
elif response.status_code == 402: | |
logger.error("[OpenRouter Fallback] Payment Required/Quota Exceeded (402).") | |
return None, f"Fallback AI ({OPENROUTER_MODEL}) quota/limit reached." | |
elif response.status_code == 429: | |
logger.warning(f"[OpenRouter Fallback] Rate Limit Exceeded (429) for {OPENROUTER_MODEL}.") | |
return None, f"Fallback AI ({OPENROUTER_MODEL}) is rate-limited. Try again later." | |
elif response.status_code == 500: | |
logger.error(f"[OpenRouter Fallback] OpenRouter Internal Server Error (500). Resp:{response.text[:500]}") | |
return None, f"Fallback AI service ({OPENROUTER_MODEL}) encountered an internal error." | |
else: | |
# General unexpected status code | |
error_info = "" | |
try: # Try to get error message from JSON response | |
error_info = response.json().get("error", {}).get("message", "") | |
except Exception: pass | |
logger.error(f"[OpenRouter Fallback] Unexpected status {response.status_code}. Error: '{error_info}' Resp:{response.text[:500]}"); | |
return None, f"Fallback AI ({OPENROUTER_MODEL}) returned error status {response.status_code}." | |
except httpx.TimeoutException as e: | |
logger.error(f"[OpenRouter Fallback] Timeout error ({type(e)}) connecting to or reading from OpenRouter API: {e}") | |
return None, f"Fallback AI ({OPENROUTER_MODEL}) timed out." | |
except httpx.RequestError as e: | |
logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}") | |
return None, "Error connecting to fallback AI service." | |
except Exception as e: | |
logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter call: {e}", exc_info=True) | |
return None, "Unexpected error using fallback AI service." | |
async def generate_summary(text: str, summary_type: str) -> str: | |
"""Generates summary using Gemini (Primary) and falls back to OpenRouter if needed.""" | |
global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL | |
logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})") | |
final_summary: Optional[str] = None; primary_error_message: Optional[str] = None | |
if _gemini_primary_enabled: | |
logger.info(f"[Summary Generation] Attempting primary AI: Gemini ({GEMINI_MODEL})") | |
final_summary, primary_error_message = await _call_gemini(text, summary_type) | |
if final_summary: logger.info("[Summary Generation] Success with primary AI (Gemini)."); return final_summary | |
else: logger.warning(f"[Summary Generation] Primary AI (Gemini) failed. Error: {primary_error_message}. Proceeding to fallback.") | |
else: logger.warning("[Summary Generation] Primary AI (Gemini) disabled. Proceeding to fallback."); primary_error_message = "Primary AI (Gemini) unavailable." | |
if _openrouter_fallback_enabled: | |
logger.info(f"[Summary Generation] Attempting fallback AI: OpenRouter ({OPENROUTER_MODEL})") | |
fallback_summary, fallback_error_message = await _call_openrouter(text, summary_type) | |
if fallback_summary: logger.info("[Summary Generation] Success with fallback AI (OpenRouter)."); return fallback_summary | |
else: | |
logger.error(f"[Summary Generation] Fallback AI (OpenRouter) also failed. Error: {fallback_error_message}") | |
# Construct a combined error message | |
primary_err = primary_error_message or "Primary AI unavailable" | |
fallback_err = fallback_error_message or "Fallback AI failed with unknown error" | |
return f"Sorry, summarization failed.\nPrimary: {primary_err}\nFallback ({OPENROUTER_MODEL}): {fallback_err}" | |
else: | |
logger.error("[Summary Generation] Fallback AI (OpenRouter) disabled. Cannot proceed.") | |
if primary_error_message: return f"{primary_error_message} Fallback AI is also unavailable." | |
else: return "Error: Both primary and fallback AI services are unavailable." | |
# This line should technically not be reached if logic is sound | |
logger.error("[Summary Generation] Reached end of function unexpectedly.") | |
return "Sorry, unknown error during summary generation." | |
# --- Main Processing Logic (MODIFIED with Crawl4AI and re-ordered fallbacks) --- | |
async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None: | |
"""Handles the entire process: fetching content (Crawl4AI -> Fallbacks) and summarizing.""" | |
task_id = f"{user_id}-{message_id_to_edit or 'new'}"; logger.info(f"[Task {task_id}] Starting processing for URL: {url}") | |
background_request: Optional[BaseRequest] = None; bot: Optional[Bot] = None | |
try: | |
# Use longer timeouts for the background bot to handle potentially long scrapes/summaries | |
background_request = HTTPXRequest( connect_timeout=15.0, read_timeout=240.0, write_timeout=60.0, pool_timeout=240.0 ) | |
bot = Bot(token=bot_token, request=background_request) | |
except Exception as e: | |
logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True) | |
# We cannot proceed without a bot instance | |
return | |
content: Optional[str] = None | |
user_feedback_message: Optional[str] = None | |
success: bool = False | |
# Use the original button message ID if available, otherwise we'll send a new one | |
status_message_id: Optional[int] = message_id_to_edit | |
# Keep track if we sent a *new* message that needs deleting (vs editing the button message) | |
    new_status_message_id: Optional[int] = None
try: | |
# --- 1. Initial User Feedback --- | |
processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (using primary method... this might take a minute)..." | |
if status_message_id: | |
try: | |
# Edit the message containing the buttons | |
await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ) | |
logger.debug(f"[Task {task_id}] Edited button message {status_message_id} to 'Processing'") | |
except (BadRequest, TelegramError) as e: | |
# Common errors: message not modified, message to edit not found, query too old | |
logger.warning(f"[Task {task_id}] Could not edit original button message {status_message_id}: {e}. Will send a new status message.") | |
status_message_id = None # Ensure we send a new message if edit fails | |
except Exception as e: | |
logger.error(f"[Task {task_id}] Unexpected error editing button message {status_message_id}: {e}. Will send new.", exc_info=True) | |
status_message_id = None | |
# If we couldn't edit the original message, send a new one | |
if not status_message_id: | |
try: | |
status_message = await retry_bot_operation( bot.send_message, chat_id=chat_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN ) | |
if status_message: | |
new_status_message_id = status_message.message_id | |
logger.debug(f"[Task {task_id}] Sent new status message {new_status_message_id}") | |
else: | |
# This should ideally not happen due to retry_bot_operation, but handle defensively | |
raise RuntimeError("Failed to send new status message after retries.") | |
except Exception as e: | |
# If we can't even send a status message, we can't proceed meaningfully | |
logger.error(f"[Task {task_id}] CRITICAL: Failed to send initial status message: {e}. Aborting task.", exc_info=True) | |
# Attempt to clean up the original button message if it exists | |
if message_id_to_edit: | |
try: await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=message_id_to_edit) | |
except Exception: pass | |
raise # Re-raise to be caught by outer try/finally | |
# Determine which message ID to update/delete later | |
message_to_update_id = new_status_message_id or status_message_id | |
try: | |
# --- 2. Content Fetching (Chain of methods) --- | |
# Send typing indicator | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}") | |
if is_youtube: | |
# --- YouTube Transcript Logic (Unchanged from original) --- | |
video_id = extract_youtube_id(url) | |
if video_id: | |
content = await get_youtube_transcript(video_id, url) | |
else: | |
user_feedback_message = "Sorry, I couldn't understand that YouTube URL format." | |
# Set feedback message if transcript fetch failed | |
if not content and not user_feedback_message: | |
user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)." | |
else: | |
# --- Website Scraping Logic (NEW Order: Crawl4AI -> Fallbacks) --- | |
global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN | |
global _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists, _crawl4ai_primary_scrape_enabled | |
# Method 0: Primary Scrape (Crawl4AI) | |
logger.info(f"[Task {task_id}] Trying Web Scrape Method 0 (Primary: Crawl4AI)...") | |
if _crawl4ai_primary_scrape_enabled: | |
content = await get_website_content_via_crawl4ai(url) | |
if content: | |
logger.info(f"[Task {task_id}] Method 0 (Crawl4AI) succeeded.") | |
else: | |
logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) failed or returned insufficient content.") | |
# Edit status message to indicate fallback attempt | |
if message_to_update_id: | |
try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=message_to_update_id, text="Primary scrape method failed, trying fallbacks...", parse_mode=ParseMode.MARKDOWN) | |
except Exception: pass # Ignore if edit fails | |
else: | |
logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) skipped - library/driver unavailable.") | |
# Edit status message | |
if message_to_update_id: | |
try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=message_to_update_id, text="Primary scrape method unavailable, trying fallbacks...", parse_mode=ParseMode.MARKDOWN) | |
except Exception: pass | |
# Method 1: Fallback 1 (Direct Fetch + BS4) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 0 failed/skipped. Trying Method 1 (Direct Fetch + BS4)...") | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_direct_bs4(url) | |
if not content: logger.warning(f"[Task {task_id}] Method 1 (Direct Fetch + BS4) failed.") | |
# Method 2: Fallback 2 (urltotext.com) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 2 (urltotext.com)...") | |
if _urltotext_key_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_api(url, URLTOTEXT_API_KEY) | |
if not content: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) failed.") | |
else: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.") | |
# Method 3: Fallback 3 (Scraper's Proxy via RapidAPI) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 2 failed. Trying Method 3 (Scraper's Proxy)...") | |
if _rapidapi_key_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_scrapers_proxy(url, RAPIDAPI_KEY) | |
if not content: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) failed.") | |
else: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) RapidAPI key unavailable. Skipping.") | |
# Method 4: Fallback 4 (AI Web Scraper via RapidAPI) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 3 failed. Trying Method 4 (AI Web Scraper)...") | |
if _rapidapi_key_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_ai_web_scraper(url, RAPIDAPI_KEY) | |
if not content: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) failed.") | |
else: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) RapidAPI key unavailable. Skipping.") | |
# Method 5: Fallback 5 (Apify Website Content Crawler) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 4 failed. Trying Method 5 (Apify Crawler)...") | |
if _apify_token_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_apify_crawler(url, APIFY_API_TOKEN) | |
if not content: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) failed.") | |
else: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) APIFY_API_TOKEN unavailable. Skipping.") | |
# Method 6: Fallback 6 (Apify Text Scraper Free) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 5 failed. Trying Method 6 (Apify Text Scraper)...") | |
if _apify_token_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_apify_text_scraper(url, APIFY_API_TOKEN) | |
if not content: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) failed.") | |
else: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) APIFY_API_TOKEN unavailable. Skipping.") | |
# Final check if all website methods failed | |
if not content and not user_feedback_message: | |
logger.error(f"[Task {task_id}] All web scraping methods failed for {url}.") | |
user_feedback_message = "Sorry, I couldn't fetch readable content from that website using multiple methods (blocked/dynamic content/empty?). Even the advanced crawler failed." | |
# --- 3. Summarization --- | |
if content: | |
logger.info(f"[Task {task_id}] Content fetched successfully (len:{len(content)}). Generating '{summary_type}' summary.") | |
# Update status message before starting potentially long summary generation | |
if message_to_update_id: | |
try: | |
await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=message_to_update_id, text=f"Content fetched! Now generating '{summary_type}' summary with AI...", parse_mode=ParseMode.MARKDOWN, reply_markup=None ) | |
except Exception as edit_e: | |
logger.warning(f"[Task {task_id}] Failed to edit status message before summary generation: {edit_e}") | |
# Send typing indicator again for summary generation | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
final_summary = await generate_summary(content, summary_type) | |
# Check if summary generation itself returned an error message | |
if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"): | |
user_feedback_message = final_summary # Use the error message from generate_summary | |
logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}") | |
else: | |
# Summary successful, send it (potentially in parts) | |
max_length = 4096 # Telegram message length limit | |
if len(final_summary) <= max_length: | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=final_summary, parse_mode=None, link_preview_options={'is_disabled': True} ) | |
else: | |
# Split into parts | |
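                        # Naive line-based split: assumes no single line itself exceeds max_length;
                        # a longer line would still be emitted as an over-length part and rejected by Telegram.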
summary_parts = [] | |
current_part = "" | |
for line in final_summary.splitlines(keepends=True): | |
if len(current_part) + len(line) > max_length: | |
summary_parts.append(current_part) | |
current_part = line | |
else: | |
current_part += line | |
if current_part: # Add the last part | |
summary_parts.append(current_part) | |
logger.info(f"[Task {task_id}] Summary too long ({len(final_summary)} chars), splitting into {len(summary_parts)} parts.") | |
for i, part in enumerate(summary_parts): | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} ) | |
if i < len(summary_parts) - 1: | |
await asyncio.sleep(0.7) # Short delay between parts | |
success = True | |
logger.info(f"[Task {task_id}] Successfully sent summary.") | |
user_feedback_message = None # Clear any previous potential error message | |
# --- 4. Handle Final Failure Feedback --- | |
# If we have a user_feedback_message set at this point, it means something failed | |
# (either content fetching or summarization) | |
if user_feedback_message: | |
logger.warning(f"[Task {task_id}] Process failed. Sending failure feedback: {user_feedback_message}") | |
# Send the failure message as a new message | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} ) | |
except Exception as e: | |
# Catch any unexpected errors during the main processing block | |
logger.error(f"[Task {task_id}] Unexpected error during core processing: {e}", exc_info=True) | |
user_feedback_message = "Oops! Something went wrong while processing your request. Please try again later." | |
try: | |
# Try to send a generic error message | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message ) | |
except Exception as feedback_err: | |
logger.error(f"[Task {task_id}] Failed even to send the generic error feedback message: {feedback_err}") | |
success = False # Ensure success is false | |
except Exception as outer_e: | |
# Catch critical errors (like failure to send initial status message) | |
logger.critical(f"[Task {task_id}] Critical outer error prevented task execution: {outer_e}", exc_info=True) | |
try: | |
if bot: # Check if bot was initialized | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text="❌ A critical internal error occurred. I couldn't process your request." ) | |
except Exception as crit_feedback_err: | |
logger.exception(f"[Task {task_id}] Failed even to send the critical error message: {crit_feedback_err}") | |
success = False # Ensure success is false | |
finally: | |
# --- 5. Cleanup --- | |
# Delete the status message we were updating (either the original button message or the new one we sent) | |
delete_target_id = new_status_message_id if new_status_message_id else status_message_id | |
if delete_target_id and bot: | |
try: | |
await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=delete_target_id) | |
logger.debug(f"[Task {task_id}] Deleted status/button message {delete_target_id}") | |
except (BadRequest, TelegramError) as del_e: | |
# Ignore errors like "message to delete not found" | |
if "not found" not in str(del_e).lower(): | |
logger.warning(f"[Task {task_id}] Failed to delete status/button message {delete_target_id}: {del_e}") | |
except Exception as del_e: | |
logger.warning(f"[Task {task_id}] Unexpected error deleting status/button message {delete_target_id}: {del_e}") | |
        # Close the background bot's underlying HTTPX client if it was created. This reaches into the
        # private _client attribute; newer PTB versions expose request.shutdown() as a public alternative.
if background_request and hasattr(background_request, '_client') and background_request._client: | |
try: | |
await background_request._client.aclose() | |
logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.") | |
except Exception as close_e: | |
logger.warning(f"[Task {task_id}] Error closing background bot's client: {close_e}") | |
logger.info(f"[Task {task_id}] Task finished. Overall Success: {success}") | |
# --- Telegram Handlers --- | |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
    user = update.effective_user
    if not user or not update.message: return
    mention = user.mention_html()
logger.info(f"User {user.id} ({user.username or 'no_username'}) used /start.") | |
await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" ) | |
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
user = update.effective_user | |
if not user or not update.message: return | |
logger.info(f"User {user.id} ({user.username or 'no_username'}) used /help.") | |
help_text = ( "🔍 **How to use this bot:**\n\n" | |
"1. Send me any YouTube video link or website URL.\n" | |
"2. I'll ask how you want it summarised (paragraph or points).\n" | |
"3. Click the button for your choice.\n" | |
"4. Wait while I fetch the content and generate the summary!\n\n" | |
"⚙️ **Website Scraping:** I use an advanced web crawler (`crawl4ai`) first. If that doesn't work, I'll try several fallback methods (direct fetch, APIs) to get the text.\n" | |
"📺 **YouTube:** I try the official library first, then fall back to APIs if needed.\n" | |
"🤖 **Summaries:** I use Google Gemini primarily, with OpenRouter as a backup.\n\n" | |
"**Commands:**\n" | |
"`/start` - Display the welcome message\n" | |
"`/help` - Show this help message" ) | |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN) | |
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
if not update.message or not update.message.text: return | |
message_text = update.message.text.strip(); user = update.effective_user | |
if not user: return | |
# More robust URL extraction using regex - finds the first http(s) link | |
url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE) | |
match = url_pattern.search(message_text) | |
if match: | |
extracted_url = match.group(0) | |
# Clean potential trailing characters like periods or parentheses if message contained more text | |
extracted_url = extracted_url.rstrip(').,') | |
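        # e.g. "Interesting read: https://example.com/article)." yields "https://example.com/article"
        # after the regex match and the trailing-punctuation strip above (example.com is illustrative).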
logger.info(f"User {user.id} ({user.username or 'no_username'}) sent potential URL: {extracted_url}") | |
# Store URL and original message ID in user_data for the callback | |
context.user_data['url_to_summarize'] = extracted_url | |
context.user_data['original_message_id'] = update.message.message_id # Store original message ID if needed later | |
keyboard = [[ InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"), InlineKeyboardButton("Points Summary", callback_data="points") ]] | |
reply_markup = InlineKeyboardMarkup(keyboard) | |
try: | |
# Reply to the original message | |
await update.message.reply_text( | |
f"Okay, I see this link:\n`{extracted_url}`\n\nHow would you like it summarised?", | |
reply_markup=reply_markup, | |
disable_web_page_preview=True, | |
parse_mode=ParseMode.MARKDOWN | |
) | |
except BadRequest as e: | |
if "chat not found" in str(e).lower() or "bot was blocked by the user" in str(e).lower(): | |
logger.warning(f"Could not reply to user {user.id} (chat not found or blocked).") | |
else: | |
logger.error(f"BadRequest replying to URL message from {user.id}: {e}") | |
except Exception as e: | |
logger.error(f"Error replying to URL message from {user.id}: {e}", exc_info=True) | |
else: | |
# If the message filter passed but regex didn't find a URL, log it but don't reply | |
logger.debug(f"Ignoring message from {user.id} - Entity filter matched but no URL found by regex: {message_text[:100]}") | |
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
query = update.callback_query | |
if not query or not query.message or not query.from_user: | |
logger.warning("Callback query received without essential data.") | |
# Attempt to answer the query even if we can't process it, to remove the loading indicator | |
if query: | |
try: await query.answer("Error: Missing data.", show_alert=True) | |
except Exception: pass | |
return | |
user = query.from_user | |
summary_type = query.data | |
query_id = query.id | |
chat_id = query.message.chat_id | |
message_id_to_edit = query.message.message_id # This is the message with the buttons | |
try: | |
# Acknowledge the button press quickly | |
await query.answer() | |
logger.debug(f"Acknowledged callback {query_id} from {user.id} for summary type '{summary_type}'") | |
except BadRequest as e: | |
if "query is too old" in str(e).lower(): | |
logger.warning(f"Callback query {query_id} is too old to answer. User might have double-clicked or waited too long.") | |
# Optionally edit the message to indicate the issue if possible | |
try: await query.edit_message_text(text="This request is too old. Please send the link again.", reply_markup=None) | |
except Exception: pass | |
return # Stop processing if the query is too old | |
else: | |
# Log other BadRequest errors but attempt to continue if acknowledging failed | |
logger.error(f"Error answering callback {query_id}: {e}", exc_info=True) | |
except Exception as e: | |
logger.error(f"Unexpected error answering callback {query_id}: {e}", exc_info=True) | |
# Attempt to continue processing even if answering failed | |
# Retrieve the URL stored in user_data | |
url = context.user_data.get('url_to_summarize') | |
logger.info(f"User {user.id} chose '{summary_type}' for button message {message_id_to_edit}. URL in context: {'Yes' if url else 'No'}") | |
if not url: | |
logger.warning(f"No URL found in context for user {user.id} (callback query {query_id}). Button might be old or context lost.") | |
try: | |
# Edit the button message to inform the user | |
await query.edit_message_text( | |
text="Sorry, I couldn't find the original URL for this request (it might be too old or the bot restarted). Please send the link again.", | |
reply_markup=None # Remove buttons | |
) | |
except (BadRequest, TelegramError) as edit_e: | |
# Ignore errors like "message is not modified" or "message to edit not found" | |
if "not modified" not in str(edit_e).lower() and "not found" not in str(edit_e).lower(): | |
logger.warning(f"Failed to edit 'URL not found' message {message_id_to_edit} for user {user.id}: {edit_e}") | |
except Exception as edit_e: | |
logger.warning(f"Error editing 'URL not found' message {message_id_to_edit} for user {user.id}: {edit_e}") | |
return # Stop processing if URL is missing | |
# Clear the URL from context once retrieved to prevent accidental reuse | |
context.user_data.pop('url_to_summarize', None) | |
context.user_data.pop('original_message_id', None) # Clear original message ID too | |
logger.debug(f"Cleared URL context for user {user.id}") | |
# --- Pre-task Checks --- | |
global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled | |
if not TELEGRAM_TOKEN: | |
logger.critical("FATAL: TELEGRAM_TOKEN missing when trying to start background task!") | |
try: await query.edit_message_text(text="❌ Critical Bot Configuration Error (Missing Token). Cannot proceed.", reply_markup=None) | |
except Exception: pass | |
return | |
if not _gemini_primary_enabled and not _openrouter_fallback_enabled: | |
logger.critical("FATAL: Neither Gemini nor OpenRouter API keys are configured/valid when trying to start background task!") | |
try: await query.edit_message_text(text="❌ Critical AI Configuration Error: No summarization models available. Cannot proceed.", reply_markup=None) | |
except Exception: pass | |
return | |
elif not _gemini_primary_enabled: | |
logger.warning("Primary AI (Gemini) unavailable, relying solely on fallback for this task.") | |
elif not _openrouter_fallback_enabled: | |
logger.warning("Fallback AI (OpenRouter) unavailable, relying solely on primary for this task.") | |
# --- Schedule Background Task --- | |
logger.info(f"Scheduling background task for user {user.id}, chat {chat_id}, button message {message_id_to_edit}, url: {url[:60]}...") | |
asyncio.create_task( | |
process_summary_task( | |
user_id=user.id, | |
chat_id=chat_id, | |
message_id_to_edit=message_id_to_edit, # Pass the button message ID | |
url=url, | |
summary_type=summary_type, | |
bot_token=TELEGRAM_TOKEN | |
), | |
# Name the task for easier debugging if needed | |
name=f"SummaryTask-{user.id}-{message_id_to_edit}" | |
) | |
# Note: The process_summary_task will handle editing/deleting the message_id_to_edit | |
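    # Caveat: asyncio keeps only a weak reference to tasks created this way; if garbage collection of
    # fire-and-forget tasks ever becomes an issue, keep the returned task object in a module-level set.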
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None: | |
"""Log Errors caused by Updates.""" | |
logger.error("Exception while handling an update:", exc_info=context.error) | |
# Optionally add more context if 'update' is an Update object | |
if isinstance(update, Update) and update.effective_chat: | |
logger.error(f"Error occurred in chat {update.effective_chat.id}") | |
# --- Application Setup & Web Framework --- | |
async def setup_bot_config() -> Application: | |
logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN | |
if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.") | |
# Configure HTTPX request settings for the main PTB application | |
custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 ) | |
application = Application.builder().token(TELEGRAM_TOKEN).request(custom_request).build() | |
# --- Add Handlers --- | |
application.add_handler(CommandHandler("start", start)) | |
application.add_handler(CommandHandler("help", help_command)) | |
# Use a filter that catches messages containing URL entities | |
url_filter = filters.Entity("url") | filters.Entity("text_link") | |
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND & url_filter, handle_potential_url)) | |
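    # The handler above matches plain-text messages containing either a raw URL entity
    # (e.g. "https://example.com") or a text_link entity (hyperlinked text); commands are excluded by ~filters.COMMAND.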
# Handler for button clicks (summary type selection) | |
application.add_handler(CallbackQueryHandler(handle_summary_type_callback)) | |
# Error handler | |
application.add_error_handler(error_handler) | |
logger.info("Telegram application handlers configured."); return application | |
@contextlib.asynccontextmanager
async def lifespan(app: Starlette):
global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN | |
logger.info("ASGI Lifespan: Startup initiated..."); | |
if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.") | |
bot_setup_successful = False | |
webhook_set = False | |
try: | |
ptb_app = await setup_bot_config() | |
await ptb_app.initialize() | |
bot_info = await ptb_app.bot.get_me() | |
logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})") | |
bot_setup_successful = True # Mark bot setup as successful here | |
# --- Webhook Setup --- | |
# Check and delete existing webhook first | |
current_webhook_info = await ptb_app.bot.get_webhook_info() | |
if current_webhook_info and current_webhook_info.url: | |
logger.info(f"Found existing webhook: {current_webhook_info.url}. Attempting to delete...") | |
try: | |
if await ptb_app.bot.delete_webhook(drop_pending_updates=True): | |
logger.info("Existing webhook deleted successfully.") | |
else: | |
# API returned False, might not be critical but worth noting | |
logger.warning("Attempt to delete existing webhook returned False from API.") | |
except Exception as e: | |
logger.warning(f"Could not delete existing webhook (Error: {e}). Proceeding with setting new webhook.", exc_info=True) | |
await asyncio.sleep(1) # Short delay after potential delete | |
# Determine webhook URL (assuming deployment provides SPACE_HOST) | |
space_host = os.environ.get("SPACE_HOST") | |
if not space_host: | |
logger.critical("SPACE_HOST environment variable not found. Cannot set webhook.") | |
raise RuntimeError("SPACE_HOST environment variable missing.") | |
webhook_path = "/webhook" # Matches the route defined later | |
# Ensure correct protocol and clean host formatting | |
protocol = "https" | |
host = space_host.split('://')[-1].rstrip('/') # Remove trailing slashes | |
full_webhook_url = f"{protocol}://{host}{webhook_path}" | |
logger.info(f"Calculated webhook URL: {full_webhook_url}") | |
# Set the new webhook | |
set_webhook_args = { | |
"url": full_webhook_url, | |
"allowed_updates": Update.ALL_TYPES, # Receive all update types | |
"drop_pending_updates": True # Ignore updates while bot was down | |
} | |
if WEBHOOK_SECRET: | |
set_webhook_args["secret_token"] = WEBHOOK_SECRET | |
logger.info("Webhook secret token will be used.") | |
else: | |
logger.info("No webhook secret token configured.") | |
# Give network/DNS a moment before setting | |
await asyncio.sleep(1.5) | |
logger.info(f"Attempting to set webhook to: {full_webhook_url} with args: {set_webhook_args}") | |
await ptb_app.bot.set_webhook(**set_webhook_args) | |
# Verify webhook setup | |
await asyncio.sleep(1) # Allow time for info propagation | |
new_webhook_info = await ptb_app.bot.get_webhook_info() | |
if new_webhook_info.url == full_webhook_url: | |
logger.info(f"Webhook successfully set: URL='{new_webhook_info.url}', Secret Token Set={bool(WEBHOOK_SECRET)}") | |
webhook_set = True | |
else: | |
logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', but GET response shows '{new_webhook_info.url}'. Check firewall/proxy/platform settings.") | |
# Decide whether to raise an error or try to continue | |
# For now, let's raise an error as webhook is critical | |
raise RuntimeError("Failed to verify webhook URL after setting.") | |
# Start the PTB application processing | |
await ptb_app.start() | |
logger.info("PTB Application started and polling for updates via webhook.") | |
logger.info("ASGI Lifespan: Startup complete."); yield # Application runs here | |
except Exception as startup_err: | |
logger.critical(f"Application startup failed: {startup_err}", exc_info=True) | |
# Attempt cleanup even if startup failed partially | |
if ptb_app and bot_setup_successful: | |
if ptb_app.running: | |
try: await ptb_app.stop() | |
except Exception as stop_err: logger.error(f"Error stopping PTB app during failed startup: {stop_err}") | |
# Try to delete webhook if it was potentially set | |
if webhook_set: | |
try: | |
logger.info("Attempting to delete webhook due to startup failure...") | |
await ptb_app.bot.delete_webhook(drop_pending_updates=True) | |
logger.info("Webhook deleted during failed startup cleanup.") | |
except Exception as del_wh_err: logger.error(f"Failed to delete webhook during failed startup cleanup: {del_wh_err}") | |
try: await ptb_app.shutdown() | |
except Exception as shutdown_err: logger.error(f"Error shutting down PTB app during failed startup: {shutdown_err}") | |
raise # Re-raise the original startup error | |
finally: | |
# --- Shutdown Logic --- | |
logger.info("ASGI Lifespan: Shutdown initiated...") | |
if ptb_app and bot_setup_successful: | |
# Stop PTB app first | |
if ptb_app.running: | |
logger.info("Stopping PTB Application processing...") | |
try: await ptb_app.stop() | |
except Exception as e: logger.error(f"Error stopping PTB application: {e}") | |
else: logger.info("PTB Application was not running.") | |
# Delete webhook before shutting down fully | |
try: | |
logger.info("Attempting to delete webhook on shutdown...") | |
if ptb_app.bot and hasattr(ptb_app.bot, 'delete_webhook'): | |
# Check if webhook is actually set before trying to delete | |
current_wh_info = await ptb_app.bot.get_webhook_info() | |
if current_wh_info and current_wh_info.url: | |
if await ptb_app.bot.delete_webhook(drop_pending_updates=True): | |
logger.info("Webhook deleted successfully on shutdown.") | |
else: | |
logger.warning("Failed to delete webhook on shutdown (API returned False).") | |
else: | |
logger.info("No webhook was set, skipping deletion.") | |
else: | |
logger.warning("Cannot delete webhook: Bot object unavailable or doesn't support delete_webhook.") | |
except Exception as e: | |
logger.warning(f"Could not delete webhook during shutdown: {e}", exc_info=False) | |
# Shutdown PTB application resources | |
logger.info("Shutting down PTB Application resources...") | |
try: await ptb_app.shutdown() | |
except Exception as e: logger.error(f"Error during PTB application shutdown: {e}") | |
logger.info("PTB Application shut down.") | |
else: | |
logger.info("PTB app not fully initialized or setup failed. Skipping PTB shutdown steps.") | |
logger.info("ASGI Lifespan: Shutdown complete.") | |
async def health_check(request: Request) -> PlainTextResponse: | |
"""Simple health check endpoint.""" | |
global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled | |
global _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY, _crawl4ai_primary_scrape_enabled | |
bot_status = "Not Initialized"; bot_username = "N/A" | |
if ptb_app and ptb_app.bot: | |
try: | |
# Check if the application is running (processing updates) | |
app_running = ptb_app.running | |
# Try to get bot info regardless of running state if bot object exists | |
bot_info = await ptb_app.bot.get_me() | |
bot_username = f"@{bot_info.username}" if bot_info and bot_info.username else "Info Fetch Error" | |
if app_running: | |
bot_status = "Running" | |
else: | |
# If initialized but not running (e.g., during startup/shutdown) | |
bot_status = "Initialized (Not Processing Updates)" | |
except (TimedOut, NetworkError) as net_err: | |
bot_status = f"Network Error checking status: {type(net_err).__name__}" | |
bot_username = "N/A (Network Error)" | |
logger.warning(f"Health check: Network error getting bot info: {net_err}") | |
except Exception as e: | |
bot_status = f"Error checking status: {type(e).__name__}" | |
bot_username = "N/A (Error)" | |
logger.warning(f"Health check: Error getting bot info: {e}", exc_info=False) | |
elif ptb_app: | |
bot_status = "Initialized (Bot object missing?)" | |
bot_username = "N/A" | |
else: | |
bot_status = "Not Initialized" | |
bot_username = "N/A" | |
# Construct the response string | |
response_lines = [ | |
f"TG Bot Summariser - Status: {bot_status} ({bot_username})", | |
"--- Summarization ---", | |
f"Primary Model (Gemini): {GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'}", | |
f"Fallback Model (OpenRouter): {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'}", | |
"--- YouTube Transcripts ---", | |
"Primary (Lib): Enabled", | |
f"Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled (Key Missing)'}", | |
f"Fallback 2 (Apify Actor): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED (Token Missing)'}", | |
"--- Website Scraping ---", | |
f"Primary (Crawl4AI): {'Enabled' if _crawl4ai_primary_scrape_enabled else 'DISABLED (Library/Driver Missing?)'}", | |
"Fallback 1 (Direct+BS4): Enabled", | |
f"Fallback 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled (Key Missing)'}", | |
f"Fallback 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled (Key Missing)'}", | |
f"Fallback 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled (Token Missing)'}" | |
] | |
return PlainTextResponse("\n".join(response_lines)) | |
async def telegram_webhook(request: Request) -> Response: | |
"""Handles incoming updates from Telegram.""" | |
global ptb_app, WEBHOOK_SECRET # Ensure ptb_app is accessible | |
# --- Basic Checks --- | |
if not ptb_app: | |
logger.error("Webhook received but PTB application is not initialized.") | |
return PlainTextResponse('Bot application not initialized', status_code=503) # Service Unavailable | |
if not ptb_app.bot: | |
logger.error("Webhook received but PTB bot object is not available.") | |
return PlainTextResponse('Bot object not available', status_code=503) | |
if not ptb_app.running: | |
logger.warning("Webhook received but PTB application is not running (likely startup/shutdown).") | |
# Return 200 OK to Telegram to prevent retries, but log the warning. | |
return PlainTextResponse('Bot not actively processing', status_code=200) | |
# --- Security Check (Secret Token) --- | |
if WEBHOOK_SECRET: | |
token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token") | |
if not token_header: | |
logger.warning("Webhook received request MISSING secret token header, but one is configured.") | |
return Response(content="Forbidden: Missing secret token", status_code=403) | |
if token_header != WEBHOOK_SECRET: | |
logger.warning(f"Webhook received INVALID secret token. Header: '{token_header[:5]}...'") | |
return Response(content="Forbidden: Invalid secret token", status_code=403) | |
# If token matches, proceed | |
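        # Telegram echoes the value passed as secret_token to set_webhook in the
        # "X-Telegram-Bot-Api-Secret-Token" header of every webhook request; that header is what is compared above.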
# --- Process Update --- | |
try: | |
update_data = await request.json() | |
        update = Update.de_json(data=update_data, bot=ptb_app.bot)
        if update is None:
            logger.error("Webhook payload did not parse into a Telegram Update object.")
            return PlainTextResponse('Bad Request: Invalid update payload', status_code=400)
        logger.debug(f"Processing update_id: {update.update_id} via webhook")
# Use PTB's built-in update processing queue | |
await ptb_app.process_update(update) | |
# Return 200 OK to Telegram quickly after queuing the update | |
return Response(status_code=200) | |
except json.JSONDecodeError: | |
logger.error("Webhook received invalid JSON data.") | |
return PlainTextResponse('Bad Request: Invalid JSON', status_code=400) | |
except Exception as e: | |
# Log the error, but return 200 OK to prevent Telegram from resending the faulty update | |
logger.error(f"Error processing webhook update: {e}", exc_info=True) | |
return Response(status_code=200) | |
# --- Starlette App Definition --- | |
app = Starlette( | |
debug=False, # Set to False for production | |
lifespan=lifespan, | |
routes=[ | |
Route("/", endpoint=health_check, methods=["GET"]), | |
Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), | |
] | |
) | |
logger.info("Starlette ASGI application created with health check ('/') and Telegram webhook ('/webhook') routes.") | |
# --- Development Server & Playwright Check --- | |
if __name__ == '__main__': | |
import uvicorn | |
logger.warning("Running in development mode using Uvicorn directly - NOT recommended for production!") | |
# Check for Playwright installation on startup in dev mode | |
playwright_installed = False | |
try: | |
from playwright.async_api import async_playwright | |
playwright_installed = True | |
logger.info("Playwright library found.") | |
# Optional: Add playwright install command here if needed for dev | |
# Consider running `playwright install --with-deps` manually in your dev env | |
except ImportError: | |
logger.critical("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
logger.critical("Playwright library not found. Crawl4AI (Primary Scraper) WILL FAIL.") | |
logger.critical("Install it: pip install playwright") | |
logger.critical("Then install browsers: playwright install --with-deps") | |
logger.critical("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
# Check Crawl4AI explicitly | |
if not _crawl4ai_available: | |
logger.critical("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
logger.critical("Crawl4AI library not found. Primary Scraper WILL BE DISABLED.") | |
logger.critical("Install it: pip install crawl4ai") | |
logger.critical("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
elif not playwright_installed: | |
logger.warning("Crawl4AI is installed, but Playwright is missing. Crawl4AI will likely fail without Playwright drivers.") | |
# Get log level and port from environment or use defaults | |
log_level = os.environ.get("LOGGING_LEVEL", "info").lower() | |
local_port = int(os.environ.get('PORT', 8080)) # Use PORT env var, default 8080 | |
# Run Uvicorn | |
uvicorn.run( | |
"__main__:app", | |
host='0.0.0.0', # Listen on all interfaces | |
port=local_port, | |
log_level=log_level, | |
reload=True # Enable auto-reload for development | |
) |