# main.py (Corrected SyntaxError at line 424 - Now with Gemini 2.0 as primary AND new Apify scraping fallbacks AND Crawl4AI as Primary Scraper)
import os
import re
import logging
import asyncio
import json
import html
import contextlib
import traceback
import urllib.parse # Added for URL encoding
from typing import Optional, Dict, Any, Tuple, Union # Added Union
# --- Frameworks ---
from starlette.applications import Starlette
from starlette.routing import Route
from starlette.responses import PlainTextResponse, JSONResponse, Response
from starlette.requests import Request
# --- Telegram Bot ---
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot
from telegram.ext import (
    Application,
    CommandHandler,
    MessageHandler,
    filters,
    ContextTypes,
    CallbackQueryHandler,
)
from telegram.constants import ParseMode
from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest, TelegramError
from telegram.request import HTTPXRequest, BaseRequest
# --- Other Libraries ---
import httpx
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, before_sleep_log
try:
    import lxml
    DEFAULT_PARSER = 'lxml'
except ImportError:
    DEFAULT_PARSER = 'html.parser'
# --- Google Gemini ---
try:
    import google.generativeai as genai
    from google.generativeai.types import HarmCategory, HarmBlockThreshold
    _gemini_available = True
except ImportError:
    genai = None
    HarmCategory = None
    HarmBlockThreshold = None
    _gemini_available = False
    # logger will be defined later, log warning after logger setup
# --- Crawl4AI (NEW Primary Scraper) ---
try:
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig, CacheMode, CrawlResult
    from crawl4ai.models import MarkdownGenerationResult # Specific import for type hint
    _crawl4ai_available = True
except ImportError:
    AsyncWebCrawler = None
    CrawlerRunConfig = None
    BrowserConfig = None
    CacheMode = None
    CrawlResult = None
    MarkdownGenerationResult = None
    _crawl4ai_available = False
    # logger will be defined later, log warning after logger setup
# --- Logging Setup ---
logging.basicConfig( format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO )
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("telegram.ext").setLevel(logging.INFO)
logging.getLogger('telegram.bot').setLevel(logging.INFO)
logging.getLogger("urllib3").setLevel(logging.INFO)
logging.getLogger('gunicorn.error').setLevel(logging.INFO)
logging.getLogger('uvicorn').setLevel(logging.INFO)
logging.getLogger('starlette').setLevel(logging.INFO)
if _gemini_available: logging.getLogger("google.ai.generativelanguage").setLevel(logging.WARNING)
# Crawl4AI logger (optional, adjust level as needed)
if _crawl4ai_available: logging.getLogger("crawl4ai").setLevel(logging.WARNING) # Keep logs cleaner unless debugging C4AI
logger = logging.getLogger(__name__)
logger.info(f"Logging configured. Using BS4 parser: {DEFAULT_PARSER}")
if not _gemini_available: logger.warning("google-generativeai library not found. Gemini functionality disabled.")
if not _crawl4ai_available: logger.warning("crawl4ai library not found. Primary Web Scraping (Crawl4AI) disabled.")
# --- Global variable for PTB app ---
ptb_app: Optional[Application] = None
# --- Environment Variable Loading & Configuration ---
logger.info("Attempting to load secrets and configuration...")
def get_secret(secret_name):
    value = os.environ.get(secret_name)
    if value: status = "Found"; log_length = min(len(value), 8); value_start = value[:log_length]; logger.info(f"Secret '{secret_name}': {status} (Value starts with: {value_start}...)")
    else: status = "Not Found"; logger.warning(f"Secret '{secret_name}': {status}")
    return value
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY') # Summarizer Fallback
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY') # Scrape Fallback 2 (WAS 1)
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY') # YT Fallback 1
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN') # YT Fallback 2 + Scrape Fallbacks 5 & 6 (WAS 4 & 5)
RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY') # Scrape Fallbacks 3 & 4 (WAS 2 & 3)
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
GEMINI_API_KEY = get_secret('GEMINI_API_KEY') # Primary Summarizer
# Models (User can still configure via env vars)
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Model
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts") # Default YT Actor
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001") # Primary Model
# Specific Actor IDs for Website Scraping Fallbacks
# Note: Apify's run-sync API endpoints expect the tilde-separated "username~actor-name" form in the URL path.
APIFY_CRAWLER_ACTOR_ID = "apify~website-content-crawler" # Fallback 5 (WAS 4)
APIFY_TEXT_SCRAPER_ACTOR_ID = "karamelo~text-scraper-free" # Fallback 6 (WAS 5)
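# Illustrative environment configuration (every value below is a placeholder, not a real credential):
#   TELEGRAM_TOKEN=123456:ABC-your-bot-token
#   GEMINI_API_KEY=your-gemini-key          # enables the primary summarizer
#   OPENROUTER_API_KEY=your-openrouter-key  # enables the fallback summarizer
#   APIFY_API_TOKEN=your-apify-token        # enables YT fallback 2 and web scrape fallbacks 5 & 6
#   RAPIDAPI_KEY=your-rapidapi-key          # enables web scrape fallbacks 3 & 4
# OPENROUTER_MODEL, APIFY_ACTOR_ID and GEMINI_MODEL may also be overridden; the defaults above apply otherwise.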
if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
if not GEMINI_API_KEY: logger.error("❌ ERROR: GEMINI_API_KEY not found. Primary summarization (Gemini) will fail.")
if not OPENROUTER_API_KEY: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback summarization will fail.")
if not RAPIDAPI_KEY: logger.warning("⚠️ WARNING: RAPIDAPI_KEY not found. RapidAPI scraping fallbacks (3 & 4) will be unavailable.") # Updated numbers
if not APIFY_API_TOKEN: logger.warning("⚠️ WARNING: APIFY_API_TOKEN not found. YT transcript fallback (2) and Website scraping fallbacks (5 & 6) will be unavailable.") # Updated numbers
_gemini_primary_enabled = _gemini_available and bool(GEMINI_API_KEY)
if not _gemini_available: logger.warning("⚠️ WARNING: google-generativeai library missing. Gemini disabled.")
elif not GEMINI_API_KEY: logger.warning("⚠️ WARNING: GEMINI_API_KEY not found or empty. Gemini disabled.")
_openrouter_fallback_enabled = bool(OPENROUTER_API_KEY)
if not _openrouter_fallback_enabled: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback disabled.")
_crawl4ai_primary_scrape_enabled = _crawl4ai_available # Check if library loaded
if not _crawl4ai_available: logger.error("❌ ERROR: crawl4ai library missing. Primary web scraping disabled. Will attempt fallbacks immediately.")
if not URLTOTEXT_API_KEY: logger.warning("Optional secret 'URLTOTEXT_API_KEY' not found. Web scraping fallback 2 unavailable.") # Updated number
if not SUPADATA_API_KEY: logger.warning("Optional secret 'SUPADATA_API_KEY' not found. YT transcript fallback 1 unavailable.")
# APIFY_API_TOKEN warning handled above
# RAPIDAPI_KEY warning handled above
if not WEBHOOK_SECRET: logger.info("Optional secret 'WEBHOOK_SECRET' not found. Webhook security disabled.")
logger.info("Secret loading and configuration check finished.")
logger.info(f"Primary Web Scraper (Crawl4AI): {'ENABLED' if _crawl4ai_primary_scrape_enabled else 'DISABLED'}")
logger.info(f"Using Gemini Model (Primary Summarizer): {GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'}")
logger.info(f"Using OpenRouter Model (Fallback Summarizer): {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'}")
logger.info(f"Using Apify Actor (YT Default): {APIFY_ACTOR_ID}")
logger.info(f"Using Apify Actor (Web Scrape Fallback 5): {APIFY_CRAWLER_ACTOR_ID}")
logger.info(f"Using Apify Actor (Web Scrape Fallback 6): {APIFY_TEXT_SCRAPER_ACTOR_ID}")
_apify_token_exists = bool(APIFY_API_TOKEN)
_urltotext_key_exists = bool(URLTOTEXT_API_KEY)
_rapidapi_key_exists = bool(RAPIDAPI_KEY)
if _gemini_primary_enabled:
    try: genai.configure(api_key=GEMINI_API_KEY); logger.info("Google GenAI client configured successfully.")
    except Exception as e: logger.error(f"Failed to configure Google GenAI client: {e}"); _gemini_primary_enabled = False
# --- Retry Decorator (Unchanged) ---
async def retry_bot_operation(func, *args, **kwargs):
    try: return await func(*args, **kwargs)
    except BadRequest as e:
        ignore_errors = [ "message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user", ]
        if any(err in str(e).lower() for err in ignore_errors): logger.warning(f"Ignoring non-critical BadRequest: {e}"); return None
        logger.error(f"Potentially critical BadRequest: {e}"); raise
    except TelegramError as e: logger.warning(f"TelegramError (will retry if applicable): {e}"); raise
    except Exception as e: logger.error(f"Unexpected error during bot operation: {e}", exc_info=True); raise
# --- Helper Functions (Unchanged) ---
def is_youtube_url(url):
    youtube_regex = re.compile( r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/' r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE)
    match = youtube_regex.search(url); logger.debug(f"is_youtube_url '{url}': {bool(match)}"); return bool(match)
def extract_youtube_id(url):
    youtube_regex = re.compile( r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/' r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE)
    match = youtube_regex.search(url)
    if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id
    else: logger.warning(f"Could not extract YT ID from {url}"); return None
# --- Content Fetching Functions ---
# --- YouTube Transcript Fetching (Unchanged) ---
# ... (get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript remain the same) ...
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
    if not video_id: logger.error("[Supadata] No video_id provided"); return None
    if not api_key: logger.error("[Supadata] API key missing."); return None
    logger.info(f"[YT Fallback 1] Attempting fetch for video ID: {video_id} via Supadata")
    api_endpoint = "https://api.supadata.ai/v1/youtube/transcript"
    params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key}
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(api_endpoint, headers=headers, params=params)
            logger.debug(f"[Supadata] Status code {response.status_code} for {video_id}")
            if response.status_code == 200:
                try:
                    data = response.json() if response.text else None # Check if text exists before json decode
                    content = None
                    if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
                    if not content and response.text: content = response.text # Fallback to raw text if json parse fails or content key missing
                    if content and isinstance(content, str): logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}"); return content.strip()
                    else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
                except json.JSONDecodeError: logger.warning(f"[Supadata] Received 200 but failed JSON decode for {video_id}. Using raw text if available. Response: {response.text[:200]}"); return response.text.strip() if response.text else None
                except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None
            elif response.status_code in [401, 403]: logger.error(f"[Supadata] Auth error ({response.status_code}). Check API key."); return None
            elif response.status_code == 404: logger.warning(f"[Supadata] Not found (404) for {video_id}."); return None
            else: logger.error(f"[Supadata] Unexpected status {response.status_code} for {video_id}. Resp: {response.text[:200]}"); return None
    except httpx.TimeoutException: logger.error(f"[Supadata] Timeout connecting for {video_id}"); return None
    except httpx.RequestError as e:
        if "CERTIFICATE_VERIFY_FAILED" in str(e): logger.error(f"[Supadata] SSL Cert Verify Failed for {video_id}: {e}")
        else: logger.error(f"[Supadata] Request error for {video_id}: {e}")
        return None
    except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
    """Fallback YT 2: Fetches YouTube transcript using default Apify Actor."""
    global APIFY_ACTOR_ID # Uses the default YT actor ID
    if not video_url: logger.error("[Apify YT] No video_url provided"); return None
    if not api_token: logger.error("[Apify YT] API token missing."); return None
    logger.info(f"[YT Fallback 2] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
    sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
    params = {"token": api_token}
    # Input specific to karamelo~youtube-transcripts actor
    payload = {
        "urls": [video_url],
        "outputFormat": "singleStringText",
        "maxRetries": 5,
        "channelHandleBoolean": False,
        "channelNameBoolean": False,
        "datePublishedBoolean": False,
        "relativeDateTextBoolean": False,
    }
    headers = {"Content-Type": "application/json"}
    try:
        async with httpx.AsyncClient(timeout=120.0) as client: # Long timeout for potential YT processing
            logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}")
            response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
            logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}")
            if response.status_code == 200:
                try:
                    results = response.json()
                    if isinstance(results, list) and len(results) > 0:
                        item = results[0]
                        content = None
                        # Check common keys for transcript text
                        if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
                        elif "text" in item and isinstance(item["text"], str): content = item["text"]
                        elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"]
                        elif "captions" in item and isinstance(item["captions"], list): # Handle list format if needed
                            if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
                            elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])
                        if content and isinstance(content, str): logger.info(f"[Apify YT] Success via REST for {video_url}. Length: {len(content)}"); return content.strip()
                        else: logger.warning(f"[Apify YT] Dataset item parsed but transcript content empty/invalid format for {video_url}. Item keys: {list(item.keys())}"); return None
                    else: logger.warning(f"[Apify YT] Actor success but dataset was empty for {video_url}. Response: {results}"); return None
                except json.JSONDecodeError: logger.error(f"[Apify YT] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}"); return None
                except Exception as e: logger.error(f"[Apify YT] Error processing success response for {video_url}: {e}", exc_info=True); return None
            elif response.status_code == 400: logger.error(f"[Apify YT] Bad Request (400) for {video_url}. Check payload. Resp:{response.text[:200]}"); return None
            elif response.status_code == 401: logger.error("[Apify YT] Auth error (401). Check token."); return None
            elif response.status_code == 404: logger.error(f"[Apify YT] Endpoint/Actor Not Found (404). Actor: {APIFY_ACTOR_ID} Resp:{response.text[:200]}"); return None
            else: logger.error(f"[Apify YT] Unexpected status {response.status_code} for {video_url}. Resp:{response.text[:200]}"); return None
    except httpx.TimeoutException as e: logger.error(f"[Apify YT] Timeout during API interaction for {video_url}: {e}"); return None
    except httpx.HTTPStatusError as e: logger.error(f"[Apify YT] HTTP Status Error during API interaction for {video_url}: {e}"); return None
    except httpx.RequestError as e: logger.error(f"[Apify YT] Request error during API interaction for {video_url}: {e}"); return None
    except Exception as e: logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True); return None
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
    global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists # Added _apify_token_exists global ref
    if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
    transcript_text = None
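    # Order of attempts: 1) youtube-transcript-api (local library), 2) Supadata API, 3) Apify actor.
    # Each step returns as soon as it yields non-empty text; otherwise the next one is tried.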
    # Method 1: youtube-transcript-api (Primary)
    logger.info("[Primary YT] Attempting youtube-transcript-api...")
    try:
        transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
        if transcript_list: transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
        if transcript_text: logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})"); return transcript_text
        else: logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}"); transcript_text = None
    except NoTranscriptFound: logger.warning(f"[Primary YT] No transcript found via lib for {video_id}.")
    except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
    except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None
    # Method 2: Supadata (Fallback 1)
    if transcript_text is None:
        logger.info("[Fallback YT 1] Trying Supadata API...")
        if SUPADATA_API_KEY:
            transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
            if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
            else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
        else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
    # Method 3: Apify (Fallback 2 - Default YT Actor)
    if transcript_text is None:
        logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...")
        if _apify_token_exists: # Use the global flag
            transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
            if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text
            else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.")
        else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
    # Final Result
    if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
    return transcript_text
# --- Website Content Fetching (MODIFIED SECTION) ---
# --- NEW Primary Method: Crawl4AI ---
async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
    """Primary Web Method: Fetches and extracts content using Crawl4AI."""
    global _crawl4ai_primary_scrape_enabled
    if not _crawl4ai_primary_scrape_enabled:
        logger.warning("[Crawl4AI Primary] Called but library is unavailable.")
        return None
    if not url: logger.error("[Crawl4AI Primary] No URL provided"); return None
    logger.info(f"[Crawl4AI Primary] Attempting fetch and extraction for: {url}")
    # Basic Configuration for the crawl run
    # Prioritize 'fit_markdown' for cleaner text suitable for LLMs
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS, # Avoid caching issues during testing/initial use
        page_timeout=60000, # 60 second timeout for page load/JS
        verbose=False, # Keep logs cleaner unless debugging C4AI specifically
        # We rely on the default markdown generator which should produce result.markdown object
        # Optional: Could add word_count_threshold if needed later
        # Optional: Could add wait_for if specific elements need to load
    )
    # Basic Browser Config (can be customized further if needed)
    # Using defaults: headless=True, browser_type='chromium'
    # browser_config = BrowserConfig(headless=True, verbose=False)
    extracted_text: Optional[str] = None
    try:
        # Use async context manager for proper resource handling
        # Pass browser_config here if defined: async with AsyncWebCrawler(config=browser_config) as crawler:
        async with AsyncWebCrawler() as crawler:
            logger.debug(f"[Crawl4AI Primary] Calling crawler.arun for {url}")
            result: CrawlResult = await crawler.arun(url=url, config=run_config)
            logger.debug(f"[Crawl4AI Primary] arun completed for {url}. Success: {result.success}, Status: {result.status_code}")
            if result.success:
                # Check for markdown result object first
                if result.markdown and isinstance(result.markdown, MarkdownGenerationResult):
                    # Prioritize fit_markdown
                    if result.markdown.fit_markdown and isinstance(result.markdown.fit_markdown, str):
                        extracted_text = result.markdown.fit_markdown.strip()
                        logger.debug(f"[Crawl4AI Primary] Using fit_markdown for {url}")
                    # Fallback to raw_markdown if fit_markdown is empty/missing
                    elif result.markdown.raw_markdown and isinstance(result.markdown.raw_markdown, str):
                        extracted_text = result.markdown.raw_markdown.strip()
                        logger.debug(f"[Crawl4AI Primary] Using raw_markdown (fit_markdown unavailable) for {url}")
                    else:
                        logger.warning(f"[Crawl4AI Primary] Crawl success, but markdown object contains no usable text content for {url}")
                # Fallback check (less likely with newer versions, but safe)
                elif result.markdown and isinstance(result.markdown, str):
                    extracted_text = result.markdown.strip()
                    logger.debug(f"[Crawl4AI Primary] Using direct result.markdown string for {url}")
                elif result.cleaned_html: # Absolute fallback: Parse cleaned_html with BS4 if no markdown
                    logger.warning(f"[Crawl4AI Primary] No markdown found, attempting to parse cleaned_html with BS4 for {url}")
                    try:
                        soup = BeautifulSoup(result.cleaned_html, DEFAULT_PARSER)
                        extracted_text = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                    except Exception as bs_err:
                        logger.error(f"[Crawl4AI Primary] Error parsing cleaned_html with BS4 for {url}: {bs_err}")
                        extracted_text = None
                else:
                    logger.warning(f"[Crawl4AI Primary] Crawl success but no markdown or cleaned_html found for {url}")
                # Basic length check
                if extracted_text and len(extracted_text) > 30:
                    logger.info(f"[Crawl4AI Primary] Success via Crawl4AI for {url}. Length: {len(extracted_text)}")
                    return extracted_text
                else:
                    content_len = len(extracted_text) if extracted_text else 0
                    logger.warning(f"[Crawl4AI Primary] Crawl4AI success but extracted text too short or empty for {url}. Length: {content_len}")
                    return None
            else:
                # Crawl failed
                error_msg = result.error_message or f"Unknown error (status code: {result.status_code})"
                logger.error(f"[Crawl4AI Primary] Crawl failed for {url}. Error: {error_msg}")
                return None
    except asyncio.TimeoutError:
        logger.error(f"[Crawl4AI Primary] Timeout error during crawl for {url}")
        return None
    except Exception as e:
        logger.error(f"[Crawl4AI Primary] Unexpected error during Crawl4AI execution for {url}: {e}", exc_info=True)
        return None
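# Minimal usage sketch (not executed here): the coroutine can be exercised standalone, e.g.
#   asyncio.run(get_website_content_via_crawl4ai("https://example.com/article"))
# which returns the extracted markdown/text, or None if Crawl4AI is unavailable or the crawl fails.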
# --- Fallback 1: Direct Fetch + BS4 (Previously Primary) ---
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
    """Directly fetches URL content using httpx. (Fallback Web Method 1 - Fetching part)"""
    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
            logger.debug(f"[Web Scrape Fallback 1] Sending GET request to {url}")
            response = await client.get(url)
            logger.debug(f"[Web Scrape Fallback 1] Received response {response.status_code} from {url}")
            response.raise_for_status() # Raise HTTPStatusError for 4xx/5xx
            content_type = response.headers.get('content-type', '').lower()
            if 'html' not in content_type and 'xml' not in content_type: # Allow xml just in case
                logger.warning(f"[Web Scrape Fallback 1] Non-HTML/XML content type received from {url}: {content_type}")
                # Allow processing if text/plain, might be simple text page
                if 'text/plain' in content_type:
                    logger.info(f"[Web Scrape Fallback 1] Content type is text/plain for {url}, attempting to read.")
                    return response.text # Return plain text directly
                return None # Skip other non-html types
            try:
                return response.text # Attempt to decode text, handle potential errors
            except Exception as e:
                logger.error(f"[Web Scrape Fallback 1] Error decoding response text for {url}: {e}")
                return None
    except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Fallback 1] HTTP error {e.response.status_code} fetching {url}: {e}")
    except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 1] Timeout error fetching {url}")
    except httpx.TooManyRedirects: logger.error(f"[Web Scrape Fallback 1] Too many redirects fetching {url}")
    except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 1] Request error fetching {url}: {e}")
    except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error fetching {url}: {e}", exc_info=True)
    return None
async def get_website_content_direct_bs4(url: str) -> Optional[str]:
    """Fallback 1: Fetches HTML directly and parses with BeautifulSoup."""
    if not url: logger.error("[Web Scrape Fallback 1] No URL provided"); return None
    logger.info(f"[Web Scrape Fallback 1] Attempting direct fetch and parse for: {url}")
    html_content = await fetch_url_content_for_scrape(url)
    if not html_content:
        logger.warning(f"[Web Scrape Fallback 1] Direct fetch failed for {url}.")
        return None
    try:
        # --- Parsing logic (run in thread to avoid blocking) ---
        def parse_html(content: str) -> Optional[str]:
            try:
                soup = BeautifulSoup(content, DEFAULT_PARSER)
                for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]):
                    element.extract()
                main_content = soup.find('main') or \
                               soup.find('article') or \
                               soup.find(role='main') or \
                               soup.find(id=re.compile(r'content|main|body', re.I)) or \
                               soup.find(class_=re.compile(r'content|main|body|article|post', re.I))
                target_element = main_content if main_content else soup.body
                if not target_element:
                    logger.warning(f"[Web Scrape Fallback 1 Parse] Could not find body or main content container for {url}")
                    # Try getting text from root if no body/main
                    text_from_root = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                    if text_from_root and len(text_from_root) > 50:
                        logger.warning(f"[Web Scrape Fallback 1 Parse] Using text from root as fallback for {url}.")
                        return text_from_root
                    return None
                lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
                text = " ".join(lines)
                if not text or len(text) < 50: # Adjust threshold as needed
                    logger.warning(f"[Web Scrape Fallback 1 Parse] Extracted text seems too short or empty after cleaning for {url}. Length: {len(text)}")
                    return None
                return text
            except Exception as parse_e:
                logger.error(f"[Web Scrape Fallback 1 Parse] Error during BeautifulSoup parsing for {url}: {parse_e}", exc_info=False) # Keep log cleaner
                return None
        # --- End parsing logic ---
        text_content = await asyncio.to_thread(parse_html, html_content)
        if text_content:
            logger.info(f"[Web Scrape Fallback 1] Success via direct fetch & parse for {url} (final len: {len(text_content)})")
            return text_content
        else:
            logger.warning(f"[Web Scrape Fallback 1] Parsing failed or yielded no content for {url}.")
            return None
    except Exception as e:
        logger.error(f"[Web Scrape Fallback 1] Unexpected error during parsing process for {url}: {e}", exc_info=True)
        return None
# --- Fallback 2: urltotext.com API ---
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
    """Fallback 2: Fetches website content using urltotext.com API."""
    if not url: logger.error("[Web Scrape Fallback 2] No URL"); return None
    if not api_key: logger.error("[Web Scrape Fallback 2] urltotext.com API key missing."); return None
    logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using urltotext.com API")
    # ... (rest of the function remains the same, just update log identifiers)
    api_endpoint = "https://urltotext.com/api/v1/urltotext/"
    payload = { "url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False }
    headers = { "Authorization": f"Token {api_key}", "Content-Type": "application/json" }
    try:
        async with httpx.AsyncClient(timeout=45.0) as client:
            logger.debug(f"[Web Scrape Fallback 2] Sending request to urltotext.com API for {url}")
            response = await client.post(api_endpoint, headers=headers, json=payload)
            logger.debug(f"[Web Scrape Fallback 2] Received status {response.status_code} from urltotext.com API for {url}")
            if response.status_code == 200:
                try:
                    data = response.json()
                    content = data.get("data", {}).get("content"); credits = data.get("credits_used", "N/A"); warning = data.get("data", {}).get("warning")
                    if warning: logger.warning(f"[Web Scrape Fallback 2] urltotext.com API Warning for {url}: {warning}")
                    if content and isinstance(content, str): logger.info(f"[Web Scrape Fallback 2] Success via urltotext.com API for {url}. Len: {len(content)}. Credits: {credits}"); return content.strip()
                    else: logger.warning(f"[Web Scrape Fallback 2] urltotext.com API success but content empty for {url}. Resp: {data}"); return None
                except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 2] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None
                except Exception as e: logger.error(f"[Web Scrape Fallback 2] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None
            elif response.status_code in [400, 401, 402, 403, 422, 500]: logger.error(f"[Web Scrape Fallback 2] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
            else: logger.error(f"[Web Scrape Fallback 2] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
    except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 2] Timeout connecting to urltotext.com API for {url}"); return None
    except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 2] Request error connecting to urltotext.com API for {url}: {e}"); return None
    except Exception as e: logger.error(f"[Web Scrape Fallback 2] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
# --- Fallback 3: Scraper's Proxy Parser via RapidAPI ---
async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
    """Fallback 3: Fetches website content using Scraper's Proxy Parser via RapidAPI."""
    if not url: logger.error("[Web Scrape Fallback 3] No URL provided"); return None
    if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
    logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using Scraper's Proxy Parser API")
    # ... (rest of the function remains the same, just update log identifiers)
    api_host = "scrapers-proxy2.p.rapidapi.com"
    encoded_url = urllib.parse.quote(url, safe='') # URL Encode the target URL
    api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true"
    headers = { "x-rapidapi-host": api_host, "x-rapidapi-key": api_key, "accept-encoding": "gzip" }
    try:
        async with httpx.AsyncClient(timeout=40.0) as client:
            logger.debug(f"[Web Scrape Fallback 3] Sending GET request to {api_host} for {url}")
            response = await client.get(api_endpoint, headers=headers)
            logger.debug(f"[Web Scrape Fallback 3] Received status {response.status_code} from {api_host} for {url}")
            if response.status_code == 200:
                try:
                    data = response.json()
                    content = data.get("content"); title = data.get("title"); extracted_text = ""
                    if title and isinstance(title, str): extracted_text += title.strip() + ". "
                    if content and isinstance(content, str): extracted_text += content.strip()
                    if extracted_text and len(extracted_text) > 30:
                        logger.info(f"[Web Scrape Fallback 3] Success via Scraper's Proxy Parser API for {url}. Len: {len(extracted_text)}")
                        return extracted_text
                    else:
                        logger.warning(f"[Web Scrape Fallback 3] Scraper's Proxy API success but content/title seems empty or too short for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}")
                        return None
                except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 3] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
                except Exception as e: logger.error(f"[Web Scrape Fallback 3] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True); return None
            elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 3] Auth error (401) with {api_host}. Check RapidAPI key."); return None
            elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check API subscription/limits."); return None
            elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 3] Rate Limit (429) from {api_host}."); return None
            elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 3] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
            else: logger.error(f"[Web Scrape Fallback 3] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
    except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 3] Timeout connecting to {api_host} API for {url}"); return None
    except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 3] Request error connecting to {api_host} API for {url}: {e}"); return None
    except Exception as e: logger.error(f"[Web Scrape Fallback 3] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
# --- Fallback 4: AI Web Scraper via RapidAPI ---
async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]:
    """Fallback 4: Fetches website content using AI Web Scraper via RapidAPI."""
    if not url: logger.error("[Web Scrape Fallback 4] No URL provided"); return None
    if not api_key: logger.error("[Web Scrape Fallback 4] RapidAPI key missing."); return None
    logger.info(f"[Web Scrape Fallback 4] Attempting fetch for: {url} using AI Web Scraper API")
    # ... (rest of the function remains the same, just update log identifiers)
    api_host = "ai-web-scraper.p.rapidapi.com"; api_endpoint = f"https://{api_host}/extract_content/v1"
    headers = { 'Content-Type': 'application/x-www-form-urlencoded', 'x-rapidapi-host': api_host, 'x-rapidapi-key': api_key }
    payload = {'url': url}
    try:
        async with httpx.AsyncClient(timeout=45.0) as client:
            logger.debug(f"[Web Scrape Fallback 4] Sending POST request to {api_host} for {url}")
            response = await client.post(api_endpoint, headers=headers, data=payload)
            logger.debug(f"[Web Scrape Fallback 4] Received status {response.status_code} from {api_host} for {url}")
            if response.status_code == 200:
                try:
                    data = response.json(); content = None
                    if isinstance(data, dict): content = data.get("content") or data.get("text") or data.get("extracted_text") or data.get("result")
                    elif isinstance(data, str): content = data
                    if content and isinstance(content, str) and len(content) > 30: logger.info(f"[Web Scrape Fallback 4] Success via AI Web Scraper API for {url}. Len: {len(content)}"); return content.strip()
                    else:
                        keys_info = f"Keys: {list(data.keys())}" if isinstance(data, dict) else f"Type: {type(data)}"; content_len = len(content) if content and isinstance(content, str) else 0
                        logger.warning(f"[Web Scrape Fallback 4] AI Web Scraper API success but content empty/short/invalid format for {url}. {keys_info}. Length: {content_len}")
                        return None
                except json.JSONDecodeError:
                    raw_text = response.text
                    if raw_text and len(raw_text) > 30: logger.warning(f"[Web Scrape Fallback 4] Failed JSON decode for AI Web Scraper, but found raw text content. Status:{response.status_code}. Using raw text. Len: {len(raw_text)}"); return raw_text.strip()
                    else: logger.error(f"[Web Scrape Fallback 4] Failed JSON decode AI Web Scraper API for {url}. Status:{response.status_code}. Resp:{raw_text[:500]}"); return None
                except Exception as e: logger.error(f"[Web Scrape Fallback 4] Error processing AI Web Scraper API success response for {url}: {e}", exc_info=True); return None
            elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 4] Auth error (401) with {api_host}. Check RapidAPI key."); return None
            elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 4] Forbidden (403) from {api_host}. Check API subscription/limits."); return None
            elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 4] Rate Limit (429) from {api_host}."); return None
            elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 4] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
            else: logger.error(f"[Web Scrape Fallback 4] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
    except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 4] Timeout connecting to {api_host} API for {url}"); return None
    except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 4] Request error connecting to {api_host} API for {url}: {e}"); return None
    except Exception as e: logger.error(f"[Web Scrape Fallback 4] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
# --- Fallback 5 & 6: Apify Website Scraping --- | |
async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str, fallback_num: int) -> Optional[str]: | |
"""Generic function to run an Apify actor and get text content.""" | |
if not url: logger.error(f"[{actor_name} - FB{fallback_num}] No URL provided"); return None | |
if not api_token: logger.error(f"[{actor_name} - FB{fallback_num}] API token missing."); return None | |
logger.info(f"[{actor_name} - FB{fallback_num}] Attempting fetch for URL: {url} (Actor: {actor_id})") | |
sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"; params = {"token": api_token} | |
run_input = { "startUrls": [{"url": url}], "maxCrawlPages": 1, "crawlerType": "playwright:firefox" } | |
    if actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
        run_input = { "urls": [url] }; logger.debug(f"[{actor_name} - FB{fallback_num}] Using simplified input: {run_input}")
    headers = {"Content-Type": "application/json"}
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            logger.debug(f"[{actor_name} - FB{fallback_num}] POST Request to {sync_items_endpoint} for {url}")
            response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
            logger.debug(f"[{actor_name} - FB{fallback_num}] Received status code {response.status_code} for {url}")
            if response.status_code == 200:
                try:
                    results = response.json()
                    if isinstance(results, list) and len(results) > 0:
                        item = results[0]; content = None
                        if "text" in item and isinstance(item["text"], str): content = item["text"]
                        elif "content" in item and isinstance(item["content"], str): content = item["content"]
                        elif "markdown" in item and isinstance(item["markdown"], str): content = item["markdown"]
                        elif "html" in item and isinstance(item["html"], str):
                            logger.warning(f"[{actor_name} - FB{fallback_num}] No 'text' or 'markdown' found, attempting to parse 'html'.")
                            soup = BeautifulSoup(item["html"], DEFAULT_PARSER)
                            content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                        if content and isinstance(content, str) and len(content) > 30:
                            logger.info(f"[{actor_name} - FB{fallback_num}] Success via REST for {url}. Length: {len(content)}")
                            return content.strip()
                        else:
                            content_len = len(content) if content and isinstance(content, str) else 0
                            logger.warning(f"[{actor_name} - FB{fallback_num}] Dataset item parsed but text content empty/short/invalid format for {url}. Item keys: {list(item.keys())}. Length: {content_len}")
                            return None
                    else: logger.warning(f"[{actor_name} - FB{fallback_num}] Actor success but dataset was empty for {url}. Response: {results}"); return None
                except json.JSONDecodeError: logger.error(f"[{actor_name} - FB{fallback_num}] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}"); return None
                except Exception as e: logger.error(f"[{actor_name} - FB{fallback_num}] Error processing success response for {url}: {e}", exc_info=True); return None
            elif response.status_code == 400: logger.error(f"[{actor_name} - FB{fallback_num}] Bad Request (400) for {url}. Check run_input. Resp:{response.text[:200]}"); return None
            elif response.status_code == 401: logger.error(f"[{actor_name} - FB{fallback_num}] Auth error (401). Check token."); return None
            elif response.status_code == 404: logger.error(f"[{actor_name} - FB{fallback_num}] Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
            else: logger.error(f"[{actor_name} - FB{fallback_num}] Unexpected status {response.status_code} for {url}. Resp:{response.text[:200]}"); return None
    except httpx.TimeoutException as e: logger.error(f"[{actor_name} - FB{fallback_num}] Timeout during API interaction for {url}: {e}"); return None
    except httpx.HTTPStatusError as e: logger.error(f"[{actor_name} - FB{fallback_num}] HTTP Status Error during API interaction for {url}: {e}"); return None
    except httpx.RequestError as e: logger.error(f"[{actor_name} - FB{fallback_num}] Request error during API interaction for {url}: {e}"); return None
    except Exception as e: logger.error(f"[{actor_name} - FB{fallback_num}] Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
    """Fallback 5: Fetches website content using Apify Website Content Crawler."""
    return await _run_apify_actor_for_web_content(
        url=url, api_token=api_token, actor_id=APIFY_CRAWLER_ACTOR_ID,
        actor_name="Apify Crawler", fallback_num=5
    )
async def get_website_content_via_apify_text_scraper(url: str, api_token: str) -> Optional[str]:
    """Fallback 6: Fetches website content using Apify Text Scraper Free."""
    return await _run_apify_actor_for_web_content(
        url=url, api_token=api_token, actor_id=APIFY_TEXT_SCRAPER_ACTOR_ID,
        actor_name="Apify Text Scraper", fallback_num=6
    )
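# Usage sketch (not executed here): both wrappers share the same signature, e.g.
#   text = await get_website_content_via_apify_crawler("https://example.com", APIFY_API_TOKEN)
# and return the scraped text or None, so they slot directly into the web-scrape fallback chain.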
# --- Summarization Functions (Unchanged) ---
# ... (_call_gemini, _call_openrouter, generate_summary remain the same) ...
async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
    """Internal function to call Gemini API. Returns (summary, error_message)."""
    global GEMINI_MODEL, _gemini_primary_enabled
    if not _gemini_primary_enabled:
        logger.error("[Gemini Primary] Called but is disabled.");
        return None, "Error: Primary AI service (Gemini) not configured/available."
    logger.info(f"[Gemini Primary] Generating {summary_type} summary using {GEMINI_MODEL}. Input length: {len(text)}")
    # Define prompts (same as before)
    if summary_type == "paragraph":
        prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n"
                  "• Clear and simple language suitable for someone unfamiliar with the topic.\n"
                  "• Uses British English spellings throughout.\n"
                  "• Straightforward and understandable vocabulary; avoid complex terms.\n"
                  "• Presented as ONE SINGLE PARAGRAPH.\n"
                  "• No more than 85 words maximum; but does not have to be exactly 85.\n"
                  "• Considers the entire text content equally.\n"
                  "• Uses semicolons (;) instead of em dashes (– or —).\n"
                  "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" # Added instruction
                  "Here is the text to summarise:")
    else: # points summary
        prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this format:\n\n"
                  "• For each distinct topic or section identified in the text, create a heading.\n"
                  "• Each heading MUST be plain text without any formatting (e.g., Section Title).\n"
                  "• Immediately following each heading, list the key points as a bulleted list.\n"
                  "• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n"
                  "• The text within each bullet point should NOT contain any bold formatting.\n"
                  "• IMPORTANT: Never use bold formatting (double asterisks) within the text of the bullet points themselves.\n"
                  "• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n"
                  "• Use British English spellings throughout.\n"
                  "• Avoid overly complex or advanced vocabulary.\n"
                  "• Keep bullet points concise.\n"
                  "• Ensure the entire summary takes no more than two minutes to read.\n"
                  "• Consider the entire text's content, not just the beginning or a few topics.\n"
                  "• Use semicolons (;) instead of em dashes (– or —).\n"
                  "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" # Added instruction
                  "Here is the text to summarise:")
    MAX_INPUT_LENGTH_GEMINI = 900000 # Check model docs for actual limit
    if len(text) > MAX_INPUT_LENGTH_GEMINI:
        logger.warning(f"[Gemini Primary] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating.");
        text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)"
    full_prompt = f"{prompt}\n\n{text}"
    safety_settings = { HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, }
    if hasattr(HarmCategory, 'HARM_CATEGORY_CIVIC_INTEGRITY'): safety_settings[HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY] = HarmBlockThreshold.BLOCK_NONE
    logger.debug(f"[Gemini Primary] Using safety settings: { {k.name: v.name for k, v in safety_settings.items()} }")
    try:
        logger.debug(f"[Gemini Primary] Initializing model {GEMINI_MODEL}")
        model = genai.GenerativeModel(GEMINI_MODEL)
        logger.info(f"[Gemini Primary] Sending request to Gemini ({GEMINI_MODEL})...")
        request_options = {"timeout": 120}
        response = await model.generate_content_async( full_prompt, generation_config=genai.types.GenerationConfig(), safety_settings=safety_settings, request_options=request_options )
        logger.info("[Gemini Primary] Received response from Gemini.")
        if response.prompt_feedback and response.prompt_feedback.block_reason:
            block_reason_str = getattr(response.prompt_feedback.block_reason, 'name', str(response.prompt_feedback.block_reason))
            logger.warning(f"[Gemini Primary] Request blocked by API. Reason: {block_reason_str}");
            return None, f"Sorry, the primary AI model ({GEMINI_MODEL}) blocked the request (Reason: {block_reason_str})."
        summary = None; finish_reason_str = 'UNKNOWN'
        if response.candidates:
            candidate = response.candidates[0]
            finish_reason_name = getattr(candidate.finish_reason, 'name', None); finish_reason_str = finish_reason_name or 'N/A'
            if finish_reason_name == 'SAFETY':
                safety_ratings_str = ", ".join([f"{rating.category.name}: {rating.probability.name}" for rating in candidate.safety_ratings])
                logger.warning(f"[Gemini Primary] Candidate blocked due to SAFETY. Finish Reason: {finish_reason_str}. Ratings: [{safety_ratings_str}]")
                return None, f"Sorry, the primary AI model ({GEMINI_MODEL}) blocked the response due to safety filters ({finish_reason_str})."
            elif finish_reason_name not in ['STOP', 'MAX_TOKENS', None]: logger.warning(f"[Gemini Primary] Candidate finished with non-standard reason: {finish_reason_str}")
            if candidate.content and candidate.content.parts: summary = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
        if summary is None:
            try: summary = response.text
            except ValueError as e: logger.warning(f"[Gemini Primary] Error accessing response.text (likely blocked): {e}"); summary = None
        if summary:
            logger.info(f"[Gemini Primary] Success generating summary. Finish Reason: {finish_reason_str}. Output len: {len(summary)}");
            return summary.strip(), None
        else:
            logger.warning(f"[Gemini Primary] Gemini returned empty summary or content was blocked. Final Finish Reason: {finish_reason_str}");
            return None, f"Sorry, the primary AI model ({GEMINI_MODEL}) did not provide a summary (Finish Reason: {finish_reason_str})."
    except AttributeError as ae: logger.error(f"[Gemini Primary] AttributeError during Gemini response processing: {ae}. SDK might be incompatible.", exc_info=True); return None, f"Sorry, error processing response from the primary AI ({GEMINI_MODEL})."
    except Exception as e: logger.error(f"[Gemini Primary] Unexpected error during Gemini API call: {e}", exc_info=True); return None, f"Sorry, unexpected error using primary AI ({GEMINI_MODEL})."
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
    """Internal function to call OpenRouter API (Fallback). Returns (summary, error_message)."""
    global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
    if not _openrouter_fallback_enabled:
        logger.error("[OpenRouter Fallback] Called but is disabled.");
        return None, "Error: Fallback AI service (OpenRouter) not configured/available."
    logger.info(f"[OpenRouter Fallback] Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
    # Define prompts (same as Gemini)
    if summary_type == "paragraph":
        prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n"
                  "• Clear and simple language suitable for someone unfamiliar with the topic.\n"
                  "• Uses British English spellings throughout.\n"
                  "• Straightforward and understandable vocabulary; avoid complex terms.\n"
                  "• Presented as ONE SINGLE PARAGRAPH.\n"
                  "• No more than 85 words maximum; but does not have to be exactly 85.\n"
                  "• Considers the entire text content equally.\n"
                  "• Uses semicolons (;) instead of em dashes (– or —).\n"
                  "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
                  "Here is the text to summarise:")
    else: # points summary
        prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this format:\n\n"
                  "• For each distinct topic or section identified in the text, create a heading.\n"
                  "• Each heading MUST be plain text without any formatting (e.g., Section Title).\n"
                  "• Immediately following each heading, list the key points as a bulleted list.\n"
                  "• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n"
                  "• The text within each bullet point should NOT contain any bold formatting.\n"
                  "• IMPORTANT: Never use bold formatting (double asterisks) within the text of the bullet points themselves.\n"
                  "• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n"
                  "• Use British English spellings throughout.\n"
                  "• Avoid overly complex or advanced vocabulary.\n"
                  "• Keep bullet points concise.\n"
                  "• Ensure the entire summary takes no more than two minutes to read.\n"
                  "• Consider the entire text's content, not just the beginning or a few topics.\n"
                  "• Use semicolons (;) instead of em dashes (– or —).\n"
                  "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
                  "Here is the text to summarise:")
    MAX_INPUT_LENGTH_OR = 100000 # Conservative limit
    if len(text) > MAX_INPUT_LENGTH_OR:
        logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}). Truncating.");
        text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
    full_prompt = f"{prompt}\n\n{text}"
    headers = { "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json", "HTTP-Referer": "https://github.com/your-repo", "X-Title": "TelegramSummariserBot" }
    payload = { "model": OPENROUTER_MODEL, "messages": [{"role": "user", "content": full_prompt}] }
    openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
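    # Expected response shape (OpenAI-style chat completion, as parsed below):
    #   {"choices": [{"message": {"content": "..."}, "finish_reason": "stop"}], ...}
    # Errors may instead arrive as {"error": {"message": "..."}} alongside a non-200 status.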
    api_timeouts = httpx.Timeout(connect=10.0, read=45.0, write=10.0, pool=60.0); response = None
    try:
        async with httpx.AsyncClient(timeout=api_timeouts) as client:
            logger.info(f"[OpenRouter Fallback] Sending request to OpenRouter ({OPENROUTER_MODEL})...")
            response = await client.post(openrouter_api_endpoint, headers=headers, json=payload)
            logger.info(f"[OpenRouter Fallback] Received response. Status: {response.status_code}")
            if response.status_code == 200:
                try:
                    data = response.json()
                    if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0:
                        choice = data["choices"][0]; message = choice.get("message"); finish_reason = choice.get("finish_reason", "N/A")
                        if message and isinstance(message, dict):
                            summary = message.get("content")
                            if summary: logger.info(f"[OpenRouter Fallback] Success. Finish: {finish_reason}. Output len: {len(summary)}"); return summary.strip(), None
                            else: logger.warning(f"[OpenRouter Fallback] Success but content empty. Finish: {finish_reason}. Resp: {data}"); return None, f"Fallback AI ({OPENROUTER_MODEL}) returned empty summary (Finish: {finish_reason})."
                        else: logger.error(f"[OpenRouter Fallback] Unexpected message structure: {message}. Finish: {finish_reason}. Full: {data}"); return None, "Could not parse fallback AI response (message format)."
                    else:
                        error_details = data.get("error", {}); logger.error(f"[OpenRouter Fallback] Unexpected choices structure or error: {data.get('choices')}. Error: {error_details}. Full: {data}")
                        return None, f"Could not parse fallback AI response (choices/error: {error_details.get('message', 'Unknown')})."
                except json.JSONDecodeError: logger.error(f"[OpenRouter Fallback] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:500]}"); return None, "Failed to understand fallback AI response."
                except Exception as e: logger.error(f"[OpenRouter Fallback] Error processing success response: {e}", exc_info=True); return None, "Error processing fallback AI response."
            elif response.status_code == 401: logger.error("[OpenRouter Fallback] API key invalid (401)."); return None, "Fallback AI config key invalid."
            elif response.status_code == 402: logger.error("[OpenRouter Fallback] Payment Required/Quota Exceeded (402)."); return None, f"Fallback AI ({OPENROUTER_MODEL}) quota/limit issue."
            elif response.status_code == 429: logger.warning("[OpenRouter Fallback] Rate Limit Exceeded (429)."); return None, f"Fallback AI ({OPENROUTER_MODEL}) is busy. Try again."
            elif response.status_code == 500: logger.error(f"[OpenRouter Fallback] Internal Server Error (500). Resp:{response.text[:500]}"); return None, f"Fallback AI ({OPENROUTER_MODEL}) had internal error."
            else:
                error_info = "";
                try: error_info = response.json().get("error", {}).get("message", "")
                except Exception: pass
                logger.error(f"[OpenRouter Fallback] Unexpected status {response.status_code}. Error: '{error_info}' Resp:{response.text[:500]}");
                return None, f"Fallback AI ({OPENROUTER_MODEL}) returned unexpected status ({response.status_code})."
    except httpx.TimeoutException as e: logger.error(f"[OpenRouter Fallback] Timeout error ({type(e)}) from OpenRouter API: {e}"); return None, f"Fallback AI ({OPENROUTER_MODEL}) timed out."
    except httpx.RequestError as e: logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}"); return None, "Error connecting to fallback AI service."
    except Exception as e: logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter call: {e}", exc_info=True); return None, "Unexpected error using fallback AI service."
async def generate_summary(text: str, summary_type: str) -> str:
    """Generates summary using Gemini (Primary) and falls back to OpenRouter if needed."""
    global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
    logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})")
    final_summary: Optional[str] = None; primary_error_message: Optional[str] = None
    if _gemini_primary_enabled:
        logger.info(f"[Summary Generation] Attempting primary AI: Gemini ({GEMINI_MODEL})")
        final_summary, primary_error_message = await _call_gemini(text, summary_type)
        if final_summary: logger.info("[Summary Generation] Success with primary AI (Gemini)."); return final_summary
        else: logger.warning(f"[Summary Generation] Primary AI (Gemini) failed. Error: {primary_error_message}. Proceeding to fallback.")
    else: logger.warning("[Summary Generation] Primary AI (Gemini) disabled. Proceeding to fallback."); primary_error_message = "Primary AI (Gemini) unavailable."
if _openrouter_fallback_enabled: | |
logger.info(f"[Summary Generation] Attempting fallback AI: OpenRouter ({OPENROUTER_MODEL})") | |
fallback_summary, fallback_error_message = await _call_openrouter(text, summary_type) | |
if fallback_summary: logger.info("[Summary Generation] Success with fallback AI (OpenRouter)."); return fallback_summary | |
else: | |
logger.error(f"[Summary Generation] Fallback AI (OpenRouter) also failed. Error: {fallback_error_message}") | |
if primary_error_message and "unavailable" not in primary_error_message.lower(): return f"{primary_error_message} Fallback AI ({OPENROUTER_MODEL}) also failed: {fallback_error_message}" | |
else: return f"Sorry, summarization failed. Primary AI: {primary_error_message or 'N/A'}. Fallback AI ({OPENROUTER_MODEL}): {fallback_error_message}" | |
else: | |
logger.error("[Summary Generation] Fallback AI (OpenRouter) disabled. Cannot proceed.") | |
if primary_error_message: return f"{primary_error_message} Fallback AI is also unavailable." | |
else: return "Error: Both primary and fallback AI services are unavailable." | |
logger.error("[Summary Generation] Reached end of function unexpectedly.") | |
return "Sorry, unknown error during summary generation." | |
# --- Main Processing Logic (MODIFIED with Crawl4AI and re-ordered fallbacks) --- | |
async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None: | |
"""Handles the entire process: fetching content (Crawl4AI -> Fallbacks) and summarizing.""" | |
task_id = f"{user_id}-{message_id_to_edit or 'new'}"; logger.info(f"[Task {task_id}] Starting processing for URL: {url}") | |
background_request: Optional[BaseRequest] = None; bot: Optional[Bot] = None | |
try: background_request = HTTPXRequest( connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0 ); bot = Bot(token=bot_token, request=background_request) | |
except Exception as e: logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True); return | |
content: Optional[str] = None | |
user_feedback_message: Optional[str] = None | |
success: bool = False | |
status_message_id: Optional[int] = message_id_to_edit | |
message_to_delete_later_id : Optional[int] = None | |
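    # Two status-message ids are tracked: status_message_id (the original button message, edited in
    # place when possible) and message_to_delete_later_id (a fresh status message sent if editing
    # fails). Whichever one exists is deleted in the finally block at the end of this task.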
try: | |
# --- 1. Initial User Feedback --- | |
processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (using primary method... might take a moment)..." # Updated text | |
if status_message_id: | |
try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'") | |
except Exception as e: logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new."); status_message_id = None | |
if not status_message_id: | |
try: | |
status_message = await retry_bot_operation( bot.send_message, chat_id=chat_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN ) | |
if status_message: message_to_delete_later_id = status_message.message_id; logger.debug(f"[Task {task_id}] Sent new status message {message_to_delete_later_id}") | |
else: raise RuntimeError("Failed to send status message after retries.") | |
except Exception as e: logger.error(f"[Task {task_id}] CRITICAL: Failed to send new status message: {e}"); raise | |
try: | |
# --- 2. Content Fetching (Chain of methods) --- | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}") | |
if is_youtube: | |
# --- YouTube Transcript Logic (Unchanged) --- | |
video_id = extract_youtube_id(url) | |
if video_id: content = await get_youtube_transcript(video_id, url) # Tries lib -> Supadata -> Apify YT Actor | |
else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format." | |
if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)." | |
else: | |
# --- Website Scraping Logic (NEW Order: Crawl4AI -> Direct+BS4 -> APIs -> Apify) --- | |
global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN | |
global _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists | |
global _crawl4ai_primary_scrape_enabled # Ensure global access | |
# Method 0: Primary Scrape (Crawl4AI - NEW) | |
logger.info(f"[Task {task_id}] Trying Web Scrape Method 0 (Crawl4AI)...") | |
if _crawl4ai_primary_scrape_enabled: | |
content = await get_website_content_via_crawl4ai(url) | |
if not content: logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) failed.") | |
else: | |
logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) skipped - library unavailable.") | |
# Method 1: Fallback 1 (Direct Fetch + BS4 - WAS Primary) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 0 failed. Trying Method 1 (Direct Fetch + BS4)...") | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_direct_bs4(url) | |
if not content: logger.warning(f"[Task {task_id}] Method 1 (Direct Fetch + BS4) failed.") | |
# Method 2: Fallback 2 (urltotext.com - WAS Fallback 1) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 2 (urltotext.com)...") | |
if _urltotext_key_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_api(url, URLTOTEXT_API_KEY) | |
if not content: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) failed.") | |
else: logger.warning("[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.") | |
# Method 3: Fallback 3 (Scraper's Proxy via RapidAPI - WAS Fallback 2) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 2 failed. Trying Method 3 (Scraper's Proxy)...") | |
if _rapidapi_key_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_scrapers_proxy(url, RAPIDAPI_KEY) | |
if not content: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) failed.") | |
else: logger.warning("[Task {task_id}] Method 3 (Scraper's Proxy) RapidAPI key unavailable. Skipping.") | |
# Method 4: Fallback 4 (AI Web Scraper via RapidAPI - WAS Fallback 3) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 3 failed. Trying Method 4 (AI Web Scraper)...") | |
if _rapidapi_key_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_ai_web_scraper(url, RAPIDAPI_KEY) | |
if not content: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) failed.") | |
else: logger.warning("[Task {task_id}] Method 4 (AI Web Scraper) RapidAPI key unavailable. Skipping.") | |
# Method 5: Fallback 5 (Apify Website Content Crawler - WAS Fallback 4) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 4 failed. Trying Method 5 (Apify Crawler)...") | |
if _apify_token_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_apify_crawler(url, APIFY_API_TOKEN) | |
if not content: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) failed.") | |
else: logger.warning("[Task {task_id}] Method 5 (Apify Crawler) APIFY_API_TOKEN unavailable. Skipping.") | |
# Method 6: Fallback 6 (Apify Text Scraper Free - WAS Fallback 5) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 5 failed. Trying Method 6 (Apify Text Scraper)...") | |
if _apify_token_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_apify_text_scraper(url, APIFY_API_TOKEN) | |
if not content: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) failed.") | |
else: logger.warning("[Task {task_id}] Method 6 (Apify Text Scraper) APIFY_API_TOKEN unavailable. Skipping.") | |
# Final check for website content after all methods | |
if not content and not user_feedback_message: | |
user_feedback_message = "Sorry, I couldn't fetch readable content from that website using multiple methods (blocked/dynamic content/empty?). Even the advanced crawler failed." # Updated message | |
# --- 3. Summarization --- | |
if content: | |
logger.info(f"[Task {task_id}] Content fetched successfully (len:{len(content)}). Generating summary.") | |
# Update status message before summarization | |
try: | |
status_update_msg_id = message_to_delete_later_id or status_message_id | |
if status_update_msg_id: | |
await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_update_msg_id, text=f"Content fetched! Now generating '{summary_type}' summary...", parse_mode=ParseMode.MARKDOWN, reply_markup=None ) | |
except Exception as edit_e: logger.warning(f"[Task {task_id}] Failed to edit status message before summary: {edit_e}") | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
final_summary = await generate_summary(content, summary_type) # Calls Gemini -> OpenRouter | |
if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"): | |
user_feedback_message = final_summary # Use the error message from summarizer | |
logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}") | |
else: | |
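                    # Telegram caps a single message at 4096 characters, so the summary is split into
                    # chunks and sent sequentially with a short pause between parts.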
max_length = 4096 | |
summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)] | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} ) | |
for part in summary_parts[1:]: | |
await asyncio.sleep(0.5) # Small delay between parts | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} ) | |
success = True | |
logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).") | |
user_feedback_message = None # Clear any previous error message | |
# --- 4. Handle Final Failure Feedback --- | |
if user_feedback_message: # If any step failed and set a message | |
logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}") | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} ) | |
except Exception as e: | |
logger.error(f"[Task {task_id}] Unexpected error during processing: {e}", exc_info=True) | |
user_feedback_message = "Oops! Something went really wrong during processing. Please try again later." | |
try: await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message ) | |
except Exception: logger.error(f"[Task {task_id}] Failed to send unexpected error feedback.") | |
except Exception as outer_e: | |
logger.critical(f"[Task {task_id}] Critical outer error: {outer_e}", exc_info=True) | |
try: | |
if bot: await retry_bot_operation( bot.send_message, chat_id=chat_id, text="❌ Critical internal error occurred. Could not start processing." ) | |
except Exception: logger.exception(f"[Task {task_id}] Failed even to send critical error message.") | |
finally: | |
# --- 5. Cleanup --- | |
delete_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id | |
if delete_target_id and bot: | |
try: await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=delete_target_id); logger.debug(f"[Task {task_id}] Deleted status/button message {delete_target_id}") | |
except Exception as del_e: logger.warning(f"[Task {task_id}] Failed to delete status/button message {delete_target_id}: {del_e}") | |
if background_request and hasattr(background_request, '_client') and background_request._client: | |
try: await background_request._client.aclose(); logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.") | |
except Exception as e: logger.warning(f"[Task {task_id}] Error closing background bot's client: {e}") | |
logger.info(f"[Task {task_id}] Task completed. Success: {success}") | |
# --- Telegram Handlers (Unchanged) --- | |
# ... (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler remain the same) ... | |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
    user = update.effective_user
    if not user or not update.message: return
    mention = user.mention_html()
logger.info(f"User {user.id} ({user.username or 'no_username'}) used /start.") | |
await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" ) | |
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
user = update.effective_user | |
if not user or not update.message: return | |
logger.info(f"User {user.id} ({user.username or 'no_username'}) used /help.") | |
help_text = ( "🔍 **How to use this bot:**\n\n" | |
"1. Send me any YouTube video link or website URL.\n" | |
"2. I'll ask how you want it summarised (paragraph or points).\n" | |
"3. Click the button for your choice.\n" | |
"4. Wait while I fetch the content and generate the summary!\n\n" | |
"⚙️ I try multiple methods to get content, especially for tricky websites or YouTube videos without standard transcripts.\n\n" | |
"**Commands:**\n" | |
"`/start` - Display the welcome message\n" | |
"`/help` - Show this help message" ) | |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN) | |
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
if not update.message or not update.message.text: return | |
url = update.message.text.strip(); user = update.effective_user | |
if not user: return | |
url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE) | |
    match = url_pattern.search(url)
    if not match:
        logger.debug(f"Ignoring non-URL message from {user.id}: {url[:100]}")
        return
if match: | |
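        # Only the first URL found in the message is summarised; any additional links are ignored.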
extracted_url = match.group(0) | |
logger.info(f"User {user.id} ({user.username or 'no_username'}) sent potential URL: {extracted_url}") | |
context.user_data['url_to_summarize'] = extracted_url | |
context.user_data['original_message_id'] = update.message.message_id | |
keyboard = [[ InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"), InlineKeyboardButton("Points Summary", callback_data="points") ]] | |
reply_markup = InlineKeyboardMarkup(keyboard) | |
try: | |
await update.message.reply_text( | |
f"Okay, I see this link:\n`{extracted_url}`\n\nHow would you like it summarised?", | |
reply_markup=reply_markup, | |
disable_web_page_preview=True, | |
parse_mode=ParseMode.MARKDOWN | |
) | |
except BadRequest as e: | |
if "chat not found" in str(e).lower() or "bot was blocked by the user" in str(e).lower(): logger.warning(f"Could not reply to user {user.id} (chat not found or blocked).") | |
else: logger.error(f"BadRequest replying to URL message from {user.id}: {e}") | |
except Exception as e: logger.error(f"Error replying to URL message from {user.id}: {e}", exc_info=True) | |
else: logger.debug(f"Ignoring message from {user.id} that passed initial check but no URL found: {url[:100]}") | |
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
query = update.callback_query | |
if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return | |
user = query.from_user; summary_type = query.data; query_id = query.id | |
try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id}") | |
except BadRequest as e: | |
if "query is too old" in str(e).lower(): logger.warning(f"Callback query {query_id} is too old to answer."); return | |
else: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True) | |
except Exception as e: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True) | |
url = context.user_data.get('url_to_summarize'); message_id_to_edit = query.message.message_id | |
logger.info(f"User {user.id} chose '{summary_type}' for msg {message_id_to_edit}. URL in context: {'Yes' if url else 'No'}") | |
if not url: | |
logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Old button?") | |
try: await query.edit_message_text(text="Sorry, I couldn't find the original URL for this request (it might be too old). Please send the link again.") | |
except BadRequest as e: | |
if "message is not modified" in str(e).lower() or "message to edit not found" in str(e).lower(): pass | |
else: logger.warning(f"Failed to edit 'URL not found' message {message_id_to_edit} for user {user.id}: {e}") | |
except Exception as e: logger.warning(f"Error editing 'URL not found' message {message_id_to_edit} for user {user.id}: {e}") | |
return | |
context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}") | |
global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled | |
    if not TELEGRAM_TOKEN:
        logger.critical("TELEGRAM_TOKEN missing in callback!")
        try: await query.edit_message_text(text="❌ Bot config error (Token Missing).")
        except Exception: pass
        return
    if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
        logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid in callback!")
        try: await query.edit_message_text(text="❌ AI config error: No summarization models available.")
        except Exception: pass
        return
elif not _gemini_primary_enabled: logger.warning("Primary AI (Gemini) unavailable, relying on fallback.") | |
elif not _openrouter_fallback_enabled: logger.warning("Fallback AI (OpenRouter) unavailable, relying on primary.") | |
logger.info(f"Scheduling background task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit}") | |
asyncio.create_task( | |
process_summary_task( user_id=user.id, chat_id=query.message.chat_id, message_id_to_edit=message_id_to_edit, url=url, summary_type=summary_type, bot_token=TELEGRAM_TOKEN ), | |
name=f"SummaryTask-{user.id}-{message_id_to_edit}" | |
) | |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None: | |
"""Log Errors caused by Updates.""" | |
logger.error("Exception while handling an update:", exc_info=context.error) | |
# --- Application Setup & Web Framework (MODIFIED Health Check) --- | |
async def setup_bot_config() -> Application: | |
logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN | |
if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.") | |
custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 ) | |
application = Application.builder().token(TELEGRAM_TOKEN).request(custom_request).build() | |
application.add_handler(CommandHandler("start", start)) | |
application.add_handler(CommandHandler("help", help_command)) | |
url_filter = filters.Entity("url") | filters.Entity("text_link") | |
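    # Only plain-text, non-command messages that Telegram has already tagged with a "url" or
    # "text_link" entity reach handle_potential_url.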
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND & url_filter, handle_potential_url)) | |
application.add_handler(CallbackQueryHandler(handle_summary_type_callback)) | |
application.add_error_handler(error_handler) | |
logger.info("Telegram application handlers configured."); return application | |
@contextlib.asynccontextmanager
async def lifespan(app: Starlette):
global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN | |
logger.info("ASGI Lifespan: Startup initiated..."); | |
if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.") | |
bot_setup_successful = False | |
try: | |
ptb_app = await setup_bot_config() | |
await ptb_app.initialize() | |
bot_info = await ptb_app.bot.get_me() | |
logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})") | |
bot_setup_successful = True | |
current_webhook_info = await ptb_app.bot.get_webhook_info() | |
if current_webhook_info and current_webhook_info.url: | |
logger.info(f"Found existing webhook: {current_webhook_info.url}. Attempting to delete...") | |
try: | |
if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Existing webhook deleted.") | |
else: logger.warning("Failed delete existing webhook (API returned False).") | |
except Exception as e: logger.warning(f"Could not delete existing webhook: {e}. Proceeding."); await asyncio.sleep(1) | |
space_host = os.environ.get("SPACE_HOST"); webhook_path = "/webhook"; full_webhook_url = None | |
if space_host: | |
protocol = "https"; host = space_host.split('://')[-1] | |
full_webhook_url = f"{protocol}://{host.rstrip('/')}{webhook_path}" | |
if full_webhook_url: | |
logger.info(f"Setting webhook to: {full_webhook_url}") | |
set_webhook_args = { "url": full_webhook_url, "allowed_updates": Update.ALL_TYPES, "drop_pending_updates": True } | |
if WEBHOOK_SECRET: set_webhook_args["secret_token"] = WEBHOOK_SECRET; logger.info("Webhook secret token configured.") | |
await asyncio.sleep(1.5) | |
try: | |
await ptb_app.bot.set_webhook(**set_webhook_args) | |
webhook_info = await ptb_app.bot.get_webhook_info() | |
if webhook_info.url == full_webhook_url: logger.info(f"Webhook successfully set: URL='{webhook_info.url}', Secret={bool(WEBHOOK_SECRET)}") | |
else: logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got '{webhook_info.url}'.") | |
await ptb_app.start() | |
logger.info("PTB Application started in webhook mode.") | |
except Exception as e: logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True); raise RuntimeError(f"Failed to set webhook: {e}") from e | |
else: logger.critical("Could not construct webhook URL."); raise RuntimeError("Webhook URL could not be determined.") | |
else: logger.critical("SPACE_HOST env var not found."); raise RuntimeError("SPACE_HOST env var missing.") | |
logger.info("ASGI Lifespan: Startup complete."); yield # --- Application runs here --- | |
except Exception as startup_err: | |
logger.critical(f"Application startup failed: {startup_err}", exc_info=True) | |
if ptb_app and bot_setup_successful: | |
if ptb_app.running: await ptb_app.stop() | |
await ptb_app.shutdown() | |
raise | |
finally: | |
logger.info("ASGI Lifespan: Shutdown initiated...") | |
if ptb_app and bot_setup_successful: | |
if ptb_app.running: logger.info("Stopping PTB Application..."); await ptb_app.stop() | |
logger.info("Shutting down PTB Application..."); await ptb_app.shutdown(); logger.info("PTB Application shut down.") | |
try: | |
logger.info("Attempting to delete webhook on shutdown...") | |
if ptb_app.bot and hasattr(ptb_app.bot, 'delete_webhook'): | |
if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Webhook deleted on shutdown.") | |
else: logger.warning("Failed to delete webhook on shutdown (API returned False).") | |
else: logger.warning("Cannot delete webhook: Bot object unavailable.") | |
except Exception as e: logger.warning(f"Could not delete webhook during shutdown: {e}") | |
else: logger.info("PTB app not fully initialized/failed startup. No shutdown needed.") | |
logger.info("ASGI Lifespan: Shutdown complete.") | |
async def health_check(request: Request) -> PlainTextResponse: | |
"""Simple health check endpoint.""" | |
# ADDED _crawl4ai_primary_scrape_enabled | |
global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled | |
global _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY, _crawl4ai_primary_scrape_enabled | |
bot_status = "Not Initialized"; bot_username = "N/A" | |
if ptb_app: | |
try: | |
if ptb_app.running: | |
bot_info = await ptb_app.bot.get_me() | |
bot_username = f"@{bot_info.username}" if bot_info and bot_info.username else "Info Fetch Error" | |
bot_status = "Running" | |
else: | |
bot_status = "Initialized but Not Running" | |
if ptb_app.bot: | |
try: bot_info = await ptb_app.bot.get_me(); bot_username = f"@{bot_info.username}" if bot_info and bot_info.username else "Info Fetch Error" | |
except Exception: bot_username = "Info Fetch Error (Not Running)" | |
except TimedOut: bot_status = "Timeout checking status"; logger.warning("Health check: Timeout getting bot info.") | |
except NetworkError as ne: bot_status = f"Network Error: {ne}"; logger.warning(f"Health check: NetworkError: {ne}") | |
except Exception as e: bot_status = f"Error checking: {type(e).__name__}"; logger.warning(f"Health check: Error getting bot info: {e}") | |
else: bot_status = "Not Initialized"; bot_username = "N/A" | |
# Updated health check output | |
return PlainTextResponse( | |
f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n" | |
f"--- Summarization ---\n" | |
f"Primary Model (Gemini): {GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'}\n" | |
f"Fallback Model (OpenRouter): {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'}\n" | |
f"--- YouTube Transcripts ---\n" | |
f"Primary (Lib): Enabled\n" | |
f"Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n" | |
f"Fallback 2 (Apify Actor): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n" | |
f"--- Website Scraping ---\n" | |
f"Primary (Crawl4AI): {'Enabled' if _crawl4ai_primary_scrape_enabled else 'DISABLED - Library Missing?'}\n" | |
f"Fallback 1 (Direct+BS4): Enabled\n" | |
f"Fallback 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n" | |
f"Fallback 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n" | |
f"Fallback 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled'}" | |
) | |
async def telegram_webhook(request: Request) -> Response: | |
"""Handles incoming updates from Telegram.""" | |
global WEBHOOK_SECRET | |
if not ptb_app: logger.error("Webhook received but PTB app not initialized."); return PlainTextResponse('Bot not initialized', status_code=503) | |
if not ptb_app.running: logger.warning("Webhook received but PTB app not running."); return PlainTextResponse('Bot not running', status_code=503) | |
if WEBHOOK_SECRET: | |
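        # Telegram echoes the configured secret back in the X-Telegram-Bot-Api-Secret-Token header;
        # requests without the expected value are rejected with 403.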
token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token") | |
if token_header != WEBHOOK_SECRET: | |
logger.warning(f"Webhook received invalid secret token. Header: '{token_header}'") | |
return Response(content="Invalid secret token", status_code=403) | |
try: | |
update_data = await request.json() | |
update = Update.de_json(data=update_data, bot=ptb_app.bot) | |
logger.debug(f"Processing update_id: {update.update_id} via webhook") | |
await ptb_app.process_update(update) | |
return Response(status_code=200) | |
except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400) | |
except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # OK to prevent TG retries | |
# --- Starlette App Definition (Unchanged) --- | |
app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] ) | |
logger.info("Starlette ASGI application created with health check and webhook routes.") | |
# --- Development Server (Unchanged) --- | |
if __name__ == '__main__': | |
import uvicorn | |
logger.warning("Running in development mode using Uvicorn directly - NOT for production!") | |
log_level = os.environ.get("LOGGING_LEVEL", "info").lower() | |
local_port = int(os.environ.get('PORT', 8080)) | |
uvicorn.run( "__main__:app", host='0.0.0.0', port=local_port, log_level=log_level, reload=True ) |
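    # Example local run (hypothetical values): the lifespan hook requires TELEGRAM_TOKEN and a
    # publicly reachable SPACE_HOST (for the webhook), so a bare local start needs something like:
    #   TELEGRAM_TOKEN=123:abc SPACE_HOST=example.ngrok.io PORT=8080 python main.py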