Spaces:
Running
Running
# main.py (Corrected SyntaxError at line 424 - Now with Gemini 2.0 as primary AND new Apify scraping fallbacks) | |
import os | |
import re | |
import logging | |
import asyncio | |
import json | |
import html | |
import contextlib | |
import traceback | |
import urllib.parse # Added for URL encoding | |
from typing import Optional, Dict, Any, Tuple | |
# --- Frameworks --- | |
from starlette.applications import Starlette | |
from starlette.routing import Route | |
from starlette.responses import PlainTextResponse, JSONResponse, Response | |
from starlette.requests import Request | |
# --- Telegram Bot --- | |
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot | |
from telegram.ext import ( | |
Application, | |
CommandHandler, | |
MessageHandler, | |
filters, | |
ContextTypes, | |
CallbackQueryHandler, | |
) | |
from telegram.constants import ParseMode | |
from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest, TelegramError | |
from telegram.request import HTTPXRequest, BaseRequest | |
# --- Other Libraries --- | |
import httpx | |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound | |
from bs4 import BeautifulSoup | |
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, before_sleep_log | |
try: | |
import lxml | |
DEFAULT_PARSER = 'lxml' | |
except ImportError: | |
DEFAULT_PARSER = 'html.parser' | |
# --- Google Gemini --- | |
try: | |
import google.generativeai as genai | |
from google.generativeai.types import HarmCategory, HarmBlockThreshold | |
_gemini_available = True | |
except ImportError: | |
genai = None | |
HarmCategory = None | |
HarmBlockThreshold = None | |
_gemini_available = False | |
# logger will be defined later, log warning after logger setup | |
# --- Logging Setup --- | |
logging.basicConfig( format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO ) | |
logging.getLogger("httpx").setLevel(logging.WARNING) | |
logging.getLogger("telegram.ext").setLevel(logging.INFO) | |
logging.getLogger('telegram.bot').setLevel(logging.INFO) | |
logging.getLogger("urllib3").setLevel(logging.INFO) | |
logging.getLogger('gunicorn.error').setLevel(logging.INFO) | |
logging.getLogger('uvicorn').setLevel(logging.INFO) | |
logging.getLogger('starlette').setLevel(logging.INFO) | |
if _gemini_available: logging.getLogger("google.ai.generativelanguage").setLevel(logging.WARNING) | |
logger = logging.getLogger(__name__) | |
logger.info(f"Logging configured. Using BS4 parser: {DEFAULT_PARSER}") | |
if not _gemini_available: logger.warning("google-generativeai library not found. Gemini functionality disabled.") | |
# --- Global variable for PTB app --- | |
ptb_app: Optional[Application] = None | |
# --- Environment Variable Loading & Configuration --- | |
logger.info("Attempting to load secrets and configuration...") | |
def get_secret(secret_name): | |
value = os.environ.get(secret_name) | |
if value: status = "Found"; log_length = min(len(value), 8); value_start = value[:log_length]; logger.info(f"Secret '{secret_name}': {status} (Value starts with: {value_start}...)") | |
else: status = "Not Found"; logger.warning(f"Secret '{secret_name}': {status}") | |
return value | |
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN') | |
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY') # Summarizer Fallback | |
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY') # Scrape Fallback 1 | |
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY') # YT Fallback 1 | |
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN') # YT Fallback 2 + Scrape Fallbacks 4 & 5 (NEW role) | |
RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY') # Scrape Fallbacks 2 & 3 | |
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET') | |
GEMINI_API_KEY = get_secret('GEMINI_API_KEY') # Primary Summarizer | |
# Models (User can still configure via env vars) | |
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Model | |
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts") # Default YT Actor | |
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001") # Primary Model | |
# Specific Actor IDs for Website Scraping Fallbacks | |
APIFY_CRAWLER_ACTOR_ID = "apify/website-content-crawler" # Fallback 4 | |
APIFY_TEXT_SCRAPER_ACTOR_ID = "karamelo/text-scraper-free" # Fallback 5 | |
if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.") | |
if not GEMINI_API_KEY: logger.error("❌ ERROR: GEMINI_API_KEY not found. Primary summarization (Gemini) will fail.") | |
if not OPENROUTER_API_KEY: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback summarization will fail.") | |
if not RAPIDAPI_KEY: logger.warning("⚠️ WARNING: RAPIDAPI_KEY not found. RapidAPI scraping fallbacks (2 & 3) will be unavailable.") | |
if not APIFY_API_TOKEN: logger.warning("⚠️ WARNING: APIFY_API_TOKEN not found. YT transcript fallback (2) and Website scraping fallbacks (4 & 5) will be unavailable.") # Updated warning | |
_gemini_primary_enabled = _gemini_available and bool(GEMINI_API_KEY) | |
if not _gemini_available: logger.warning("⚠️ WARNING: google-generativeai library missing. Gemini disabled.") | |
elif not GEMINI_API_KEY: logger.warning("⚠️ WARNING: GEMINI_API_KEY not found or empty. Gemini disabled.") | |
_openrouter_fallback_enabled = bool(OPENROUTER_API_KEY) | |
if not _openrouter_fallback_enabled: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback disabled.") | |
if not URLTOTEXT_API_KEY: logger.warning("Optional secret 'URLTOTEXT_API_KEY' not found. Web scraping fallback 1 unavailable.") | |
if not SUPADATA_API_KEY: logger.warning("Optional secret 'SUPADATA_API_KEY' not found. YT transcript fallback 1 unavailable.") | |
# APIFY_API_TOKEN warning handled above | |
if not WEBHOOK_SECRET: logger.info("Optional secret 'WEBHOOK_SECRET' not found. Webhook security disabled.") | |
logger.info("Secret loading and configuration check finished.") | |
logger.info(f"Using Gemini Model (Primary Summarizer): {GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'}") | |
logger.info(f"Using OpenRouter Model (Fallback Summarizer): {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'}") | |
logger.info(f"Using Apify Actor (YT Default): {APIFY_ACTOR_ID}") | |
logger.info(f"Using Apify Actor (Web Scrape Fallback 4): {APIFY_CRAWLER_ACTOR_ID}") | |
logger.info(f"Using Apify Actor (Web Scrape Fallback 5): {APIFY_TEXT_SCRAPER_ACTOR_ID}") | |
_apify_token_exists = bool(APIFY_API_TOKEN) | |
_urltotext_key_exists = bool(URLTOTEXT_API_KEY) | |
_rapidapi_key_exists = bool(RAPIDAPI_KEY) | |
if _gemini_primary_enabled: | |
try: genai.configure(api_key=GEMINI_API_KEY); logger.info("Google GenAI client configured successfully.") | |
except Exception as e: logger.error(f"Failed to configure Google GenAI client: {e}"); _gemini_primary_enabled = False | |
# --- Retry Decorator --- | |
async def retry_bot_operation(func, *args, **kwargs): | |
try: return await func(*args, **kwargs) | |
except BadRequest as e: | |
ignore_errors = [ "message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user", ] | |
if any(err in str(e).lower() for err in ignore_errors): logger.warning(f"Ignoring non-critical BadRequest: {e}"); return None | |
logger.error(f"Potentially critical BadRequest: {e}"); raise | |
except TelegramError as e: logger.warning(f"TelegramError (will retry if applicable): {e}"); raise | |
except Exception as e: logger.error(f"Unexpected error during bot operation: {e}", exc_info=True); raise | |
# --- Helper Functions --- | |
def is_youtube_url(url): | |
youtube_regex = re.compile( r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/' r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE) | |
match = youtube_regex.search(url); logger.debug(f"is_youtube_url '{url}': {bool(match)}"); return bool(match) | |
def extract_youtube_id(url): | |
youtube_regex = re.compile( r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/' r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE) | |
match = youtube_regex.search(url) | |
if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id | |
else: logger.warning(f"Could not extract YT ID from {url}"); return None | |
# --- Content Fetching Functions --- | |
# --- YouTube Transcript Fetching (Unchanged) --- | |
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]: | |
if not video_id: logger.error("[Supadata] No video_id provided"); return None | |
if not api_key: logger.error("[Supadata] API key missing."); return None | |
logger.info(f"[YT Fallback 1] Attempting fetch for video ID: {video_id} via Supadata") | |
api_endpoint = "https://api.supadata.ai/v1/youtube/transcript" | |
params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key} | |
try: | |
async with httpx.AsyncClient(timeout=30.0) as client: | |
response = await client.get(api_endpoint, headers=headers, params=params) | |
logger.debug(f"[Supadata] Status code {response.status_code} for {video_id}") | |
if response.status_code == 200: | |
try: | |
data = response.json() if response.text else None # Check if text exists before json decode | |
content = None | |
if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data") | |
if not content and response.text: content = response.text # Fallback to raw text if json parse fails or content key missing | |
if content and isinstance(content, str): logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}"); return content.strip() | |
else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None | |
except json.JSONDecodeError: logger.warning(f"[Supadata] Received 200 but failed JSON decode for {video_id}. Using raw text if available. Response: {response.text[:200]}"); return response.text.strip() if response.text else None | |
except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None | |
elif response.status_code in [401, 403]: logger.error(f"[Supadata] Auth error ({response.status_code}). Check API key."); return None | |
elif response.status_code == 404: logger.warning(f"[Supadata] Not found (404) for {video_id}."); return None | |
else: logger.error(f"[Supadata] Unexpected status {response.status_code} for {video_id}. Resp: {response.text[:200]}"); return None | |
except httpx.TimeoutException: logger.error(f"[Supadata] Timeout connecting for {video_id}"); return None | |
except httpx.RequestError as e: | |
if "CERTIFICATE_VERIFY_FAILED" in str(e): logger.error(f"[Supadata] SSL Cert Verify Failed for {video_id}: {e}") | |
else: logger.error(f"[Supadata] Request error for {video_id}: {e}") | |
return None | |
except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None | |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]: | |
"""Fallback YT 2: Fetches YouTube transcript using default Apify Actor.""" | |
global APIFY_ACTOR_ID # Uses the default YT actor ID | |
if not video_url: logger.error("[Apify YT] No video_url provided"); return None | |
if not api_token: logger.error("[Apify YT] API token missing."); return None | |
logger.info(f"[YT Fallback 2] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})") | |
sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items" | |
params = {"token": api_token} | |
# Input specific to karamelo~youtube-transcripts actor | |
payload = { | |
"urls": [video_url], | |
"outputFormat": "singleStringText", | |
"maxRetries": 5, | |
"channelHandleBoolean": False, | |
"channelNameBoolean": False, | |
"datePublishedBoolean": False, | |
"relativeDateTextBoolean": False, | |
} | |
headers = {"Content-Type": "application/json"} | |
try: | |
async with httpx.AsyncClient(timeout=120.0) as client: # Long timeout for potential YT processing | |
logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}") | |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload) | |
logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}") | |
if response.status_code == 200: | |
try: | |
results = response.json() | |
if isinstance(results, list) and len(results) > 0: | |
item = results[0] | |
content = None | |
# Check common keys for transcript text | |
if "captions" in item and isinstance(item["captions"], str): content = item["captions"] | |
elif "text" in item and isinstance(item["text"], str): content = item["text"] | |
elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"] | |
elif "captions" in item and isinstance(item["captions"], list): # Handle list format if needed | |
if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text")) | |
elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"]) | |
if content and isinstance(content, str): logger.info(f"[Apify YT] Success via REST for {video_url}. Length: {len(content)}"); return content.strip() | |
else: logger.warning(f"[Apify YT] Dataset item parsed but transcript content empty/invalid format for {video_url}. Item keys: {list(item.keys())}"); return None | |
else: logger.warning(f"[Apify YT] Actor success but dataset was empty for {video_url}. Response: {results}"); return None | |
except json.JSONDecodeError: logger.error(f"[Apify YT] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}"); return None | |
except Exception as e: logger.error(f"[Apify YT] Error processing success response for {video_url}: {e}", exc_info=True); return None | |
elif response.status_code == 400: logger.error(f"[Apify YT] Bad Request (400) for {video_url}. Check payload. Resp:{response.text[:200]}"); return None | |
elif response.status_code == 401: logger.error("[Apify YT] Auth error (401). Check token."); return None | |
elif response.status_code == 404: logger.error(f"[Apify YT] Endpoint/Actor Not Found (404). Actor: {APIFY_ACTOR_ID} Resp:{response.text[:200]}"); return None | |
else: logger.error(f"[Apify YT] Unexpected status {response.status_code} for {video_url}. Resp:{response.text[:200]}"); return None | |
except httpx.TimeoutException as e: logger.error(f"[Apify YT] Timeout during API interaction for {video_url}: {e}"); return None | |
except httpx.HTTPStatusError as e: logger.error(f"[Apify YT] HTTP Status Error during API interaction for {video_url}: {e}"); return None | |
except httpx.RequestError as e: logger.error(f"[Apify YT] Request error during API interaction for {video_url}: {e}"); return None | |
except Exception as e: logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True); return None | |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]: | |
global SUPADATA_API_KEY, APIFY_API_TOKEN | |
if not video_id: logger.error("get_youtube_transcript: No video_id"); return None | |
logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})") | |
transcript_text = None | |
# Method 1: youtube-transcript-api (Primary) | |
logger.info("[Primary YT] Attempting youtube-transcript-api...") | |
try: | |
transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] ) | |
if transcript_list: transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item]) | |
if transcript_text: logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})"); return transcript_text | |
else: logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}"); transcript_text = None | |
except NoTranscriptFound: logger.warning(f"[Primary YT] No transcript found via lib for {video_id}.") | |
except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.") | |
except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None | |
# Method 2: Supadata (Fallback 1) | |
if transcript_text is None: | |
logger.info("[Fallback YT 1] Trying Supadata API...") | |
if SUPADATA_API_KEY: | |
transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY) | |
if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text | |
else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.") | |
else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.") | |
# Method 3: Apify (Fallback 2 - Default YT Actor) | |
if transcript_text is None: | |
logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...") | |
if _apify_token_exists: # Use the global flag | |
transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN) | |
if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text | |
else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.") | |
else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.") | |
# Final Result | |
if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None | |
return transcript_text | |
# --- Website Content Fetching (MODIFIED SECTION) --- | |
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]: | |
"""Directly fetches URL content using httpx. (Primary Web Method - Fetching part)""" | |
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' } | |
try: | |
# Use a dedicated client for this potentially short-lived request | |
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client: | |
logger.debug(f"[Web Scrape Direct] Sending GET request to {url}") | |
response = await client.get(url) | |
logger.debug(f"[Web Scrape Direct] Received response {response.status_code} from {url}") | |
response.raise_for_status() # Raise HTTPStatusError for 4xx/5xx | |
content_type = response.headers.get('content-type', '').lower() | |
if 'html' not in content_type: | |
logger.warning(f"[Web Scrape Direct] Non-HTML content type received from {url}: {content_type}") | |
return None | |
try: | |
# Attempt to decode text, handle potential errors | |
return response.text | |
except Exception as e: | |
logger.error(f"[Web Scrape Direct] Error decoding response text for {url}: {e}") | |
return None | |
except httpx.HTTPStatusError as e: | |
logger.error(f"[Web Scrape Direct] HTTP error {e.response.status_code} fetching {url}: {e}") | |
except httpx.TimeoutException: | |
logger.error(f"[Web Scrape Direct] Timeout error fetching {url}") | |
except httpx.TooManyRedirects: | |
logger.error(f"[Web Scrape Direct] Too many redirects fetching {url}") | |
except httpx.RequestError as e: | |
# Catch other request-related errors (DNS issues, connection refused, etc.) | |
logger.error(f"[Web Scrape Direct] Request error fetching {url}: {e}") | |
except Exception as e: | |
# Catch any other unexpected errors during the request | |
logger.error(f"[Web Scrape Direct] Unexpected error fetching {url}: {e}", exc_info=True) | |
return None | |
async def get_website_content(url: str) -> Optional[str]: | |
"""Primary method: Fetches HTML directly and parses with BeautifulSoup.""" | |
if not url: logger.error("[Web Scrape Primary] No URL provided"); return None | |
logger.info(f"[Web Scrape Primary] Attempting direct fetch and parse for: {url}") | |
html_content = await fetch_url_content_for_scrape(url) | |
if not html_content: | |
logger.warning(f"[Web Scrape Primary] Direct fetch failed for {url}.") | |
return None | |
try: | |
# --- Parsing logic (run in thread to avoid blocking) --- | |
def parse_html(content: str) -> Optional[str]: | |
try: | |
soup = BeautifulSoup(content, DEFAULT_PARSER) | |
# Remove common non-content tags more aggressively | |
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]): | |
element.extract() | |
# Try to find main content areas with more specific selectors first | |
main_content = soup.find('main') or \ | |
soup.find('article') or \ | |
soup.find(role='main') or \ | |
soup.find(id=re.compile(r'content|main|body', re.I)) or \ | |
soup.find(class_=re.compile(r'content|main|body|article|post', re.I)) | |
target_element = main_content if main_content else soup.body | |
if not target_element: | |
logger.warning(f"[Web Scrape Primary Parse] Could not find body or main content container for {url}") | |
return None # Can't parse if no body/main | |
# Get text, clean up whitespace, join lines | |
lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()] | |
text = " ".join(lines) | |
# Basic check for minimal content length | |
if not text or len(text) < 50: # Adjust threshold as needed | |
logger.warning(f"[Web Scrape Primary Parse] Extracted text seems too short or empty after cleaning for {url}. Length: {len(text)}") | |
return None | |
return text | |
except Exception as parse_e: | |
# Log errors specific to parsing | |
logger.error(f"[Web Scrape Primary Parse] Error during BeautifulSoup parsing for {url}: {parse_e}", exc_info=False) # Keep log cleaner | |
return None | |
# --- End parsing logic --- | |
text_content = await asyncio.to_thread(parse_html, html_content) | |
if text_content: | |
logger.info(f"[Web Scrape Primary] Success via direct fetch & parse for {url} (final len: {len(text_content)})") | |
return text_content | |
else: | |
# Parsing failed or produced no usable content | |
logger.warning(f"[Web Scrape Primary] Parsing failed or yielded no content for {url}.") | |
return None | |
except Exception as e: | |
# Catch errors related to running the thread or other unexpected issues | |
logger.error(f"[Web Scrape Primary] Unexpected error during parsing process for {url}: {e}", exc_info=True) | |
return None | |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]: | |
"""Fallback 1: Fetches website content using urltotext.com API.""" | |
if not url: logger.error("[Web Scrape Fallback 1] No URL"); return None | |
if not api_key: logger.error("[Web Scrape Fallback 1] urltotext.com API key missing."); return None | |
logger.info(f"[Web Scrape Fallback 1] Attempting fetch for: {url} using urltotext.com API") | |
api_endpoint = "https://urltotext.com/api/v1/urltotext/" | |
payload = { "url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False } | |
headers = { "Authorization": f"Token {api_key}", "Content-Type": "application/json" } | |
try: | |
async with httpx.AsyncClient(timeout=45.0) as client: | |
logger.debug(f"[Web Scrape Fallback 1] Sending request to urltotext.com API for {url}") | |
response = await client.post(api_endpoint, headers=headers, json=payload) | |
logger.debug(f"[Web Scrape Fallback 1] Received status {response.status_code} from urltotext.com API for {url}") | |
if response.status_code == 200: | |
try: | |
data = response.json() | |
content = data.get("data", {}).get("content"); credits = data.get("credits_used", "N/A"); warning = data.get("data", {}).get("warning") | |
if warning: logger.warning(f"[Web Scrape Fallback 1] urltotext.com API Warning for {url}: {warning}") | |
if content and isinstance(content, str): logger.info(f"[Web Scrape Fallback 1] Success via urltotext.com API for {url}. Len: {len(content)}. Credits: {credits}"); return content.strip() | |
else: logger.warning(f"[Web Scrape Fallback 1] urltotext.com API success but content empty for {url}. Resp: {data}"); return None | |
except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 1] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None | |
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None | |
elif response.status_code in [400, 401, 402, 403, 422, 500]: logger.error(f"[Web Scrape Fallback 1] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None | |
else: logger.error(f"[Web Scrape Fallback 1] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None | |
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 1] Timeout connecting to urltotext.com API for {url}"); return None | |
except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 1] Request error connecting to urltotext.com API for {url}: {e}"); return None | |
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None | |
async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]: | |
"""Fallback 2: Fetches website content using Scraper's Proxy Parser via RapidAPI.""" | |
if not url: logger.error("[Web Scrape Fallback 2] No URL provided"); return None | |
if not api_key: logger.error("[Web Scrape Fallback 2] RapidAPI key missing."); return None | |
logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy Parser API") | |
api_host = "scrapers-proxy2.p.rapidapi.com" | |
encoded_url = urllib.parse.quote(url, safe='') # URL Encode the target URL | |
api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true" | |
headers = { | |
"x-rapidapi-host": api_host, | |
"x-rapidapi-key": api_key, | |
"accept-encoding": "gzip" # Recommended by API docs | |
} | |
try: | |
async with httpx.AsyncClient(timeout=40.0) as client: # Increased timeout slightly | |
logger.debug(f"[Web Scrape Fallback 2] Sending GET request to {api_host} for {url}") | |
response = await client.get(api_endpoint, headers=headers) | |
logger.debug(f"[Web Scrape Fallback 2] Received status {response.status_code} from {api_host} for {url}") | |
if response.status_code == 200: | |
try: | |
data = response.json() | |
# Try to extract content, potentially combining title and content | |
content = data.get("content") | |
title = data.get("title") | |
extracted_text = "" | |
if title and isinstance(title, str): extracted_text += title.strip() + ". " | |
if content and isinstance(content, str): extracted_text += content.strip() | |
if extracted_text and len(extracted_text) > 30: # Add basic length check | |
logger.info(f"[Web Scrape Fallback 2] Success via Scraper's Proxy Parser API for {url}. Len: {len(extracted_text)}") | |
return extracted_text | |
else: | |
logger.warning(f"[Web Scrape Fallback 2] Scraper's Proxy API success but content/title seems empty or too short for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}") | |
return None | |
except json.JSONDecodeError: | |
logger.error(f"[Web Scrape Fallback 2] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}") | |
return None | |
except Exception as e: | |
logger.error(f"[Web Scrape Fallback 2] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True) | |
return None | |
# Handle RapidAPI specific errors if known, otherwise general errors | |
elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 2] Auth error (401) with {api_host}. Check RapidAPI key."); return None | |
elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 2] Forbidden (403) from {api_host}. Check API subscription/limits."); return None | |
elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 2] Rate Limit (429) from {api_host}."); return None | |
elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 2] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None # API itself failed | |
else: logger.error(f"[Web Scrape Fallback 2] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None | |
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 2] Timeout connecting to {api_host} API for {url}"); return None | |
except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 2] Request error connecting to {api_host} API for {url}: {e}"); return None | |
except Exception as e: logger.error(f"[Web Scrape Fallback 2] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None | |
async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]: | |
"""Fallback 3: Fetches website content using AI Web Scraper via RapidAPI.""" | |
if not url: logger.error("[Web Scrape Fallback 3] No URL provided"); return None | |
if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None | |
logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using AI Web Scraper API") | |
api_host = "ai-web-scraper.p.rapidapi.com" | |
api_endpoint = f"https://{api_host}/extract_content/v1" | |
headers = { | |
'Content-Type': 'application/x-www-form-urlencoded', | |
'x-rapidapi-host': api_host, | |
'x-rapidapi-key': api_key | |
} | |
# Data needs to be form-encoded, httpx handles this with `data=` param | |
payload = {'url': url} | |
try: | |
async with httpx.AsyncClient(timeout=45.0) as client: # Slightly longer timeout for potential AI processing | |
logger.debug(f"[Web Scrape Fallback 3] Sending POST request to {api_host} for {url}") | |
response = await client.post(api_endpoint, headers=headers, data=payload) | |
logger.debug(f"[Web Scrape Fallback 3] Received status {response.status_code} from {api_host} for {url}") | |
if response.status_code == 200: | |
try: | |
data = response.json() | |
# Infer response structure - Try common keys for content | |
content = None | |
if isinstance(data, dict): | |
content = data.get("content") or data.get("text") or data.get("extracted_text") or data.get("result") | |
elif isinstance(data, str): # If the response *is* the string content | |
content = data | |
if content and isinstance(content, str) and len(content) > 30: # Add basic length check | |
logger.info(f"[Web Scrape Fallback 3] Success via AI Web Scraper API for {url}. Len: {len(content)}") | |
return content.strip() | |
else: | |
keys_info = f"Keys: {list(data.keys())}" if isinstance(data, dict) else f"Type: {type(data)}" | |
content_len = len(content) if content and isinstance(content, str) else 0 | |
logger.warning(f"[Web Scrape Fallback 3] AI Web Scraper API success but content empty/short/invalid format for {url}. {keys_info}. Length: {content_len}") | |
return None | |
except json.JSONDecodeError: | |
# Sometimes APIs might return plain text on error or success without JSON | |
raw_text = response.text | |
if raw_text and len(raw_text) > 30: | |
logger.warning(f"[Web Scrape Fallback 3] Failed JSON decode for AI Web Scraper, but found raw text content. Status:{response.status_code}. Using raw text. Len: {len(raw_text)}") | |
return raw_text.strip() | |
else: | |
logger.error(f"[Web Scrape Fallback 3] Failed JSON decode AI Web Scraper API for {url}. Status:{response.status_code}. Resp:{raw_text[:500]}") | |
return None | |
except Exception as e: | |
logger.error(f"[Web Scrape Fallback 3] Error processing AI Web Scraper API success response for {url}: {e}", exc_info=True) | |
return None | |
# Handle RapidAPI specific errors | |
elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 3] Auth error (401) with {api_host}. Check RapidAPI key."); return None | |
elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check API subscription/limits."); return None | |
elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 3] Rate Limit (429) from {api_host}."); return None | |
elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 3] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None | |
else: logger.error(f"[Web Scrape Fallback 3] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None | |
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 3] Timeout connecting to {api_host} API for {url}"); return None | |
except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 3] Request error connecting to {api_host} API for {url}: {e}"); return None | |
except Exception as e: logger.error(f"[Web Scrape Fallback 3] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None | |
# --- NEW Apify Website Scraping Fallbacks --- | |
async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]: | |
"""Generic function to run an Apify actor and get text content.""" | |
if not url: logger.error(f"[{actor_name}] No URL provided"); return None | |
if not api_token: logger.error(f"[{actor_name}] API token missing."); return None | |
logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})") | |
sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items" | |
params = {"token": api_token} | |
# Standard input for crawler-like actors: startUrls | |
# Provide sensible defaults if the actor supports them (can be overridden via env vars if needed later) | |
run_input = { | |
"startUrls": [{"url": url}], | |
# Add other potentially useful defaults if known for the actor, e.g.: | |
"maxCrawlPages": 1, # Only fetch the starting page for summarization | |
"crawlerType": "playwright:firefox" # Default to browser for better JS handling, adjust if needed | |
} | |
# Special handling for Text Scraper Free - might expect simpler input | |
if actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID: | |
# Try a simpler input structure based on its description if startUrls fails | |
# run_input = {"urls": [url]} # Alternative input structure - test if startUrls doesn't work | |
run_input = { "urls": [url] } # Stick with the basic input structure suggested by the description | |
logger.debug(f"[{actor_name}] Using simplified input: {run_input}") | |
headers = {"Content-Type": "application/json"} | |
try: | |
async with httpx.AsyncClient(timeout=120.0) as client: # Long timeout for potential crawling/rendering | |
logger.debug(f"[{actor_name}] POST Request to {sync_items_endpoint} for {url}") | |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input) | |
logger.debug(f"[{actor_name}] Received status code {response.status_code} for {url}") | |
if response.status_code == 200: | |
try: | |
results = response.json() | |
if isinstance(results, list) and len(results) > 0: | |
item = results[0] | |
content = None | |
# Check common keys for extracted text content | |
if "text" in item and isinstance(item["text"], str): content = item["text"] | |
elif "content" in item and isinstance(item["content"], str): content = item["content"] | |
elif "markdown" in item and isinstance(item["markdown"], str): content = item["markdown"] # Use Markdown if text is absent | |
elif "html" in item and isinstance(item["html"], str): # Last resort: try parsing HTML if text/MD missing | |
logger.warning(f"[{actor_name}] No 'text' or 'markdown' found, attempting to parse 'html' from result.") | |
soup = BeautifulSoup(item["html"], DEFAULT_PARSER) | |
content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip()) | |
if content and isinstance(content, str) and len(content) > 30: # Basic length check | |
logger.info(f"[{actor_name}] Success via REST for {url}. Length: {len(content)}") | |
return content.strip() | |
else: | |
content_len = len(content) if content and isinstance(content, str) else 0 | |
logger.warning(f"[{actor_name}] Dataset item parsed but text content empty/short/invalid format for {url}. Item keys: {list(item.keys())}. Length: {content_len}") | |
return None | |
else: | |
logger.warning(f"[{actor_name}] Actor success but dataset was empty for {url}. Response: {results}") | |
return None | |
except json.JSONDecodeError: | |
logger.error(f"[{actor_name}] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}") | |
return None | |
except Exception as e: | |
logger.error(f"[{actor_name}] Error processing success response for {url}: {e}", exc_info=True) | |
return None | |
elif response.status_code == 400: logger.error(f"[{actor_name}] Bad Request (400) for {url}. Check run_input. Resp:{response.text[:200]}"); return None | |
elif response.status_code == 401: logger.error(f"[{actor_name}] Auth error (401). Check token."); return None | |
elif response.status_code == 404: logger.error(f"[{actor_name}] Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None | |
else: logger.error(f"[{actor_name}] Unexpected status {response.status_code} for {url}. Resp:{response.text[:200]}"); return None | |
except httpx.TimeoutException as e: logger.error(f"[{actor_name}] Timeout during API interaction for {url}: {e}"); return None | |
except httpx.HTTPStatusError as e: logger.error(f"[{actor_name}] HTTP Status Error during API interaction for {url}: {e}"); return None | |
except httpx.RequestError as e: logger.error(f"[{actor_name}] Request error during API interaction for {url}: {e}"); return None | |
except Exception as e: logger.error(f"[{actor_name}] Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None | |
async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]: | |
"""Fallback 4 (NEW): Fetches website content using Apify Website Content Crawler.""" | |
return await _run_apify_actor_for_web_content( | |
url=url, | |
api_token=api_token, | |
actor_id=APIFY_CRAWLER_ACTOR_ID, | |
actor_name="Apify Crawler" | |
) | |
async def get_website_content_via_apify_text_scraper(url: str, api_token: str) -> Optional[str]: | |
"""Fallback 5 (NEW): Fetches website content using Apify Text Scraper Free.""" | |
return await _run_apify_actor_for_web_content( | |
url=url, | |
api_token=api_token, | |
actor_id=APIFY_TEXT_SCRAPER_ACTOR_ID, | |
actor_name="Apify Text Scraper" | |
) | |
# --- Summarization Functions (Unchanged) --- | |
async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]: | |
"""Internal function to call Gemini API. Returns (summary, error_message).""" | |
global GEMINI_MODEL, _gemini_primary_enabled | |
if not _gemini_primary_enabled: | |
logger.error("[Gemini Primary] Called but is disabled."); | |
return None, "Error: Primary AI service (Gemini) not configured/available." | |
logger.info(f"[Gemini Primary] Generating {summary_type} summary using {GEMINI_MODEL}. Input length: {len(text)}") | |
# Define prompts (same as before) | |
if summary_type == "paragraph": | |
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n" | |
"• Clear and simple language suitable for someone unfamiliar with the topic.\n" | |
"• Uses British English spellings throughout.\n" | |
"• Straightforward and understandable vocabulary; avoid complex terms.\n" | |
"• Presented as ONE SINGLE PARAGRAPH.\n" | |
"• No more than 85 words maximum; but does not have to be exactly 85.\n" | |
"• Considers the entire text content equally.\n" | |
"• Uses semicolons (;) instead of em dashes (– or —).\n" | |
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" # Added instruction | |
"Here is the text to summarise:") | |
else: # points summary | |
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this format:\n\n" | |
"• For each distinct topic or section identified in the text, create a heading.\n" | |
"• Each heading MUST be plain text without any formatting (e.g., Section Title).\n" | |
"• Immediately following each heading, list the key points as a bulleted list.\n" | |
"• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n" | |
"• The text within each bullet point should NOT contain any bold formatting.\n" | |
"• IMPORTANT: Never use bold formatting (double asterisks) within the text of the bullet points themselves.\n" | |
"• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n" | |
"• Use British English spellings throughout.\n" | |
"• Avoid overly complex or advanced vocabulary.\n" | |
"• Keep bullet points concise.\n" | |
"• Ensure the entire summary takes no more than two minutes to read.\n" | |
"• Consider the entire text's content, not just the beginning or a few topics.\n" | |
"• Use semicolons (;) instead of em dashes (– or —).\n" | |
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" # Added instruction | |
"Here is the text to summarise:") | |
# Input Length Check | |
MAX_INPUT_LENGTH_GEMINI = 900000 | |
if len(text) > MAX_INPUT_LENGTH_GEMINI: | |
logger.warning(f"[Gemini Primary] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating."); | |
text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)" | |
full_prompt = f"{prompt}\n\n{text}" | |
# Safety Settings (Block None) | |
safety_settings = { | |
HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, | |
HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, | |
HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, | |
HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, | |
} | |
if hasattr(HarmCategory, 'HARM_CATEGORY_CIVIC_INTEGRITY'): | |
safety_settings[HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY] = HarmBlockThreshold.BLOCK_NONE | |
logger.debug(f"[Gemini Primary] Using safety settings: { {k.name: v.name for k, v in safety_settings.items()} }") | |
try: | |
logger.debug(f"[Gemini Primary] Initializing model {GEMINI_MODEL}") | |
model = genai.GenerativeModel(GEMINI_MODEL) | |
logger.info(f"[Gemini Primary] Sending request to Gemini ({GEMINI_MODEL})...") | |
request_options = {"timeout": 120} | |
response = await model.generate_content_async( | |
full_prompt, | |
generation_config=genai.types.GenerationConfig(), # Basic config | |
safety_settings=safety_settings, | |
request_options=request_options | |
) | |
logger.info("[Gemini Primary] Received response from Gemini.") | |
# Check for immediate blocking reasons | |
if response.prompt_feedback and response.prompt_feedback.block_reason: | |
# Use .name for the enum value if block_reason is an enum, otherwise convert to string | |
block_reason_str = getattr(response.prompt_feedback.block_reason, 'name', str(response.prompt_feedback.block_reason)) | |
logger.warning(f"[Gemini Primary] Request blocked by API. Reason: {block_reason_str}"); | |
return None, f"Sorry, the primary AI model ({GEMINI_MODEL}) blocked the request (Reason: {block_reason_str})." | |
# Check candidate-level blocking and extract text safely | |
summary = None | |
finish_reason_str = 'UNKNOWN' | |
if response.candidates: | |
candidate = response.candidates[0] | |
# *** FIX START *** | |
# Use .name attribute of the finish_reason enum for comparison | |
finish_reason_name = getattr(candidate.finish_reason, 'name', None) | |
finish_reason_str = finish_reason_name or 'N/A' # Use name if available | |
if finish_reason_name == 'SAFETY': | |
# *** FIX END *** | |
safety_ratings_str = ", ".join([f"{rating.category.name}: {rating.probability.name}" for rating in candidate.safety_ratings]) | |
logger.warning(f"[Gemini Primary] Candidate blocked due to SAFETY. Finish Reason: {finish_reason_str}. Ratings: [{safety_ratings_str}]") | |
return None, f"Sorry, the primary AI model ({GEMINI_MODEL}) blocked the response due to safety filters ({finish_reason_str})." | |
# *** FIX START *** | |
# Compare names instead of potentially non-existent enum members | |
elif finish_reason_name not in ['STOP', 'MAX_TOKENS', None]: # Also check for None | |
# *** FIX END *** | |
logger.warning(f"[Gemini Primary] Candidate finished with non-standard reason: {finish_reason_str}") | |
# Safely access content text | |
if candidate.content and candidate.content.parts: | |
summary = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text')) | |
# Fallback check via response.text | |
if summary is None: | |
try: | |
summary = response.text | |
except ValueError as e: | |
logger.warning(f"[Gemini Primary] Error accessing response.text (likely blocked content based on previous checks): {e}"); | |
summary = None | |
if summary: | |
logger.info(f"[Gemini Primary] Success generating summary. Finish Reason: {finish_reason_str}. Output len: {len(summary)}"); | |
return summary.strip(), None | |
else: | |
# If summary is still None, report the finish reason found earlier | |
logger.warning(f"[Gemini Primary] Gemini returned empty summary or content was blocked. Final Finish Reason: {finish_reason_str}"); | |
return None, f"Sorry, the primary AI model ({GEMINI_MODEL}) did not provide a summary (Finish Reason: {finish_reason_str})." | |
except AttributeError as ae: | |
# Catch potential AttributeErrors during response processing if SDK structure differs | |
logger.error(f"[Gemini Primary] AttributeError during Gemini response processing: {ae}. SDK might be incompatible or response structure unexpected.", exc_info=True); | |
return None, f"Sorry, there was an issue processing the response from the primary AI service ({GEMINI_MODEL})." | |
except Exception as e: | |
logger.error(f"[Gemini Primary] Unexpected error during Gemini API call: {e}", exc_info=True); | |
return None, f"Sorry, an unexpected error occurred while using the primary AI service ({GEMINI_MODEL})." | |
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]: | |
"""Internal function to call OpenRouter API (Fallback). Returns (summary, error_message).""" | |
global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled | |
if not _openrouter_fallback_enabled: | |
logger.error("[OpenRouter Fallback] Called but is disabled."); | |
return None, "Error: Fallback AI service (OpenRouter) not configured/available." | |
logger.info(f"[OpenRouter Fallback] Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}") | |
# Define prompts (same as Gemini) | |
if summary_type == "paragraph": | |
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n" | |
"• Clear and simple language suitable for someone unfamiliar with the topic.\n" | |
"• Uses British English spellings throughout.\n" | |
"• Straightforward and understandable vocabulary; avoid complex terms.\n" | |
"• Presented as ONE SINGLE PARAGRAPH.\n" | |
"• No more than 85 words maximum; but does not have to be exactly 85.\n" | |
"• Considers the entire text content equally.\n" | |
"• Uses semicolons (;) instead of em dashes (– or —).\n" | |
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" | |
"Here is the text to summarise:") | |
else: # points summary | |
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this format:\n\n" | |
"• For each distinct topic or section identified in the text, create a heading.\n" | |
"• Each heading MUST be plain text without any formatting (e.g., Section Title).\n" | |
"• Immediately following each heading, list the key points as a bulleted list.\n" | |
"• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n" | |
"• The text within each bullet point should NOT contain any bold formatting.\n" | |
"• IMPORTANT: Never use bold formatting (double asterisks) within the text of the bullet points themselves.\n" | |
"• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n" | |
"• Use British English spellings throughout.\n" | |
"• Avoid overly complex or advanced vocabulary.\n" | |
"• Keep bullet points concise.\n" | |
"• Ensure the entire summary takes no more than two minutes to read.\n" | |
"• Consider the entire text's content, not just the beginning or a few topics.\n" | |
"• Use semicolons (;) instead of em dashes (– or —).\n" | |
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" | |
"Here is the text to summarise:") | |
# Input Length Check (Estimate based on typical model limits, adjust if needed for specific model) | |
MAX_INPUT_LENGTH_OR = 100000 # Conservative limit, adjust if OPENROUTER_MODEL has known higher limit | |
if len(text) > MAX_INPUT_LENGTH_OR: | |
logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}) for {OPENROUTER_MODEL}. Truncating."); | |
text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)" | |
full_prompt = f"{prompt}\n\n{text}" | |
headers = { | |
"Authorization": f"Bearer {OPENROUTER_API_KEY}", | |
"Content-Type": "application/json", | |
"HTTP-Referer": "https://github.com/your-repo", # Optional: Identify your app | |
"X-Title": "TelegramSummariserBot" # Optional: Identify your app | |
} | |
payload = { | |
"model": OPENROUTER_MODEL, | |
"messages": [{"role": "user", "content": full_prompt}], | |
# Optional params: | |
# "max_tokens": 1024, | |
# "temperature": 0.7, | |
} | |
openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions" | |
# Use reasonable timeouts for fallback | |
api_timeouts = httpx.Timeout(connect=10.0, read=45.0, write=10.0, pool=60.0) # Slightly longer read timeout | |
response = None | |
try: | |
async with httpx.AsyncClient(timeout=api_timeouts) as client: | |
logger.info(f"[OpenRouter Fallback] Sending request to OpenRouter ({OPENROUTER_MODEL}) with read timeout {api_timeouts.read}s...") | |
response = await client.post(openrouter_api_endpoint, headers=headers, json=payload) | |
logger.info(f"[OpenRouter Fallback] Received response from OpenRouter. Status code: {response.status_code}") | |
if response.status_code == 200: | |
try: | |
data = response.json() | |
if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0: | |
choice = data["choices"][0] | |
message = choice.get("message") | |
finish_reason = choice.get("finish_reason", "N/A") | |
if message and isinstance(message, dict): | |
summary = message.get("content") | |
if summary: | |
logger.info(f"[OpenRouter Fallback] Success via OpenRouter. Finish: {finish_reason}. Output len: {len(summary)}") | |
return summary.strip(), None | |
else: | |
logger.warning(f"[OpenRouter Fallback] OpenRouter success but content empty. Finish: {finish_reason}. Resp: {data}") | |
return None, f"Sorry, the fallback AI model ({OPENROUTER_MODEL}) returned an empty summary (Finish: {finish_reason})." | |
else: | |
logger.error(f"[OpenRouter Fallback] Unexpected message structure: {message}. Finish: {finish_reason}. Full: {data}") | |
return None, "Sorry, could not parse fallback AI response (message format)." | |
else: | |
error_details = data.get("error", {}) | |
logger.error(f"[OpenRouter Fallback] Unexpected choices structure or error in response: {data.get('choices')}. Error: {error_details}. Full: {data}") | |
return None, f"Sorry, could not parse fallback AI response (choices structure or error: {error_details.get('message', 'Unknown')})." | |
except json.JSONDecodeError: | |
logger.error(f"[OpenRouter Fallback] Failed JSON decode OpenRouter. Status:{response.status_code}. Resp:{response.text[:500]}") | |
return None, "Sorry, failed to understand fallback AI response." | |
except Exception as e: | |
logger.error(f"[OpenRouter Fallback] Error processing OpenRouter success response: {e}", exc_info=True) | |
return None, "Sorry, error processing fallback AI response." | |
# Handle specific error codes | |
elif response.status_code == 401: logger.error("[OpenRouter Fallback] API key invalid (401)."); return None, "Error: Fallback AI model configuration key is invalid." | |
elif response.status_code == 402: logger.error("[OpenRouter Fallback] Payment Required/Quota Exceeded (402)."); return None, f"Sorry, fallback AI service ({OPENROUTER_MODEL}) quota/limit issue." | |
elif response.status_code == 429: logger.warning("[OpenRouter Fallback] Rate Limit Exceeded (429)."); return None, f"Sorry, fallback AI model ({OPENROUTER_MODEL}) is busy. Try again." | |
elif response.status_code == 500: logger.error(f"[OpenRouter Fallback] Internal Server Error (500). Resp:{response.text[:500]}"); return None, f"Sorry, fallback AI service ({OPENROUTER_MODEL}) had an internal error." | |
else: | |
error_info = ""; | |
try: error_info = response.json().get("error", {}).get("message", "") | |
except Exception: pass | |
logger.error(f"[OpenRouter Fallback] Unexpected status {response.status_code}. Error: '{error_info}' Resp:{response.text[:500]}"); | |
return None, f"Sorry, fallback AI service ({OPENROUTER_MODEL}) returned unexpected status ({response.status_code})." | |
except httpx.TimeoutException as e: | |
logger.error(f"[OpenRouter Fallback] Timeout error ({type(e)}) connecting/reading from OpenRouter API: {e}") | |
return None, f"Sorry, the fallback AI service ({OPENROUTER_MODEL}) timed out." | |
except httpx.RequestError as e: | |
logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}") | |
return None, "Sorry, there was an error connecting to the fallback AI model service." | |
except Exception as e: | |
logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter call: {e}", exc_info=True) | |
return None, "Sorry, an unexpected error occurred while using the fallback AI service." | |
async def generate_summary(text: str, summary_type: str) -> str: | |
""" | |
Generates summary using Gemini (Primary) and falls back to OpenRouter if needed. | |
""" | |
global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL | |
logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})") | |
final_summary: Optional[str] = None | |
primary_error_message: Optional[str] = None # Store primary error separately | |
# --- Attempt 1: Gemini (Primary) --- | |
if _gemini_primary_enabled: | |
logger.info(f"[Summary Generation] Attempting primary AI: Gemini ({GEMINI_MODEL})") | |
final_summary, primary_error_message = await _call_gemini(text, summary_type) | |
if final_summary: | |
logger.info(f"[Summary Generation] Success with primary AI (Gemini).") | |
return final_summary # Return successful primary summary | |
else: | |
logger.warning(f"[Summary Generation] Primary AI (Gemini) failed. Error: {primary_error_message}. Proceeding to fallback.") | |
# Keep primary_error_message to potentially combine later | |
else: | |
logger.warning("[Summary Generation] Primary AI (Gemini) is disabled or unavailable. Proceeding directly to fallback.") | |
primary_error_message = "Primary AI (Gemini) unavailable." # Set initial error message | |
# --- Attempt 2: OpenRouter (Fallback) --- | |
if _openrouter_fallback_enabled: | |
logger.info(f"[Summary Generation] Attempting fallback AI: OpenRouter ({OPENROUTER_MODEL})") | |
fallback_summary, fallback_error_message = await _call_openrouter(text, summary_type) | |
if fallback_summary: | |
logger.info(f"[Summary Generation] Success with fallback AI (OpenRouter).") | |
return fallback_summary # Return successful fallback summary | |
else: | |
logger.error(f"[Summary Generation] Fallback AI (OpenRouter) also failed. Error: {fallback_error_message}") | |
# Combine error messages: Prioritize primary error if it exists | |
if primary_error_message and "unavailable" not in primary_error_message.lower(): | |
# If primary failed with a specific error (not just unavailable) | |
return f"{primary_error_message} Fallback AI ({OPENROUTER_MODEL}) also failed: {fallback_error_message}" | |
else: | |
# If primary was unavailable or its error was generic | |
return f"Sorry, summarization failed. Primary AI: {primary_error_message or 'N/A'}. Fallback AI ({OPENROUTER_MODEL}): {fallback_error_message}" | |
else: | |
logger.error("[Summary Generation] Fallback AI (OpenRouter) is disabled or unavailable. Cannot proceed.") | |
# Return the primary error if it exists, otherwise a generic message | |
if primary_error_message: | |
return f"{primary_error_message} Fallback AI is also unavailable." | |
else: | |
# Should not happen if logic is correct, but safeguard | |
return "Error: Both primary and fallback AI services for summarization are unavailable." | |
# Should ideally not be reached if logic above is sound | |
logger.error("[Summary Generation] Reached end of function without returning a summary or specific error.") | |
return "Sorry, an unknown error occurred during summary generation after trying all available models." | |
# --- Main Processing Logic (MODIFIED with new fallbacks) --- | |
async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None: | |
"""Handles the entire process: fetching content (with ALL fallbacks) and summarizing.""" | |
task_id = f"{user_id}-{message_id_to_edit or 'new'}"; logger.info(f"[Task {task_id}] Starting processing for URL: {url}") | |
background_request: Optional[BaseRequest] = None; bot: Optional[Bot] = None | |
try: background_request = HTTPXRequest( connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0 ); bot = Bot(token=bot_token, request=background_request) | |
except Exception as e: logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True); return | |
content: Optional[str] = None | |
user_feedback_message: Optional[str] = None | |
success: bool = False | |
status_message_id: Optional[int] = message_id_to_edit | |
message_to_delete_later_id : Optional[int] = None | |
try: | |
# --- 1. Initial User Feedback --- | |
processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (this might take a moment)..." # Updated text | |
if status_message_id: | |
try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'") | |
except Exception as e: logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new."); status_message_id = None | |
if not status_message_id: | |
try: | |
status_message = await retry_bot_operation( bot.send_message, chat_id=chat_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN ) | |
if status_message: message_to_delete_later_id = status_message.message_id; logger.debug(f"[Task {task_id}] Sent new status message {message_to_delete_later_id}") | |
else: raise RuntimeError("Failed to send status message after retries.") | |
except Exception as e: logger.error(f"[Task {task_id}] CRITICAL: Failed to send new status message: {e}"); raise | |
try: | |
# --- 2. Content Fetching (Chain of methods) --- | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}") | |
if is_youtube: | |
# --- YouTube Transcript Logic (Unchanged) --- | |
video_id = extract_youtube_id(url) | |
if video_id: content = await get_youtube_transcript(video_id, url) # Tries lib -> Supadata -> Apify YT Actor | |
else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format." | |
if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)." | |
else: | |
# --- Website Scraping Logic (with ALL Fallbacks) --- | |
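                # Fallback chain for non-YouTube pages, tried in order until one yields content:
                #   1) direct fetch + BS4, 2) urltotext.com API, 3) Scraper's Proxy (RapidAPI),
                #   4) AI Web Scraper (RapidAPI), 5) Apify Website Content Crawler, 6) Apify Text Scraper.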
global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN | |
global _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists | |
# Method 1: Primary Scrape (Direct Fetch + BS4) | |
logger.info(f"[Task {task_id}] Trying Web Scrape Method 1 (Direct Fetch + BS4)...") | |
content = await get_website_content(url) | |
# Method 2: Fallback 1 (urltotext.com) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 2 (urltotext.com)...") | |
if _urltotext_key_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_api(url, URLTOTEXT_API_KEY) | |
if not content: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) failed.") | |
else: logger.warning("[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.") | |
# Method 3: Fallback 2 (Scraper's Proxy via RapidAPI) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 2 failed. Trying Method 3 (Scraper's Proxy)...") | |
if _rapidapi_key_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_scrapers_proxy(url, RAPIDAPI_KEY) | |
if not content: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) failed.") | |
else: logger.warning("[Task {task_id}] Method 3 (Scraper's Proxy) RapidAPI key unavailable. Skipping.") | |
# Method 4: Fallback 3 (AI Web Scraper via RapidAPI) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 3 failed. Trying Method 4 (AI Web Scraper)...") | |
if _rapidapi_key_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_ai_web_scraper(url, RAPIDAPI_KEY) | |
if not content: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) failed.") | |
else: logger.warning("[Task {task_id}] Method 4 (AI Web Scraper) RapidAPI key unavailable. Skipping.") | |
# Method 5: Fallback 4 (Apify Website Content Crawler - NEW) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 4 failed. Trying Method 5 (Apify Crawler)...") | |
if _apify_token_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_apify_crawler(url, APIFY_API_TOKEN) | |
if not content: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) failed.") | |
else: logger.warning("[Task {task_id}] Method 5 (Apify Crawler) APIFY_API_TOKEN unavailable. Skipping.") | |
# Method 6: Fallback 5 (Apify Text Scraper Free - NEW) | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 5 failed. Trying Method 6 (Apify Text Scraper)...") | |
if _apify_token_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_apify_text_scraper(url, APIFY_API_TOKEN) | |
if not content: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) failed.") | |
else: logger.warning("[Task {task_id}] Method 6 (Apify Text Scraper) APIFY_API_TOKEN unavailable. Skipping.") | |
# Final check for website content after all methods | |
if not content and not user_feedback_message: | |
user_feedback_message = "Sorry, I couldn't fetch readable content from that website using multiple methods (blocked/dynamic content/empty?)." # Updated message | |
# --- 3. Summarization --- | |
if content: | |
logger.info(f"[Task {task_id}] Content fetched successfully (len:{len(content)}). Generating summary.") | |
# Update status message before summarization | |
try: | |
status_update_msg_id = message_to_delete_later_id or status_message_id | |
if status_update_msg_id: | |
await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_update_msg_id, text=f"Content fetched! Now generating '{summary_type}' summary...", parse_mode=ParseMode.MARKDOWN, reply_markup=None ) | |
except Exception as edit_e: logger.warning(f"[Task {task_id}] Failed to edit status message before summary: {edit_e}") | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
final_summary = await generate_summary(content, summary_type) # Calls Gemini -> OpenRouter | |
if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"): | |
user_feedback_message = final_summary # Use the error message from summarizer | |
logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}") | |
else: | |
# Split long messages if needed | |
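                    # Telegram caps a single message at 4096 characters, hence the chunking below.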
max_length = 4096 | |
summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)] | |
# Send summary parts (using ParseMode=None as AI prompt instructs no Markdown formatting) | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} ) | |
for part in summary_parts[1:]: | |
await asyncio.sleep(0.5) # Small delay between parts | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} ) | |
success = True | |
logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).") | |
user_feedback_message = None # Clear any previous error message if summarization succeeded | |
# --- 4. Handle Final Failure Feedback --- | |
if user_feedback_message: # If any step failed and set a message | |
logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}") | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} ) | |
except Exception as e: | |
# Catch unexpected errors during the inner try block (fetching/summarizing) | |
logger.error(f"[Task {task_id}] Unexpected error during processing: {e}", exc_info=True) | |
user_feedback_message = "Oops! Something went really wrong during processing. Please try again later." | |
try: await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message ) | |
except Exception: logger.error(f"[Task {task_id}] Failed to send unexpected error feedback.") | |
except Exception as outer_e: | |
# Catch errors in the outer setup (bot creation, status message sending) | |
logger.critical(f"[Task {task_id}] Critical outer error: {outer_e}", exc_info=True) | |
try: | |
if bot: await retry_bot_operation( bot.send_message, chat_id=chat_id, text="❌ Critical internal error occurred. Could not start processing." ) | |
except Exception: logger.exception(f"[Task {task_id}] Failed even to send critical error message.") | |
finally: | |
# --- 5. Cleanup --- | |
# Delete the "Processing..." or original button message | |
delete_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id | |
if delete_target_id and bot: | |
try: await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=delete_target_id); logger.debug(f"[Task {task_id}] Deleted status/button message {delete_target_id}") | |
except Exception as del_e: logger.warning(f"[Task {task_id}] Failed to delete status/button message {delete_target_id}: {del_e}") | |
# Close the background bot's HTTP client | |
if background_request and hasattr(background_request, '_client') and background_request._client: | |
try: await background_request._client.aclose(); logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.") | |
except Exception as e: logger.warning(f"[Task {task_id}] Error closing background bot's client: {e}") | |
logger.info(f"[Task {task_id}] Task completed. Success: {success}") | |
# --- Telegram Handlers (Unchanged) --- | |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
    user = update.effective_user
    if not user or not update.message: return
    mention = user.mention_html()
logger.info(f"User {user.id} ({user.username or 'no_username'}) used /start.") | |
await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" ) | |
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
user = update.effective_user | |
if not user or not update.message: return | |
logger.info(f"User {user.id} ({user.username or 'no_username'}) used /help.") | |
help_text = ( "🔍 **How to use this bot:**\n\n" | |
"1. Send me any YouTube video link or website URL.\n" | |
"2. I'll ask how you want it summarised (paragraph or points).\n" | |
"3. Click the button for your choice.\n" | |
"4. Wait while I fetch the content and generate the summary!\n\n" | |
"⚙️ I try multiple methods to get content, especially for tricky websites or YouTube videos without standard transcripts.\n\n" | |
"**Commands:**\n" | |
"`/start` - Display the welcome message\n" | |
"`/help` - Show this help message" ) | |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN) | |
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
if not update.message or not update.message.text: return | |
url = update.message.text.strip(); user = update.effective_user | |
if not user: return | |
    # Basic URL check - require an http:// or https:// scheme followed by a plausible host/path
url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE) | |
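    # Illustrative examples: "https://example.com/article" (or a URL embedded in a longer message)
    # is picked up; a bare "example.com" without an http(s):// scheme is ignored.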
if not url_pattern.search(url): | |
logger.debug(f"Ignoring non-URL message from {user.id}: {url[:100]}") | |
# Removed reply for non-URLs to avoid spamming | |
return | |
    # If the message contains multiple URLs, only the first match is processed:
    # the regex search below extracts the first valid URL from the message text.
match = url_pattern.search(url) | |
if match: | |
extracted_url = match.group(0) | |
logger.info(f"User {user.id} ({user.username or 'no_username'}) sent potential URL: {extracted_url}") | |
context.user_data['url_to_summarize'] = extracted_url | |
context.user_data['original_message_id'] = update.message.message_id | |
keyboard = [[ InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"), InlineKeyboardButton("Points Summary", callback_data="points") ]] | |
reply_markup = InlineKeyboardMarkup(keyboard) | |
try: | |
await update.message.reply_text( | |
f"Okay, I see this link:\n`{extracted_url}`\n\nHow would you like it summarised?", | |
reply_markup=reply_markup, | |
                link_preview_options={'is_disabled': True},
parse_mode=ParseMode.MARKDOWN | |
) | |
except BadRequest as e: | |
# Handle cases like user blocking the bot after sending URL but before reply | |
if "chat not found" in str(e).lower() or "bot was blocked by the user" in str(e).lower(): | |
logger.warning(f"Could not reply to user {user.id} (chat not found or blocked).") | |
else: | |
logger.error(f"BadRequest replying to URL message from {user.id}: {e}") | |
except Exception as e: | |
logger.error(f"Error replying to URL message from {user.id}: {e}", exc_info=True) | |
else: # Should not be reached due to initial check, but as safeguard | |
logger.debug(f"Ignoring message from {user.id} that passed initial check but no URL found by regex: {url[:100]}") | |
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
query = update.callback_query | |
if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return | |
user = query.from_user; summary_type = query.data; query_id = query.id | |
try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id} ({user.username or 'no_username'})") | |
except BadRequest as e: | |
if "query is too old" in str(e).lower(): logger.warning(f"Callback query {query_id} is too old to answer."); return # Don't proceed with old queries | |
else: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True) | |
except Exception as e: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True) | |
# Check if URL still exists in context (might be cleared or from old message) | |
url = context.user_data.get('url_to_summarize'); message_id_to_edit = query.message.message_id | |
logger.info(f"User {user.id} chose '{summary_type}' for msg {message_id_to_edit}. URL in context: {'Yes' if url else 'No'}") | |
if not url: | |
logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Might be an old button click.") | |
try: | |
# Try to edit the message to inform the user, but expect it might fail if message is old/deleted | |
await query.edit_message_text(text="Sorry, I couldn't find the original URL for this request (it might be too old). Please send the link again.") | |
except BadRequest as e: | |
# Ignore common errors when editing old messages | |
if "message is not modified" in str(e).lower() or "message to edit not found" in str(e).lower(): pass | |
else: logger.warning(f"Failed to edit 'URL not found' message {message_id_to_edit} for user {user.id}: {e}") | |
except Exception as e: | |
logger.warning(f"Error editing 'URL not found' message {message_id_to_edit} for user {user.id}: {e}") | |
return # Stop processing if context is lost | |
# Clear context *after* successfully retrieving URL for this specific callback | |
context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}") | |
# Check essential configurations before scheduling task | |
global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled | |
if not TELEGRAM_TOKEN: | |
logger.critical("TELEGRAM_TOKEN missing in callback!") | |
try: await query.edit_message_text(text="❌ Bot configuration error (Token Missing). Cannot proceed.") | |
except Exception: pass | |
return | |
if not _gemini_primary_enabled and not _openrouter_fallback_enabled: | |
logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid in callback!") | |
try: await query.edit_message_text(text="❌ AI configuration error: No summarization models available.") | |
except Exception: pass | |
return | |
# Log warnings if one model is unavailable but don't stop the process | |
elif not _gemini_primary_enabled: | |
logger.warning("Primary AI (Gemini) is unavailable, will rely on fallback summarizer.") | |
elif not _openrouter_fallback_enabled: | |
logger.warning("Fallback AI (OpenRouter) is unavailable, will rely on primary summarizer.") | |
logger.info(f"Scheduling background task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit}") | |
# Schedule the background task | |
asyncio.create_task( | |
process_summary_task( | |
user_id=user.id, | |
chat_id=query.message.chat_id, | |
message_id_to_edit=message_id_to_edit, # Pass the button message ID | |
url=url, | |
summary_type=summary_type, | |
bot_token=TELEGRAM_TOKEN | |
), | |
name=f"SummaryTask-{user.id}-{message_id_to_edit}" | |
) | |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None: | |
"""Log Errors caused by Updates.""" | |
logger.error("Exception while handling an update:", exc_info=context.error) | |
# You could add more specific handling or user notification here if needed | |
# --- Application Setup & Web Framework (Unchanged) --- | |
async def setup_bot_config() -> Application: | |
logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN | |
if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.") | |
# Configure timeouts for bot requests | |
custom_request = HTTPXRequest( | |
connect_timeout=10.0, | |
read_timeout=30.0, # Timeout for reading response from Telegram | |
write_timeout=30.0, # Timeout for sending request to Telegram | |
pool_timeout=60.0 # Timeout for getting a connection from the pool | |
) | |
application = Application.builder().token(TELEGRAM_TOKEN).request(custom_request).build() | |
# Command Handlers | |
application.add_handler(CommandHandler("start", start)) | |
application.add_handler(CommandHandler("help", help_command)) | |
# Message Handler for URLs (more specific filter) | |
# Matches messages containing URL entities or text_link entities (Markdown links) | |
url_filter = filters.Entity("url") | filters.Entity("text_link") | |
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND & url_filter, handle_potential_url)) | |
# Callback Query Handler (Buttons) | |
application.add_handler(CallbackQueryHandler(handle_summary_type_callback)) | |
# Error Handler | |
application.add_error_handler(error_handler) | |
logger.info("Telegram application handlers configured."); return application | |
@contextlib.asynccontextmanager  # Starlette expects lifespan to be an async context manager
async def lifespan(app: Starlette):
global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN | |
logger.info("ASGI Lifespan: Startup initiated..."); | |
if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.") | |
bot_setup_successful = False | |
try: | |
ptb_app = await setup_bot_config() | |
await ptb_app.initialize() | |
bot_info = await ptb_app.bot.get_me() | |
logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})") | |
bot_setup_successful = True # Mark bot as setup | |
# --- Webhook Setup --- | |
current_webhook_info = await ptb_app.bot.get_webhook_info() | |
if current_webhook_info and current_webhook_info.url: | |
logger.info(f"Found existing webhook: {current_webhook_info.url}. Attempting to delete...") | |
try: | |
if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Existing webhook deleted.") | |
else: logger.warning("Failed delete existing webhook (API returned False). Might be okay.") | |
except Exception as e: logger.warning(f"Could not delete existing webhook: {e}. Proceeding anyway."); await asyncio.sleep(1) | |
space_host = os.environ.get("SPACE_HOST") | |
webhook_path = "/webhook" # Ensure this matches the Route below | |
full_webhook_url = None | |
if space_host: | |
protocol = "https" | |
host = space_host.split('://')[-1] | |
full_webhook_url = f"{protocol}://{host.rstrip('/')}{webhook_path}" | |
if full_webhook_url: | |
logger.info(f"Setting webhook to: {full_webhook_url}") | |
set_webhook_args = { | |
"url": full_webhook_url, | |
"allowed_updates": Update.ALL_TYPES, | |
"drop_pending_updates": True | |
} | |
if WEBHOOK_SECRET: | |
set_webhook_args["secret_token"] = WEBHOOK_SECRET | |
logger.info("Webhook secret token is configured.") | |
await asyncio.sleep(1.5) # Slightly longer delay before setting new webhook | |
try: | |
await ptb_app.bot.set_webhook(**set_webhook_args) | |
webhook_info = await ptb_app.bot.get_webhook_info() # Verify setting | |
if webhook_info.url == full_webhook_url: | |
logger.info(f"Webhook successfully set and verified: URL='{webhook_info.url}', Secret={bool(WEBHOOK_SECRET)}") | |
else: | |
logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got '{webhook_info.url}'.") | |
# Decide if this is fatal. For now, log error and continue. | |
# raise RuntimeError("Webhook URL mismatch after setting.") | |
await ptb_app.start() # Start listening for updates via webhook | |
logger.info("PTB Application started in webhook mode.") | |
except Exception as e: | |
logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True) | |
raise RuntimeError(f"Failed to set webhook: {e}") from e | |
else: | |
logger.critical("Could not construct webhook URL from SPACE_HOST.") | |
raise RuntimeError("Webhook URL could not be determined.") | |
else: | |
logger.critical("SPACE_HOST environment variable not found. Cannot set webhook automatically.") | |
raise RuntimeError("SPACE_HOST environment variable is missing.") | |
logger.info("ASGI Lifespan: Startup complete."); yield # --- Application runs here --- | |
except Exception as startup_err: | |
logger.critical(f"Application startup failed: {startup_err}", exc_info=True) | |
# Ensure cleanup even if startup fails partially | |
if ptb_app and bot_setup_successful: | |
if ptb_app.running: await ptb_app.stop() | |
await ptb_app.shutdown() | |
raise # Propagate error to stop Starlette | |
finally: | |
# --- Shutdown Logic --- | |
logger.info("ASGI Lifespan: Shutdown initiated...") | |
if ptb_app and bot_setup_successful: # Only cleanup if bot was successfully set up initially | |
if ptb_app.running: logger.info("Stopping PTB Application..."); await ptb_app.stop() | |
logger.info("Shutting down PTB Application..."); await ptb_app.shutdown(); logger.info("PTB Application shut down.") | |
# Attempt to clean up webhook on graceful shutdown | |
try: | |
logger.info("Attempting to delete webhook on shutdown...") | |
# Check if bot object still exists and has delete_webhook method | |
if ptb_app.bot and hasattr(ptb_app.bot, 'delete_webhook'): | |
if await ptb_app.bot.delete_webhook(drop_pending_updates=True): | |
logger.info("Webhook deleted on shutdown.") | |
else: | |
logger.warning("Failed to delete webhook on shutdown (API returned False).") | |
else: | |
logger.warning("Cannot delete webhook: Bot object unavailable.") | |
except Exception as e: | |
logger.warning(f"Could not delete webhook during shutdown: {e}") | |
else: logger.info("PTB application was not fully initialized or failed during startup. No shutdown actions needed.") | |
logger.info("ASGI Lifespan: Shutdown complete.") | |
async def health_check(request: Request) -> PlainTextResponse: | |
"""Simple health check endpoint.""" | |
global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled | |
global _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY # Added missing globals | |
bot_status = "Not Initialized" | |
bot_username = "N/A" | |
# Check if ptb_app exists first | |
if ptb_app: | |
try: | |
# Check if the application is running | |
if ptb_app.running: | |
# Try to get bot info if running | |
# Use a shorter timeout specifically for the health check get_me if possible, | |
# otherwise rely on the default bot request timeout. | |
# Note: ptb_app.bot.get_me doesn't accept a timeout directly. | |
bot_info = await ptb_app.bot.get_me() | |
bot_username = f"@{bot_info.username}" if bot_info and bot_info.username else "Info Fetch Error" | |
bot_status = "Running" | |
else: | |
# If ptb_app exists but isn't running, it's likely initialized but stopped or not started yet | |
bot_status = "Initialized but Not Running" | |
# Try getting bot info even if not running, might work if initialized | |
if ptb_app.bot: | |
try: | |
bot_info = await ptb_app.bot.get_me() | |
bot_username = f"@{bot_info.username}" if bot_info and bot_info.username else "Info Fetch Error" | |
except Exception: | |
bot_username = "Info Fetch Error (Not Running)" | |
except TimedOut: | |
bot_status = "Timeout checking status" | |
logger.warning("Health check: Timeout getting bot info.") | |
except NetworkError as ne: | |
bot_status = f"Network Error checking status: {ne}" | |
logger.warning(f"Health check: NetworkError getting bot info: {ne}") | |
except Exception as e: | |
# Catch other potential errors during status check | |
bot_status = f"Error checking status: {type(e).__name__}" | |
logger.warning(f"Health check: Error getting bot info: {e}") | |
else: | |
# ptb_app itself is None (likely very early startup or major failure) | |
bot_status = "Not Initialized" | |
bot_username = "N/A" | |
return PlainTextResponse( | |
f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n" | |
f"Primary Model (Gemini): {GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'}\n" | |
f"Fallback Model (OpenRouter): {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'}\n" | |
f"YT Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n" # Check key existence | |
f"YT Fallback 2 (Apify Actor): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n" | |
f"Web Scrape 1 (Direct+BS4): Enabled\n" | |
f"Web Scrape 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n" | |
f"Web Scrape 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n" | |
f"Web Scrape 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled'}" | |
) | |
async def telegram_webhook(request: Request) -> Response: | |
"""Handles incoming updates from Telegram.""" | |
global WEBHOOK_SECRET | |
if not ptb_app: logger.error("Webhook received but PTB application not initialized."); return PlainTextResponse('Bot not initialized', status_code=503) | |
if not ptb_app.running: logger.warning("Webhook received but PTB application not running."); return PlainTextResponse('Bot not running, cannot process update', status_code=503) | |
# --- Security Check --- | |
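    # Telegram echoes the secret_token passed to set_webhook in the
    # "X-Telegram-Bot-Api-Secret-Token" header of every delivery, so any mismatch means the
    # request did not originate from Telegram and is rejected with 403.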
if WEBHOOK_SECRET: | |
token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token") | |
if token_header != WEBHOOK_SECRET: | |
logger.warning(f"Webhook received with invalid secret token. Header: '{token_header}'") | |
return Response(content="Invalid secret token", status_code=403) # Forbidden | |
# --- Process Update --- | |
try: | |
update_data = await request.json() | |
update = Update.de_json(data=update_data, bot=ptb_app.bot) | |
logger.debug(f"Processing update_id: {update.update_id} via webhook") | |
# Use process_update which handles queuing internally | |
await ptb_app.process_update(update) | |
return Response(status_code=200) # OK - Tell Telegram we received it and will process | |
except json.JSONDecodeError: | |
logger.error("Webhook received invalid JSON.") | |
return PlainTextResponse('Bad Request: Invalid JSON', status_code=400) | |
except Exception as e: | |
logger.error(f"Error processing webhook update: {e}", exc_info=True) | |
# Still return 200 OK to Telegram to prevent retries for this specific update | |
return Response(status_code=200) | |
# --- Starlette App Definition --- | |
app = Starlette( | |
debug=False, # Keep False for production/Hugging Face | |
lifespan=lifespan, | |
routes=[ | |
Route("/", endpoint=health_check, methods=["GET"]), | |
Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), # Must match webhook_path | |
] | |
) | |
logger.info("Starlette ASGI application created with health check and webhook routes.") | |
# --- Development Server (if run directly) --- | |
if __name__ == '__main__': | |
import uvicorn | |
logger.warning("Running in development mode using Uvicorn directly - NOT for production!") | |
log_level = os.environ.get("LOGGING_LEVEL", "info").lower() | |
local_port = int(os.environ.get('PORT', 8080)) | |
uvicorn.run( | |
"__main__:app", | |
host='0.0.0.0', | |
port=local_port, | |
log_level=log_level, | |
reload=True, # Enable auto-reload for local development convenience | |
# Consider adding reload_dirs=['.'] if reload doesn't pick up changes reliably | |
) |
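    # Alternatively (this file being main.py), the dev server can be started from a shell with:
    #   uvicorn main:app --host 0.0.0.0 --port 8080 --reload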