# main.py (Full Code - Specific April 2025 Models: Llama 4 Scout & DeepSeek V3 Free - FIXED)
import os
import re
import logging
import asyncio
import json
import html
import contextlib
import traceback
import urllib.parse
from typing import Optional, Dict, Any, Tuple
# --- Frameworks ---
from starlette.applications import Starlette
from starlette.routing import Route
from starlette.responses import PlainTextResponse, JSONResponse, Response
from starlette.requests import Request
# --- Telegram Bot ---
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot
from telegram.ext import (
    Application,
    CommandHandler,
    MessageHandler,
    filters,
    ContextTypes,
    CallbackQueryHandler,
)
from telegram.constants import ParseMode
from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest, TelegramError
from telegram.request import HTTPXRequest, BaseRequest
# --- Other Libraries ---
import httpx
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, before_sleep_log
try:
    import lxml
    DEFAULT_PARSER = 'lxml'
except ImportError:
    DEFAULT_PARSER = 'html.parser'
# --- Google Gemini ---
try:
    import google.generativeai as genai
    from google.generativeai.types import HarmCategory, HarmBlockThreshold
    import google.api_core.exceptions  # Needed for the specific Gemini API error handlers below
    _gemini_sdk_available = True
except ImportError:
    genai = None
    HarmCategory = None
    HarmBlockThreshold = None
    google = None  # Set google to None if the core import fails too
    _gemini_sdk_available = False
# --- Groq SDK ---
try:
    from groq import Groq, GroqError
    _groq_sdk_available = True
except ImportError:
    Groq = None
    GroqError = None
    _groq_sdk_available = False
# --- Logging Setup ---
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("telegram.ext").setLevel(logging.INFO)
logging.getLogger('telegram.bot').setLevel(logging.INFO)
logging.getLogger("urllib3").setLevel(logging.INFO)
logging.getLogger('gunicorn.error').setLevel(logging.INFO)
logging.getLogger('uvicorn').setLevel(logging.INFO)
logging.getLogger('starlette').setLevel(logging.INFO)
if _gemini_sdk_available: logging.getLogger("google.ai.generativelanguage").setLevel(logging.WARNING)
if _groq_sdk_available: logging.getLogger("groq").setLevel(logging.INFO)
logger = logging.getLogger(__name__)
logger.info(f"Logging configured. Using BS4 parser: {DEFAULT_PARSER}")
if not _gemini_sdk_available: logger.warning("google-generativeai library not found. Gemini functionality disabled.")
if not _groq_sdk_available: logger.warning("groq library not found. Groq functionality disabled.")
# --- Global variable for PTB app ---
ptb_app: Optional[Application] = None
# --- Environment Variable Loading & Configuration ---
logger.info("Attempting to load secrets and configuration...")
def get_secret(secret_name):
    value = os.environ.get(secret_name)
    if value: status = "Found"; log_length = min(len(value), 8); value_start = value[:log_length]; logger.info(f"Secret '{secret_name}': {status} (Value starts with: {value_start}...)")
    else: status = "Not Found"; logger.warning(f"Secret '{secret_name}': {status}")
    return value
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
GROQ_API_KEY = get_secret('GROQ_API_KEY')  # For Llama 4
GEMINI_API_KEY = get_secret('GEMINI_API_KEY')  # For Gemini 2.5 Flash Preview, 2.5 Pro Exp and 2.0 Flash
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY')  # For DeepSeek
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
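# All secrets are read from environment variables (e.g. Hugging Face Spaces
# secrets). For local testing they can be exported before launch; these values
# are placeholders, not real credentials:
#   export TELEGRAM_TOKEN="123456:ABC-placeholder"
#   export GEMINI_API_KEY="AIza-placeholder"
#   export OPENROUTER_API_KEY="sk-or-placeholder"
#   export GROQ_API_KEY="gsk_placeholder"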
# --- Model Configurations (Specific April 2025 - Updated Order) ---
# Model priority:
# 1. Gemini 2.5 Flash Preview (using the specific dated variant)
# 2. Gemini 2.5 Pro Exp
# 3. Gemini 2.0 Flash
# 4. OpenRouter DeepSeek V3 Free
# 5. Groq Llama 4 Scout (now last)
GEMINI_FLASH_PREVIEW_MODEL = os.environ.get("GEMINI_FLASH_PREVIEW_MODEL", "gemini-2.5-flash-preview-04-17")  # New model, dated variant
GEMINI_PRO_EXP_MODEL = os.environ.get("GEMINI_PRO_EXP_MODEL", "gemini-2.5-pro-exp-03-25")
GEMINI_FLASH_MODEL = os.environ.get("GEMINI_FLASH_MODEL", "gemini-2.0-flash-001")  # Original Flash model
OPENROUTER_DEEPSEEK_MODEL = os.environ.get("OPENROUTER_DEEPSEEK_MODEL", "deepseek/deepseek-chat-v3-0324:free")  # Specific DeepSeek model
GROQ_LLAMA4_MODEL = os.environ.get("GROQ_LLAMA4_MODEL", "meta-llama/llama-4-scout-17b-16e-instruct")  # Specific Llama 4 model (now last)
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "1s7eXiaukVuOr4Ueg")  # YT Fallback 1 (default transcript actor)
APIFY_STRUCTURED_YT_ACTOR_ID = "gpjTCWkGZS1lHc9pR"  # YT Fallback 2 (structured transcript extractor)
APIFY_CRAWLER_ACTOR_ID = "aYG0l9s7dbB7j3gbS"  # Web Scrape Fallback 4
APIFY_TEXT_SCRAPER_ACTOR_ID = "2gbQiRSpJIIag2FdR"  # Web Scrape Fallback 5
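# The model names (and the default YT actor ID) can be overridden via environment
# variables without code changes; for example (illustrative only):
#   export GEMINI_FLASH_PREVIEW_MODEL="gemini-2.5-flash-preview-04-17"
#   export GROQ_LLAMA4_MODEL="meta-llama/llama-4-scout-17b-16e-instruct"
# The other three Apify actor IDs are hard-coded at import time.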
# --- Configuration Checks ---
if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
# Summarizer Availability Checks
_groq_enabled = _groq_sdk_available and bool(GROQ_API_KEY)
_gemini_api_enabled = _gemini_sdk_available and bool(GEMINI_API_KEY)
_openrouter_fallback_enabled = bool(OPENROUTER_API_KEY)
# Check Gemini first, as it is now the primary summarizer
if not _gemini_api_enabled:
    if not _gemini_sdk_available: logger.error("❌ ERROR: google-generativeai library missing. Primary summarization (Gemini) disabled.")
    elif not GEMINI_API_KEY: logger.error("❌ ERROR: GEMINI_API_KEY not found. Primary summarization (Gemini) will fail.")
if not _openrouter_fallback_enabled:
    logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Mid-tier fallback summarization (DeepSeek) will fail.")
if not _groq_enabled:
    if not _groq_sdk_available: logger.warning("⚠️ WARNING: groq library missing. Groq (Llama 4) disabled.")
    elif not GROQ_API_KEY: logger.warning("⚠️ WARNING: GROQ_API_KEY not found. Final fallback summarization (Groq Llama 4) will fail.")
if not _gemini_api_enabled and not _openrouter_fallback_enabled and not _groq_enabled:
    logger.critical("❌ FATAL: No summarization models are configured or enabled. Bot cannot function.")
elif not _gemini_api_enabled:
    logger.warning("⚠️ Primary summarizers (Gemini) are disabled. Will attempt DeepSeek via OpenRouter.")
# Scraper Availability Checks (Warnings only)
if not RAPIDAPI_KEY: logger.warning("⚠️ WARNING: RAPIDAPI_KEY not found. RapidAPI scraping fallbacks (2 & 3) will be unavailable.")
if not APIFY_API_TOKEN: logger.warning("⚠️ WARNING: APIFY_API_TOKEN not found. YT transcript fallbacks (1 & 2) and website scraping fallbacks (4 & 5) will be unavailable.")
if not URLTOTEXT_API_KEY: logger.warning("Optional secret 'URLTOTEXT_API_KEY' not found. Web scraping fallback 1 unavailable.")
if not SUPADATA_API_KEY: logger.warning("Optional secret 'SUPADATA_API_KEY' not found. YT transcript fallback 3 unavailable.")
if not WEBHOOK_SECRET: logger.info("Optional secret 'WEBHOOK_SECRET' not found. Webhook security disabled.")
logger.info("Secret loading and configuration check finished.")
# --- Log summarizers in priority order ---
logger.info(f"Summarizer 1 (Gemini Flash Preview): {GEMINI_FLASH_PREVIEW_MODEL if _gemini_api_enabled else 'DISABLED'} (Using specific date variant)")
logger.info(f"Summarizer 2 (Gemini Pro Exp): {GEMINI_PRO_EXP_MODEL if _gemini_api_enabled else 'DISABLED'}")
logger.info(f"Summarizer 3 (Gemini Flash): {GEMINI_FLASH_MODEL if _gemini_api_enabled else 'DISABLED'}")
logger.info(f"Summarizer 4 (OpenRouter): {OPENROUTER_DEEPSEEK_MODEL if _openrouter_fallback_enabled else 'DISABLED'}")
logger.info(f"Summarizer 5 (Groq): {GROQ_LLAMA4_MODEL if _groq_enabled else 'DISABLED'}")
# --- End Summarizer Logging ---
logger.info(f"Using Apify Actor (YT Fallback 1 - Default): {APIFY_ACTOR_ID}")
logger.info(f"Using Apify Actor (YT Fallback 2 - Structured): {APIFY_STRUCTURED_YT_ACTOR_ID}")
logger.info(f"Using Apify Actor (Web Scrape Fallback 4): {APIFY_CRAWLER_ACTOR_ID}")
logger.info(f"Using Apify Actor (Web Scrape Fallback 5): {APIFY_TEXT_SCRAPER_ACTOR_ID}")
# Flags for scraper key existence
_apify_token_exists = bool(APIFY_API_TOKEN)
_urltotext_key_exists = bool(URLTOTEXT_API_KEY)
_rapidapi_key_exists = bool(RAPIDAPI_KEY)
# --- Configure APIs ---
if _gemini_api_enabled:
    try: genai.configure(api_key=GEMINI_API_KEY); logger.info("Google GenAI client configured successfully.")
    except Exception as e: logger.error(f"Failed to configure Google GenAI client: {e}"); _gemini_api_enabled = False
# Groq client is initialized per-request in the _call_groq function
# --- Bot Operation Error Wrapper ---
async def retry_bot_operation(func, *args, **kwargs):
    try: return await func(*args, **kwargs)
    except BadRequest as e:
        ignore_errors = [ "message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user", ]
        if any(err in str(e).lower() for err in ignore_errors): logger.warning(f"Ignoring non-critical BadRequest: {e}"); return None
        logger.error(f"Potentially critical BadRequest: {e}"); raise
    except TelegramError as e: logger.warning(f"TelegramError (will retry if applicable): {e}"); raise
    except Exception as e: logger.error(f"Unexpected error during bot operation: {e}", exc_info=True); raise
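# Typical usage (illustrative): wrap any outgoing Telegram call so known-benign
# BadRequest errors are swallowed while everything else propagates to the caller:
#   await retry_bot_operation(
#       context.bot.send_message, chat_id=chat_id, text="Working on it..."
#   )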
# --- Helper Functions ---
def is_youtube_url(url):
    youtube_regex = re.compile( r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/' r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE)
    match = youtube_regex.search(url); logger.debug(f"is_youtube_url '{url}': {bool(match)}"); return bool(match)
def extract_youtube_id(url):
    youtube_regex = re.compile( r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/' r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE)
    match = youtube_regex.search(url)
    if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id
    else: logger.warning(f"Could not extract YT ID from {url}"); return None
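# Illustrative behaviour of the two helpers above, implied by the shared regex:
#   is_youtube_url("https://youtu.be/dQw4w9WgXcQ")                      -> True
#   extract_youtube_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")   -> "dQw4w9WgXcQ"
#   extract_youtube_id("https://example.com/page")                      -> None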
# --- Content Fetching Functions ---
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
    """Fallback YT 3: Fetches YouTube transcript using Supadata API."""
    if not video_id: logger.error("[Supadata] No video_id provided"); return None
    if not api_key: logger.error("[Supadata] API key missing."); return None
    logger.info(f"[YT Fallback 3] Attempting fetch for video ID: {video_id} via Supadata")
    api_endpoint = "https://api.supadata.ai/v1/youtube/transcript"
    params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key}
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(api_endpoint, headers=headers, params=params)
            logger.debug(f"[Supadata] Status code {response.status_code} for {video_id}")
            if response.status_code == 200:
                try:
                    data = response.json() if response.text else None
                    content = None
                    if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
                    if not content and response.text: content = response.text
                    if content and isinstance(content, str):
                        logger.info(f"[Supadata] Success (Fallback 3) for {video_id}. Length: {len(content)}")
                        return content.strip()
                    else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
                except json.JSONDecodeError: logger.warning(f"[Supadata] Received 200 but failed JSON decode for {video_id}. Using raw text if available. Response: {response.text[:200]}"); return response.text.strip() if response.text else None
                except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None
            elif response.status_code in [401, 403]: logger.error(f"[Supadata] Auth error ({response.status_code}). Check API key."); return None
            elif response.status_code == 404: logger.warning(f"[Supadata] Not found (404) for {video_id}."); return None
            else: logger.error(f"[Supadata] Unexpected status {response.status_code} for {video_id}. Resp: {response.text[:200]}"); return None
    except httpx.TimeoutException: logger.error(f"[Supadata] Timeout connecting for {video_id}"); return None
    except httpx.RequestError as e:
        if "CERTIFICATE_VERIFY_FAILED" in str(e): logger.error(f"[Supadata] SSL Cert Verify Failed for {video_id}: {e}")
        else: logger.error(f"[Supadata] Request error for {video_id}: {e}")
        return None
    except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
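# The Supadata call above is roughly equivalent to this request (illustrative;
# names taken from the code, the key is a placeholder):
#   GET https://api.supadata.ai/v1/youtube/transcript?videoId=<VIDEO_ID>&format=text
#   X-API-Key: <SUPADATA_API_KEY>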
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
    """Fallback YT 1: Fetches YouTube transcript using the default Apify Actor via the generic runner."""
    global APIFY_ACTOR_ID
    # The actor-specific run_input logic is handled within _run_apify_actor_for_web_content
    # when it detects that actor_id matches APIFY_ACTOR_ID.
    logger.debug(f"[get_transcript_via_apify - Fallback 1] Calling generic runner for URL: {video_url}")
    return await _run_apify_actor_for_web_content(
        url=video_url,  # Pass video_url as the 'url' parameter
        api_token=api_token,
        actor_id=APIFY_ACTOR_ID,
        actor_name="Apify YT Default (Fallback 1)"
    )
async def get_transcript_via_apify_structured_extractor(video_url: str, api_token: str) -> Optional[str]:
    """Fallback YT 2: Fetches YouTube transcript using the Structured Extractor Apify Actor."""
    global APIFY_STRUCTURED_YT_ACTOR_ID
    if not video_url: logger.error("[Apify Structured YT] No video_url provided"); return None
    if not api_token: logger.error("[Apify Structured YT] API token missing."); return None
    logger.info(f"[YT Fallback 2] Attempting fetch for URL: {video_url} (Actor: {APIFY_STRUCTURED_YT_ACTOR_ID})")
    # Use the generic helper function. We assume its standard input format is
    # sufficient and that its existing parsing logic correctly identifies this
    # actor's output: the helper already parses various text/content/captions
    # fields, including the list structures for captions described in the
    # actor's docs.
    return await _run_apify_actor_for_web_content(
        url=video_url,
        api_token=api_token,
        actor_id=APIFY_STRUCTURED_YT_ACTOR_ID,
        actor_name="Apify Structured YT (Fallback 2)"  # Specific name for logging
    )
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
    """
    Fetches a YouTube transcript using multiple fallback methods, in this order:
    1. Apify Default Actor (1s7eXiaukVuOr4Ueg)
    2. Apify Structured Actor (gpjTCWkGZS1lHc9pR)
    3. Supadata API
    """
    global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
    if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
    transcript_text: Optional[str] = None
    # --- Primary Method: REMOVED (youtube-transcript-api) ---
    # --- Fallback 1: Apify Default YT Actor (1s7eXiaukVuOr4Ueg) ---
    if transcript_text is None:
        logger.info("[Fallback YT 1] Trying Apify REST API (Default YT Actor)...")
        if _apify_token_exists:
            transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
            if transcript_text:
                logger.info(f"[Fallback YT 1] Success via Apify Default YT Actor for {video_url}")
                return transcript_text  # Return on success
            else:
                logger.warning(f"[Fallback YT 1] Apify Default YT Actor failed or no content for {video_url}.")
        else:
            logger.warning("[Fallback YT 1] Apify API token unavailable. Skipping Default YT Actor.")
    # --- Fallback 2: Apify Structured Transcript Extractor (gpjTCWkGZS1lHc9pR) ---
    if transcript_text is None:
        logger.info("[Fallback YT 2] Trying Apify Structured Transcript Extractor...")
        if _apify_token_exists:
            transcript_text = await get_transcript_via_apify_structured_extractor(video_url, APIFY_API_TOKEN)
            if transcript_text:
                logger.info(f"[Fallback YT 2] Success via Apify Structured Extractor for {video_url}")
                return transcript_text  # Return on success
            else:
                logger.warning(f"[Fallback YT 2] Apify Structured Extractor failed or no content for {video_url}.")
        else:
            logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping Apify Structured Extractor.")
    # --- Fallback 3: Supadata API ---
    if transcript_text is None:
        logger.info("[Fallback YT 3] Trying Supadata API...")
        if SUPADATA_API_KEY:
            transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
            if transcript_text:
                logger.info(f"[Fallback YT 3] Success via Supadata for {video_id}")
                return transcript_text  # Return on success
            else:
                logger.warning(f"[Fallback YT 3] Supadata failed or no content for {video_id}.")
        else:
            logger.warning("[Fallback YT 3] Supadata API key unavailable. Skipping.")
    # --- Final Outcome ---
    if transcript_text is None:
        logger.error(f"All fallback methods failed for YT transcript: {video_id}")
        return None  # Explicitly return None if all failed
    # Only reachable if a fallback succeeded without returning early (should not happen).
    return transcript_text
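# Usage sketch (illustrative): callers supply both the ID and the full URL, since
# the Apify fallbacks take the URL while Supadata takes the ID:
#   transcript = await get_youtube_transcript(
#       "dQw4w9WgXcQ", "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
#   )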
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
    """Directly fetches URL content using httpx."""
    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
            logger.debug(f"[Web Scrape Direct] Sending GET request to {url}")
            response = await client.get(url)
            logger.debug(f"[Web Scrape Direct] Received response {response.status_code} from {url}")
            response.raise_for_status()
            content_type = response.headers.get('content-type', '').lower()
            if 'html' not in content_type: logger.warning(f"[Web Scrape Direct] Non-HTML content type received from {url}: {content_type}"); return None
            try: return response.text
            except Exception as e: logger.error(f"[Web Scrape Direct] Error decoding response text for {url}: {e}"); return None
    except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Direct] HTTP error {e.response.status_code} fetching {url}: {e}")
    except httpx.TimeoutException: logger.error(f"[Web Scrape Direct] Timeout error fetching {url}")
    except httpx.TooManyRedirects: logger.error(f"[Web Scrape Direct] Too many redirects fetching {url}")
    except httpx.RequestError as e: logger.error(f"[Web Scrape Direct] Request error fetching {url}: {e}")
    except Exception as e: logger.error(f"[Web Scrape Direct] Unexpected error fetching {url}: {e}", exc_info=True)
    return None
async def get_website_content(url: str) -> Optional[str]:
    """Primary method: Fetches HTML directly and parses with BeautifulSoup."""
    if not url: logger.error("[Web Scrape Primary] No URL provided"); return None
    logger.info(f"[Web Scrape Primary] Attempting direct fetch and parse for: {url}")
    html_content = await fetch_url_content_for_scrape(url)
    if not html_content: logger.warning(f"[Web Scrape Primary] Direct fetch failed for {url}."); return None
    try:
        def parse_html(content: str) -> Optional[str]:
            try:
                soup = BeautifulSoup(content, DEFAULT_PARSER)
                for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]): element.extract()
                main_content = soup.find('main') or soup.find('article') or soup.find(role='main') or soup.find(id=re.compile(r'content|main|body', re.I)) or soup.find(class_=re.compile(r'content|main|body|article|post', re.I))
                target_element = main_content if main_content else soup.body
                if not target_element: logger.warning(f"[Web Scrape Primary Parse] Could not find body or main content container for {url}"); return None
                lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
                text = " ".join(lines)
                if not text or len(text) < 50: logger.warning(f"[Web Scrape Primary Parse] Extracted text seems too short or empty after cleaning for {url}. Length: {len(text)}"); return None
                return text
            except Exception as parse_e: logger.error(f"[Web Scrape Primary Parse] Error during BeautifulSoup parsing for {url}: {parse_e}", exc_info=False); return None
        text_content = await asyncio.to_thread(parse_html, html_content)
        if text_content: logger.info(f"[Web Scrape Primary] Success via direct fetch & parse for {url} (final len: {len(text_content)})"); return text_content
        else: logger.warning(f"[Web Scrape Primary] Parsing failed or yielded no content for {url}."); return None
    except Exception as e: logger.error(f"[Web Scrape Primary] Unexpected error during parsing process for {url}: {e}", exc_info=True); return None
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
    """Fallback 1: Fetches website content using urltotext.com API."""
    if not url: logger.error("[Web Scrape Fallback 1] No URL"); return None
    if not api_key: logger.error("[Web Scrape Fallback 1] urltotext.com API key missing."); return None
    logger.info(f"[Web Scrape Fallback 1] Attempting fetch for: {url} using urltotext.com API")
    api_endpoint = "https://urltotext.com/api/v1/urltotext/"
    payload = { "url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False }
    headers = { "Authorization": f"Token {api_key}", "Content-Type": "application/json" }
    try:
        async with httpx.AsyncClient(timeout=45.0) as client:
            logger.debug(f"[Web Scrape Fallback 1] Sending request to urltotext.com API for {url}")
            response = await client.post(api_endpoint, headers=headers, json=payload)
            logger.debug(f"[Web Scrape Fallback 1] Received status {response.status_code} from urltotext.com API for {url}")
            if response.status_code == 200:
                try:
                    data = response.json()
                    content = data.get("data", {}).get("content"); credits = data.get("credits_used", "N/A"); warning = data.get("data", {}).get("warning")
                    if warning: logger.warning(f"[Web Scrape Fallback 1] urltotext.com API Warning for {url}: {warning}")
                    if content and isinstance(content, str): logger.info(f"[Web Scrape Fallback 1] Success via urltotext.com API for {url}. Len: {len(content)}. Credits: {credits}"); return content.strip()
                    else: logger.warning(f"[Web Scrape Fallback 1] urltotext.com API success but content empty for {url}. Resp: {data}"); return None
                except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 1] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None
                except Exception as e: logger.error(f"[Web Scrape Fallback 1] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None
            elif response.status_code in [400, 401, 402, 403, 422, 500]: logger.error(f"[Web Scrape Fallback 1] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
            else: logger.error(f"[Web Scrape Fallback 1] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
    except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 1] Timeout connecting to urltotext.com API for {url}"); return None
    except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 1] Request error connecting to urltotext.com API for {url}: {e}"); return None
    except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
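# Based on the parsing above, a successful urltotext.com response is assumed to
# look roughly like this (illustrative shape only, not captured output):
#   {"data": {"content": "...extracted text...", "warning": null},
#    "credits_used": 1}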
async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
    """Fallback 2: Fetches website content using Scraper's Proxy Standard endpoint via RapidAPI."""
    if not url: logger.error("[Web Scrape Fallback 2] No URL provided"); return None
    if not api_key: logger.error("[Web Scrape Fallback 2] RapidAPI key missing."); return None
    logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy Standard API (POST)")
    api_host = "scrapers-proxy2.p.rapidapi.com"
    # --- Endpoint Construction ---
    # Uses the POST method and the /standard path, with 'url' and
    # 'content_type=application/json' as query parameters.
    encoded_url = urllib.parse.quote(url, safe='')
    content_type_param = urllib.parse.quote('application/json', safe='')
    api_endpoint = f"https://{api_host}/standard?url={encoded_url}&content_type={content_type_param}"
    # --- Headers ---
    headers = {
        "Content-Type": "application/json",  # As per the provider's cURL example
        "x-rapidapi-host": api_host,
        "x-rapidapi-key": api_key,
        # 'accept-encoding: gzip' omitted; httpx handles this automatically
    }
    # --- Payload (empty JSON object as per the cURL example; may not be strictly needed, but safer) ---
    payload = {}  # Use {} instead of the example string for a generic POST
    try:
        async with httpx.AsyncClient(timeout=40.0) as client:
            logger.debug(f"[Web Scrape Fallback 2] Sending POST request to {api_host}/standard for {url}")
            response = await client.post(api_endpoint, headers=headers, json=payload)
            logger.debug(f"[Web Scrape Fallback 2] Received status {response.status_code} from {api_host} for {url}")
            # --- Response Handling (adjust if the /standard format differs) ---
            if response.status_code == 200:
                try:
                    data = response.json()
                    # The /standard endpoint is assumed to return 'content' (or 'text') and 'title'
                    content = data.get("content") or data.get("text")
                    title = data.get("title")
                    extracted_text = ""
                    if title and isinstance(title, str): extracted_text += title.strip() + ". "
                    if content and isinstance(content, str): extracted_text += content.strip()
                    if extracted_text and len(extracted_text) > 30:
                        logger.info(f"[Web Scrape Fallback 2] Success via Scraper's Proxy Standard API for {url}. Len: {len(extracted_text)}"); return extracted_text
                    else:
                        # Log if parsing failed even on 200
                        keys_info = f"Keys: {list(data.keys())}" if isinstance(data, dict) else "Non-dict response"
                        logger.warning(f"[Web Scrape Fallback 2] Scraper's Proxy Standard API success (200) but content/title seems empty or too short for {url}. {keys_info}. Length: {len(extracted_text)}"); return None
                except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 2] Failed JSON decode Scraper's Proxy Standard API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
                except Exception as e: logger.error(f"[Web Scrape Fallback 2] Error processing Scraper's Proxy Standard API success response for {url}: {e}", exc_info=True); return None
            # --- Error Handling ---
            elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 2] Auth error (401) with {api_host}. Check RapidAPI key."); return None
            elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 2] Forbidden (403) from {api_host}. Check API subscription/limits."); return None
            elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 2] Rate Limit (429) from {api_host}."); return None
            elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 2] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
            else: logger.error(f"[Web Scrape Fallback 2] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
    except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 2] Timeout connecting to {api_host} API for {url}"); return None
    except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 2] Request error connecting to {api_host} API for {url}: {e}"); return None
    except Exception as e: logger.error(f"[Web Scrape Fallback 2] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
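# Equivalent request for the call above (illustrative; derived directly from the
# code, with the key as a placeholder):
#   POST https://scrapers-proxy2.p.rapidapi.com/standard?url=<ENCODED_URL>&content_type=application%2Fjson
#   Content-Type: application/json
#   x-rapidapi-key: <RAPIDAPI_KEY>
#   body: {}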
async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]:
    """Fallback 3: Fetches website content using AI Web Scraper via RapidAPI."""
    if not url: logger.error("[Web Scrape Fallback 3] No URL provided"); return None
    if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
    logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using AI Web Scraper API")
    api_host = "ai-web-scraper.p.rapidapi.com"
    api_endpoint = f"https://{api_host}/extract_content/v1"
    headers = { 'Content-Type': 'application/x-www-form-urlencoded', 'x-rapidapi-host': api_host, 'x-rapidapi-key': api_key }
    payload = {'url': url}
    try:
        async with httpx.AsyncClient(timeout=45.0) as client:
            logger.debug(f"[Web Scrape Fallback 3] Sending POST request to {api_host} for {url}")
            response = await client.post(api_endpoint, headers=headers, data=payload)
            logger.debug(f"[Web Scrape Fallback 3] Received status {response.status_code} from {api_host} for {url}")
            if response.status_code == 200:
                try:
                    data = response.json(); content = None
                    if isinstance(data, dict): content = data.get("content") or data.get("text") or data.get("extracted_text") or data.get("result")
                    elif isinstance(data, str): content = data
                    if content and isinstance(content, str) and len(content) > 30: logger.info(f"[Web Scrape Fallback 3] Success via AI Web Scraper API for {url}. Len: {len(content)}"); return content.strip()
                    else:
                        keys_info = f"Keys: {list(data.keys())}" if isinstance(data, dict) else f"Type: {type(data)}"; content_len = len(content) if content and isinstance(content, str) else 0
                        logger.warning(f"[Web Scrape Fallback 3] AI Web Scraper API success but content empty/short/invalid format for {url}. {keys_info}. Length: {content_len}"); return None
                except json.JSONDecodeError:
                    raw_text = response.text
                    if raw_text and len(raw_text) > 30: logger.warning(f"[Web Scrape Fallback 3] Failed JSON decode for AI Web Scraper, but found raw text content. Status:{response.status_code}. Using raw text. Len: {len(raw_text)}"); return raw_text.strip()
                    else: logger.error(f"[Web Scrape Fallback 3] Failed JSON decode AI Web Scraper API for {url}. Status:{response.status_code}. Resp:{raw_text[:500]}"); return None
                except Exception as e: logger.error(f"[Web Scrape Fallback 3] Error processing AI Web Scraper API success response for {url}: {e}", exc_info=True); return None
            elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 3] Auth error (401) with {api_host}. Check RapidAPI key."); return None
            elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check API subscription/limits."); return None
            elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 3] Rate Limit (429) from {api_host}."); return None
            elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 3] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
            else: logger.error(f"[Web Scrape Fallback 3] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
    except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 3] Timeout connecting to {api_host} API for {url}"); return None
    except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 3] Request error connecting to {api_host} API for {url}: {e}"); return None
    except Exception as e: logger.error(f"[Web Scrape Fallback 3] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
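# Equivalent request for the AI Web Scraper call above (illustrative; the key is
# a placeholder):
#   POST https://ai-web-scraper.p.rapidapi.com/extract_content/v1
#   Content-Type: application/x-www-form-urlencoded
#   x-rapidapi-key: <RAPIDAPI_KEY>
#   body: url=<TARGET_URL>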
async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
    """Generic function to run an Apify actor and get text content."""
    if not url: logger.error(f"[{actor_name}] No URL provided"); return None
    if not api_token: logger.error(f"[{actor_name}] API token missing."); return None
    logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
    sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
    params = {"token": api_token}
    log_prefix = f"[{actor_name}]"  # Use actor_name as the logging prefix
    # --- Define input based on actor ID ---
    run_input: Dict[str, Any] = {}
    if actor_id == APIFY_ACTOR_ID:
        # Input specific to the default YT actor (1s7eXiaukVuOr4Ueg) - requires a LIST
        run_input = {
            "urls": [url],  # LIST format needed here
            "maxRetries": 5,
            "channelHandleBoolean": False,
            "channelNameBoolean": False,
            "datePublishedBoolean": False,
            "relativeDateTextBoolean": False
        }
        logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
    elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
        # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - requires a STRING,
        # based on the actor's error message "Field input.urls must be string"
        run_input = {
            "urls": url  # STRING format needed here, not a list
        }
        logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
    elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
        # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - likely expects a LIST
        run_input = {
            "urls": [url]  # Assume the standard LIST format here
        }
        logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
    elif actor_id == APIFY_CRAWLER_ACTOR_ID:
        # Input specific to Website Content Crawler (aYG0l9s7dbB7j3gbS) - uses startUrls
        run_input = {
            "startUrls": [{"url": url}],  # Different input structure entirely
            "maxCrawlPages": 1,
            "crawlerType": "playwright:firefox"  # Or adjust as needed
        }
        logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
    else:
        # Fallback default input if the actor ID doesn't match a known one.
        # The simple {"urls": [url]} format seems safest for generic text/content extractors.
        run_input = {"urls": [url]}  # Default to LIST
        logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
    headers = {"Content-Type": "application/json"}
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}")
            response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
            logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
            if response.status_code in [200, 201]:
                if response.status_code == 201:
                    logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
                try:
                    results = response.json(); content = None
                    if isinstance(results, list) and len(results) > 0:
                        item = results[0]
                        # Optional: re-enable for deep debugging if needed
                        # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
                        content = None  # Reset content
                        # --- Parsing logic (handles output from various actors) ---
                        if "text" in item and isinstance(item["text"], str):
                            logger.info(f"{log_prefix} Found text content in 'text' field.")
                            content = item["text"]
                        elif "content" in item and isinstance(item["content"], str):
                            logger.info(f"{log_prefix} Found text content in 'content' field.")
                            content = item["content"]
                        elif "markdown" in item and isinstance(item["markdown"], str):
                            logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                            content = item["markdown"]
                        elif "captions" in item and isinstance(item["captions"], str):
                            # This case can still occur if the actor sometimes returns a string
                            logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                            content = item["captions"]
                        # --- List handling for 'captions', based on the actor's documented examples ---
                        elif "captions" in item and isinstance(item["captions"], list):
                            logger.info(f"{log_prefix} Found 'captions' field as a list. Processing based on documentation examples...")
                            transcript_parts = []
                            if not item["captions"]:  # Handle empty list case
                                logger.warning(f"{log_prefix} 'captions' field is an empty list.")
                            else:
                                # Check the type of the *first* element to decide the parsing strategy
                                first_element = item["captions"][0]
                                if isinstance(first_element, str):
                                    # Assume a list of strings (Example 1 in the docs)
                                    logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
                                    transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
                                elif isinstance(first_element, dict) and "text" in first_element:
                                    # Assume a list of dictionaries (Example 2 in the docs)
                                    logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
                                    transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
                                else:
                                    logger.warning(f"{log_prefix} 'captions' list contains unexpected element types (first element type: {type(first_element)}). Cannot parse.")
                            if transcript_parts:
                                content = " ".join(transcript_parts).strip()
                                logger.info(f"{log_prefix} Successfully extracted transcript from list in 'captions'. Combined length: {len(content)}")
                            else:
                                logger.warning(f"{log_prefix} Could not extract usable text from 'captions' list structure.")
                        elif "html" in item and isinstance(item["html"], str):
                            logger.warning(f"{log_prefix} No direct text/markdown/captions found, attempting to parse 'html' from result.")
                            def parse_html_sync(html_str):
                                try:
                                    soup = BeautifulSoup(html_str, DEFAULT_PARSER)
                                    return " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                                except Exception as e:
                                    logger.error(f"{log_prefix} Error parsing HTML in thread: {e}")
                                    return None
                            content = await asyncio.to_thread(parse_html_sync, item["html"])
                        # --- Final content check ---
                        if content and isinstance(content, str) and len(content) > 30:
                            logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
                            return content.strip()
                        else:
                            # Log failure after trying all parsing methods
                            content_len = len(content) if content and isinstance(content, str) else 0
                            item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
                            logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
                            return None  # Return None if no valid content found
                    else:
                        # Handle an empty dataset list '[]' or a non-list response
                        logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
                        return None
                except json.JSONDecodeError:
                    # Check whether the raw text looks like a transcript if JSON decoding fails
                    raw_text = response.text
                    if raw_text and len(raw_text) > 50 and (' ' in raw_text):  # Basic check for textual content
                        logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
                        return raw_text.strip()
                    else:
                        logger.error(f"{log_prefix} Failed JSON decode and no usable raw text found. Status:{response.status_code}. Resp:{raw_text[:200]}")
                        return None
                except Exception as e:
                    logger.error(f"{log_prefix} Error processing success response (Status {response.status_code}) for {url}: {e}", exc_info=True)
                    return None
            # Error handling for the API call itself
            elif response.status_code == 400:
                # Log the specific error message from the API response if available
                error_msg = response.text[:200]  # Default
                try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
                except Exception: pass
                logger.error(f"{log_prefix} Bad Request (400) for {url}. Check run_input. API Msg: '{error_msg}'"); return None
            elif response.status_code == 401: logger.error(f"{log_prefix} Auth error (401). Check token."); return None
            elif response.status_code == 404: logger.error(f"{log_prefix} Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
            else:
                logger.error(f"{log_prefix} Unexpected status {response.status_code} for {url}. Resp:{response.text[:200]}")
                return None
    # Error handling for network/client issues
    except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None  # Should be caught by the status code checks, but good practice
    except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
    except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
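# The run-sync call above amounts to a single POST per actor run. Illustrative
# equivalent for the default YT actor, trimmed to the required field (the token
# is a placeholder; extra options such as maxRetries are set in run_input above):
#   curl -X POST \
#     "https://api.apify.com/v2/acts/1s7eXiaukVuOr4Ueg/run-sync-get-dataset-items?token=<APIFY_TOKEN>" \
#     -H "Content-Type: application/json" \
#     -d '{"urls": ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]}'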
async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
    """Fallback 4: Fetches website content using Apify Website Content Crawler."""
    return await _run_apify_actor_for_web_content( url=url, api_token=api_token, actor_id=APIFY_CRAWLER_ACTOR_ID, actor_name="Apify Crawler" )
async def get_website_content_via_apify_text_scraper(url: str, api_token: str) -> Optional[str]:
    """Fallback 5: Fetches website content using Apify Text Scraper Free."""
    return await _run_apify_actor_for_web_content( url=url, api_token=api_token, actor_id=APIFY_TEXT_SCRAPER_ACTOR_ID, actor_name="Apify Text Scraper" )
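# Summary of the web-scrape chain as wired above (derived from the docstrings):
# direct fetch + BeautifulSoup parse (primary), urltotext.com API (fallback 1),
# Scraper's Proxy via RapidAPI (fallback 2), AI Web Scraper via RapidAPI
# (fallback 3), Apify Website Content Crawler (fallback 4), and Apify Text
# Scraper Free (fallback 5).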
# --- Summarization Functions (Using Specific April 2025 Models) ---
# --- Prompts (Defined once, used by all models) ---
PROMPT_PARAGRAPH = (
    "You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n"
    "• Clear and simple language suitable for someone unfamiliar with the topic.\n"
    "• Uses British English spellings throughout.\n"
    "• Straightforward and understandable vocabulary; avoid complex terms.\n"
    "• Presented as ONE SINGLE PARAGRAPH.\n"
    "• No more than 85 words maximum; but does not have to be exactly 85.\n"
    "• Considers the entire text content equally.\n"
    "• Uses semicolons (;) instead of em dashes (– or —).\n"
    "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
    "Here is the text to summarise:"
)
PROMPT_POINTS = (
    "You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this format:\n\n"
    "• For each distinct topic or section identified in the text, create a heading.\n"
    "• Each heading MUST be plain text without any formatting (e.g., Section Title).\n"
    "• Immediately following each heading, list the key points as a bulleted list.\n"
    "• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n"
    "• The text within each bullet point should NOT contain any bold formatting.\n"
    "• IMPORTANT: Never use bold formatting (double asterisks) within the text of the bullet points themselves.\n"
    "• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n"
    "• Use British English spellings throughout.\n"
    "• Avoid overly complex or advanced vocabulary.\n"
    "• Keep bullet points concise.\n"
    "• Ensure the entire summary takes no more than two minutes to read.\n"
    "• Consider the entire text's content, not just the beginning or a few topics.\n"
    "• Use semicolons (;) instead of em dashes (– or —).\n"
    "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
    "Here is the text to summarise:"
)
async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
    """Internal function to call the Groq API (final fallback - Llama 4 Scout). Returns (summary, error_message)."""
    global GROQ_API_KEY, GROQ_LLAMA4_MODEL, _groq_enabled
    if not _groq_enabled:
        logger.error("[Groq] Called but is disabled.")
        return None, f"Error: AI service (Groq {GROQ_LLAMA4_MODEL}) not configured/available."
    logger.info(f"[Groq] Generating {summary_type} summary using {GROQ_LLAMA4_MODEL}. Input length: {len(text)}")
    prompt = PROMPT_PARAGRAPH if summary_type == "paragraph" else PROMPT_POINTS
    MAX_INPUT_LENGTH_GROQ = 40000  # Conservative character cap; adjust to the model's context window
    if len(text) > MAX_INPUT_LENGTH_GROQ:
        logger.warning(f"[Groq] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_GROQ}). Truncating.")
        text = text[:MAX_INPUT_LENGTH_GROQ] + "... (Content truncated)"
    full_prompt = f"{prompt}\n\n{text}"
    try:
        groq_client = Groq( api_key=GROQ_API_KEY, timeout=httpx.Timeout(120.0, connect=10.0) )
        logger.info(f"[Groq] Sending request to Groq ({GROQ_LLAMA4_MODEL})...")
        # NOTE: chat.completions.create is synchronous on the Groq client; do not await it.
        chat_completion = groq_client.chat.completions.create(
            messages=[ { "role": "user", "content": full_prompt } ],
            model=GROQ_LLAMA4_MODEL,
            temperature=0.7,  # Adjust from Groq default of 1 if needed
            max_tokens=2048,  # Adjust from Groq default of 1024 if needed
            top_p=1, stream=False, stop=None,
        )
        logger.info("[Groq] Received response from Groq.")
        if chat_completion.choices and chat_completion.choices[0].message and chat_completion.choices[0].message.content:
            summary = chat_completion.choices[0].message.content
            finish_reason = chat_completion.choices[0].finish_reason
            logger.info(f"[Groq] Success generating summary. Finish Reason: {finish_reason}. Output len: {len(summary)}")
            return summary.strip(), None
        else:
            logger.warning(f"[Groq] Groq response structure unexpected or content empty. Response: {chat_completion.model_dump_json(indent=2)}")
            finish_reason = chat_completion.choices[0].finish_reason if chat_completion.choices else 'N/A'
            return None, f"Sorry, the AI model ({GROQ_LLAMA4_MODEL}) provided an empty or invalid response (Finish Reason: {finish_reason})."
    except GroqError as ge:
        logger.error(f"[Groq] Groq API error: {ge.status_code} - {ge.message}", exc_info=False)
        error_msg = f"Sorry, the AI service ({GROQ_LLAMA4_MODEL}) failed. API Error: {ge.status_code}."
        if ge.status_code == 401: error_msg = f"Error: AI service (Groq {GROQ_LLAMA4_MODEL}) API key is invalid."
        elif ge.status_code == 429: error_msg = f"Sorry, the AI model ({GROQ_LLAMA4_MODEL}) is busy (Rate Limit). Try again."
        return None, error_msg
    except httpx.TimeoutException as te:
        logger.error(f"[Groq] Timeout during Groq API call: {te}")
        return None, f"Sorry, the AI service ({GROQ_LLAMA4_MODEL}) timed out."
    except httpx.RequestError as req_err:  # Renamed from 're' to avoid shadowing the re module
        logger.error(f"[Groq] Network error during Groq API call: {req_err}")
        return None, f"Sorry, couldn't connect to the AI service ({GROQ_LLAMA4_MODEL})."
    except Exception as e:
        logger.error(f"[Groq] Unexpected error during Groq API call: {e}", exc_info=True)
        # Don't expose internal errors; give a generic message. Check defensively for
        # the 'await on sync call' TypeError, which should no longer occur now that
        # the create() call is not awaited.
        if isinstance(e, TypeError) and "can't be used in 'await' expression" in str(e):
            logger.error("[Groq] Encountered unexpected await TypeError again.")
            return None, f"Sorry, an internal configuration error occurred with the AI service ({GROQ_LLAMA4_MODEL})."
        return None, f"Sorry, an unexpected error occurred while using the AI service ({GROQ_LLAMA4_MODEL})."
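# Usage sketch (illustrative): every _call_* helper returns a (summary, error)
# pair, so a caller can fall through to the next model in the chain on error:
#   summary, err = await _call_groq(content_text, "paragraph")
#   if err:
#       # ...try the next configured model...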
async def _call_gemini(text: str, summary_type: str, model_name: str) -> Tuple[Optional[str], Optional[str]]: | |
"""Internal function to call Gemini API. Returns (summary, error_message).""" | |
# Make sure globals are accessible if needed (import should suffice) | |
global _gemini_api_enabled, HarmCategory, HarmBlockThreshold, genai, google | |
if not _gemini_api_enabled: | |
logger.error(f"[Gemini {model_name}] Called but API is disabled."); | |
return None, f"Error: AI service (Gemini API) not configured/available." | |
# Check if SDK and necessary types are loaded | |
if genai is None or HarmCategory is None or HarmBlockThreshold is None: | |
logger.error(f"[Gemini {model_name}] SDK or safety types (HarmCategory/HarmBlockThreshold) are None/unavailable.") | |
return None, f"Sorry, an internal configuration error occurred with the AI service ({model_name}). SDK components missing." | |
logger.info(f"[Gemini {model_name}] Generating {summary_type} summary using {model_name}. Input length: {len(text)}") | |
prompt = PROMPT_PARAGRAPH if summary_type == "paragraph" else PROMPT_POINTS | |
MAX_INPUT_LENGTH_GEMINI = 900000 | |
if 'pro-exp' in model_name.lower(): MAX_INPUT_LENGTH_GEMINI = 1800000 | |
if len(text) > MAX_INPUT_LENGTH_GEMINI: | |
logger.warning(f"[Gemini {model_name}] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating."); | |
text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)" | |
full_prompt = f"{prompt}\n\n{text}" | |
# Define safety_settings | |
safety_settings = {} | |
try: | |
# Use only categories known to exist in the SDK | |
safety_settings = { | |
HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, | |
HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, | |
HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, | |
HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, | |
} | |
settings_applied_str = ", ".join([k.name for k in safety_settings.keys()]) | |
logger.debug(f"[Gemini {model_name}] Applying BLOCK_NONE to available safety categories: [{settings_applied_str}]") | |
except (NameError, AttributeError) as e: | |
logger.error(f"[Gemini {model_name}] Unexpected error defining safety settings ({type(e).__name__}): {e}.", exc_info=True) | |
return None, f"Sorry, an internal error occurred configuring the AI service ({model_name}). Safety settings definition failed." | |
if not safety_settings: | |
logger.error(f"[Gemini {model_name}] Failed to define any safety settings.") | |
return None, f"Sorry, an internal error occurred configuring the AI service ({model_name}). No safety settings defined." | |
# --- API Call --- | |
try: | |
model = genai.GenerativeModel(model_name) | |
logger.info(f"[Gemini {model_name}] Sending request to Gemini ({model_name})...") | |
request_options = {"timeout": 120} | |
response = await model.generate_content_async( | |
full_prompt, generation_config=genai.types.GenerationConfig(), | |
safety_settings=safety_settings, request_options=request_options ) | |
logger.info(f"[Gemini {model_name}] Received response from Gemini.") | |
# --- Response Handling (same as before) --- | |
if response.prompt_feedback and response.prompt_feedback.block_reason: | |
block_reason_str = getattr(response.prompt_feedback.block_reason, 'name', str(response.prompt_feedback.block_reason)) | |
logger.warning(f"[Gemini {model_name}] Request blocked by API. Reason: {block_reason_str}"); | |
return None, f"Sorry, the AI model ({model_name}) blocked the request (Reason: {block_reason_str})." | |
summary = None; finish_reason_str = 'UNKNOWN' | |
if response.candidates: | |
candidate = response.candidates[0] | |
finish_reason_name = getattr(candidate.finish_reason, 'name', None) | |
finish_reason_str = finish_reason_name or 'N/A' | |
if finish_reason_name == 'SAFETY': | |
safety_ratings_str = "N/A" | |
if hasattr(candidate, 'safety_ratings'): | |
safety_ratings_str = ", ".join([f"{rating.category.name}: {rating.probability.name}" for rating in candidate.safety_ratings]) | |
logger.warning(f"[Gemini {model_name}] Candidate blocked due to SAFETY. Finish Reason: {finish_reason_str}. Ratings: [{safety_ratings_str}]") | |
return None, f"Sorry, the AI model ({model_name}) generated a response that was blocked due to safety filters ({finish_reason_str})." | |
elif finish_reason_name not in ['STOP', 'MAX_TOKENS', None]: | |
logger.warning(f"[Gemini {model_name}] Candidate finished with non-standard reason: {finish_reason_str}") | |
if candidate.content and candidate.content.parts: | |
summary = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text')) | |
if summary is None: | |
try: | |
if hasattr(response, 'text'): summary = response.text | |
else: logger.warning(f"[Gemini {model_name}] Response object lacks 'text' attribute."); summary = None | |
except ValueError as e: logger.warning(f"[Gemini {model_name}] Error accessing response.text (likely blocked/no content): {e}"); summary = None | |
except Exception as e: logger.warning(f"[Gemini {model_name}] Unexpected error accessing response.text: {e}"); summary = None | |
if summary: | |
logger.info(f"[Gemini {model_name}] Success generating summary. Finish Reason: {finish_reason_str}. Output len: {len(summary)}"); | |
return summary.strip(), None | |
else: | |
logger.warning(f"[Gemini {model_name}] Gemini returned empty summary or content was blocked. Final Finish Reason: {finish_reason_str}"); | |
if finish_reason_str == 'SAFETY': return None, f"Sorry, the AI model ({model_name}) response was blocked by safety filters." | |
return None, f"Sorry, the AI model ({model_name}) did not provide a summary (Finish Reason: {finish_reason_str})." | |
# --- Exception Handling (Corrected Syntax) --- | |
except AttributeError as ae: | |
logger.error(f"[Gemini {model_name}] AttributeError during Gemini response processing: {ae}. SDK/response structure issue.", exc_info=True); | |
return None, f"Sorry, there was an issue processing the response from the AI service ({model_name}). Attribute error." | |
# --- Specific Google API Error Handling ---
# Safe to reference google.api_core.exceptions here: _call_gemini returns early
# (see the SDK guard above) when the import failed, so these types are always defined.
except google.api_core.exceptions.NotFound as nfe: | |
logger.error(f"[Gemini {model_name}] Model Not Found error from Gemini API: {nfe}", exc_info=False) | |
user_message = f"Sorry, the AI model '{model_name}' was not found by the API service. It might be unavailable or spelled incorrectly." | |
return None, user_message | |
except google.api_core.exceptions.InvalidArgument as iae: | |
logger.error(f"[Gemini {model_name}] Invalid Argument error from Gemini API: {iae}", exc_info=False) | |
error_detail = str(iae) | |
user_message = f"Sorry, the AI service ({model_name}) reported an invalid argument." | |
if "API key not valid" in error_detail: user_message = f"Error: The API key for the AI service ({model_name}) is invalid." | |
return None, user_message | |
except google.api_core.exceptions.PermissionDenied as pde: | |
logger.error(f"[Gemini {model_name}] Permission Denied error from Gemini API: {pde}", exc_info=False) | |
user_message = f"Error: Access denied for the AI service ({model_name}). Check API key permissions." | |
return None, user_message | |
except google.api_core.exceptions.ResourceExhausted as ree: | |
logger.error(f"[Gemini {model_name}] Resource Exhausted (Quota/Rate Limit) error from Gemini API: {ree}", exc_info=False) | |
user_message = f"Sorry, the AI model ({model_name}) is busy or quota exceeded. Please try again later." | |
return None, user_message | |
except google.api_core.exceptions.GoogleAPIError as gae: | |
logger.error(f"[Gemini {model_name}] Google API error during Gemini call: {gae}", exc_info=False) | |
status_code = getattr(gae, 'code', 'Unknown') | |
user_message = f"Sorry, the AI service ({model_name}) encountered an API error (Code: {status_code})." | |
if status_code == 500: user_message = f"Sorry, the AI service ({model_name}) had an internal server error." | |
return None, user_message | |
# --- General Exception Catch-all ---
# The early SDK guard guarantees google.api_core.exceptions was imported, so the
# specific handlers above can never raise NameError; a plain catch-all suffices.
except Exception as e:
logger.error(f"[Gemini {model_name}] Unexpected error during Gemini API call: {e}", exc_info=True)
return None, f"Sorry, an unexpected error occurred while using the AI service ({model_name})."
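# Illustrative use of the helper above (variable names hypothetical):
#   summary, err = await _call_gemini(article_text, "paragraph", GEMINI_FLASH_MODEL)
#   if err: logger.warning(err)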
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]: | |
"""Internal function to call OpenRouter API (Final Fallback - DeepSeek V3 Free). Returns (summary, error_message).""" | |
global OPENROUTER_API_KEY, OPENROUTER_DEEPSEEK_MODEL, _openrouter_fallback_enabled | |
if not _openrouter_fallback_enabled: | |
logger.error("[OpenRouter Fallback] Called but is disabled."); | |
return None, "Error: Final fallback AI service (OpenRouter) not configured/available." | |
logger.info(f"[OpenRouter Fallback] Generating {summary_type} summary using {OPENROUTER_DEEPSEEK_MODEL}. Input length: {len(text)}") | |
prompt = PROMPT_PARAGRAPH if summary_type == "paragraph" else PROMPT_POINTS | |
MAX_INPUT_LENGTH_OR = 100000 # ~100k chars ≈ 25k tokens, comfortably within DeepSeek V3's 131k-token context
if len(text) > MAX_INPUT_LENGTH_OR: | |
logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}) for {OPENROUTER_DEEPSEEK_MODEL}. Truncating."); | |
text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)" | |
full_prompt = f"{prompt}\n\n{text}" | |
headers = { | |
"Authorization": f"Bearer {OPENROUTER_API_KEY}", | |
"Content-Type": "application/json", | |
"HTTP-Referer": os.environ.get("YOUR_SITE_URL", "https://github.com/your-repo"), | |
"X-Title": os.environ.get("YOUR_SITE_NAME", "TelegramSummariserBot") | |
} | |
payload = { "model": OPENROUTER_DEEPSEEK_MODEL, "messages": [{"role": "user", "content": full_prompt}], } | |
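# For reference, OpenRouter follows the OpenAI chat-completions schema, so a
# success body looks roughly like (trimmed):
#   {"choices": [{"message": {"role": "assistant", "content": "..."},
#                 "finish_reason": "stop"}], ...}
# which is exactly the shape the parsing below expects.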
openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions" | |
api_timeouts = httpx.Timeout(connect=10.0, read=60.0, write=10.0, pool=60.0) | |
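# httpx timeout semantics: 'connect' bounds the TCP/TLS handshake, 'read' the
# wait for response bytes (the long pole for LLM calls), 'write' sending the
# request body, and 'pool' the wait for a free connection from the pool.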
response = None | |
try: | |
async with httpx.AsyncClient(timeout=api_timeouts) as client: | |
logger.info(f"[OpenRouter Fallback] Sending request to OpenRouter ({OPENROUTER_DEEPSEEK_MODEL}) with read timeout {api_timeouts.read}s...") | |
response = await client.post(openrouter_api_endpoint, headers=headers, json=payload) | |
logger.info(f"[OpenRouter Fallback] Received response from OpenRouter. Status code: {response.status_code}") | |
if response.status_code == 200: | |
try: | |
data = response.json() | |
if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0: | |
choice = data["choices"][0]; message = choice.get("message"); finish_reason = choice.get("finish_reason", "N/A") | |
if message and isinstance(message, dict): | |
summary = message.get("content") | |
if summary: logger.info(f"[OpenRouter Fallback] Success via OpenRouter. Finish: {finish_reason}. Output len: {len(summary)}"); return summary.strip(), None | |
else: logger.warning(f"[OpenRouter Fallback] OpenRouter success but content empty. Finish: {finish_reason}. Resp: {data}"); return None, f"Sorry, the fallback AI model ({OPENROUTER_DEEPSEEK_MODEL}) returned an empty summary (Finish: {finish_reason})." | |
else: logger.error(f"[OpenRouter Fallback] Unexpected message structure: {message}. Finish: {finish_reason}. Full: {data}"); return None, "Sorry, could not parse fallback AI response (message format)." | |
else: | |
error_details = data.get("error", {}); logger.error(f"[OpenRouter Fallback] Unexpected choices structure or error in response: {data.get('choices')}. Error: {error_details}. Full: {data}"); | |
return None, f"Sorry, could not parse fallback AI response (choices structure or error: {error_details.get('message', 'Unknown')})." | |
except json.JSONDecodeError: logger.error(f"[OpenRouter Fallback] Failed JSON decode OpenRouter. Status:{response.status_code}. Resp:{response.text[:500]}"); return None, "Sorry, failed to understand fallback AI response." | |
except Exception as e: logger.error(f"[OpenRouter Fallback] Error processing OpenRouter success response: {e}", exc_info=True); return None, "Sorry, error processing fallback AI response." | |
elif response.status_code == 401: logger.error("[OpenRouter Fallback] API key invalid (401)."); return None, f"Error: Fallback AI model ({OPENROUTER_DEEPSEEK_MODEL}) configuration key is invalid." | |
elif response.status_code == 402: logger.error("[OpenRouter Fallback] Payment Required/Quota Exceeded (402)."); return None, f"Sorry, fallback AI service ({OPENROUTER_DEEPSEEK_MODEL}) quota/limit issue." | |
elif response.status_code == 429: logger.warning("[OpenRouter Fallback] Rate Limit Exceeded (429)."); return None, f"Sorry, fallback AI model ({OPENROUTER_DEEPSEEK_MODEL}) is busy. Try again." | |
elif response.status_code == 500: logger.error(f"[OpenRouter Fallback] Internal Server Error (500). Resp:{response.text[:500]}"); return None, f"Sorry, fallback AI service ({OPENROUTER_DEEPSEEK_MODEL}) had an internal error." | |
else: | |
error_info = ""; | |
try: error_info = response.json().get("error", {}).get("message", "") | |
except Exception: pass | |
logger.error(f"[OpenRouter Fallback] Unexpected status {response.status_code}. Error: '{error_info}' Resp:{response.text[:500]}"); | |
return None, f"Sorry, fallback AI service ({OPENROUTER_DEEPSEEK_MODEL}) returned unexpected status ({response.status_code})." | |
except httpx.TimeoutException as e: logger.error(f"[OpenRouter Fallback] Timeout error ({type(e)}) connecting/reading from OpenRouter API: {e}"); return None, f"Sorry, the fallback AI service ({OPENROUTER_DEEPSEEK_MODEL}) timed out." | |
except httpx.RequestError as e: logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}"); return None, "Sorry, there was an error connecting to the fallback AI model service." | |
except Exception as e: logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter call: {e}", exc_info=True); return None, "Sorry, an unexpected error occurred while using the fallback AI service." | |
async def generate_summary(text: str, summary_type: str) -> str: | |
""" | |
Generates summary using the specific model hierarchy (April 2025 - Updated): | |
1. Gemini 2.5 Flash Preview (NEW) | |
2. Gemini 2.5 Pro Exp | |
3. Gemini 2.0 Flash | |
4. OpenRouter (DeepSeek V3 Free) | |
5. Groq (Llama 4 Scout) | |
Returns the summary text or a comprehensive error message. | |
""" | |
global _gemini_api_enabled, _openrouter_fallback_enabled, _groq_enabled | |
global GEMINI_FLASH_PREVIEW_MODEL, GEMINI_PRO_EXP_MODEL, GEMINI_FLASH_MODEL, OPENROUTER_DEEPSEEK_MODEL, GROQ_LLAMA4_MODEL | |
logger.info("[Summary Generation] Starting process with updated April 2025 model hierarchy.") | |
summary: Optional[str] = None | |
# Use more descriptive keys matching the model variables | |
errors: Dict[str, Optional[str]] = { | |
GEMINI_FLASH_PREVIEW_MODEL: None, | |
GEMINI_PRO_EXP_MODEL: None, | |
GEMINI_FLASH_MODEL: None, | |
OPENROUTER_DEEPSEEK_MODEL: None, | |
GROQ_LLAMA4_MODEL: None, | |
} | |
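# Caveat: keys are the model-name strings themselves, so this assumes the five
# configured model ids are distinct; duplicate ids would share one error slot.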
# --- Attempt 1: Gemini 2.5 Flash Preview (NEW) --- | |
if _gemini_api_enabled: | |
logger.info(f"[Summary Generation] Attempting 1: Gemini ({GEMINI_FLASH_PREVIEW_MODEL})") | |
summary, errors[GEMINI_FLASH_PREVIEW_MODEL] = await _call_gemini(text, summary_type, GEMINI_FLASH_PREVIEW_MODEL) | |
if summary: | |
logger.info(f"[Summary Generation] Success with Gemini ({GEMINI_FLASH_PREVIEW_MODEL}).") | |
return summary | |
else: | |
logger.warning(f"[Summary Generation] Gemini Flash Preview failed. Error: {errors[GEMINI_FLASH_PREVIEW_MODEL]}. Proceeding to Gemini 2.5 Pro Exp.") | |
else: | |
logger.warning("[Summary Generation] Gemini API is disabled or unavailable. Skipping all Gemini models.") | |
errors[GEMINI_FLASH_PREVIEW_MODEL] = "Service disabled/unavailable." | |
errors[GEMINI_PRO_EXP_MODEL] = "Service disabled/unavailable." | |
errors[GEMINI_FLASH_MODEL] = "Service disabled/unavailable." | |
# --- Attempt 2: Gemini 2.5 Pro Exp --- | |
# Only attempt if API is enabled AND the previous step didn't mark it as disabled | |
if _gemini_api_enabled and errors[GEMINI_PRO_EXP_MODEL] is None: | |
logger.info(f"[Summary Generation] Attempting 2: Gemini ({GEMINI_PRO_EXP_MODEL})") | |
summary, errors[GEMINI_PRO_EXP_MODEL] = await _call_gemini(text, summary_type, GEMINI_PRO_EXP_MODEL) | |
if summary: | |
logger.info(f"[Summary Generation] Success with Gemini ({GEMINI_PRO_EXP_MODEL}).") | |
return summary | |
else: | |
logger.warning(f"[Summary Generation] Gemini 2.5 Pro Exp failed. Error: {errors[GEMINI_PRO_EXP_MODEL]}. Proceeding to Gemini 2.0 Flash.") | |
# No separate 'else' needed here, handled by the initial Gemini API check | |
# --- Attempt 3: Gemini 2.0 Flash --- | |
# Only attempt if API is enabled AND it wasn't marked as disabled earlier | |
if _gemini_api_enabled and errors[GEMINI_FLASH_MODEL] is None: | |
logger.info(f"[Summary Generation] Attempting 3: Gemini ({GEMINI_FLASH_MODEL})") | |
summary, errors[GEMINI_FLASH_MODEL] = await _call_gemini(text, summary_type, GEMINI_FLASH_MODEL) | |
if summary: | |
logger.info(f"[Summary Generation] Success with Gemini ({GEMINI_FLASH_MODEL}).") | |
return summary | |
else: | |
logger.warning(f"[Summary Generation] Gemini 2.0 Flash failed. Error: {errors[GEMINI_FLASH_MODEL]}. Proceeding to OpenRouter DeepSeek V3.") | |
# No separate 'else' needed here | |
# --- Attempt 4: OpenRouter (DeepSeek V3 Free) --- | |
if _openrouter_fallback_enabled: | |
logger.info(f"[Summary Generation] Attempting 4: OpenRouter ({OPENROUTER_DEEPSEEK_MODEL})") | |
summary, errors[OPENROUTER_DEEPSEEK_MODEL] = await _call_openrouter(text, summary_type) | |
if summary: | |
logger.info(f"[Summary Generation] Success with OpenRouter ({OPENROUTER_DEEPSEEK_MODEL}).") | |
return summary | |
else: | |
logger.warning(f"[Summary Generation] OpenRouter DeepSeek V3 failed. Error: {errors[OPENROUTER_DEEPSEEK_MODEL]}. Proceeding to Groq Llama 4 Scout.") | |
else: | |
logger.warning("[Summary Generation] OpenRouter fallback (DeepSeek V3) is disabled or unavailable. Skipping.") | |
errors[OPENROUTER_DEEPSEEK_MODEL] = "Service disabled/unavailable." | |
# --- Attempt 5: Groq (Llama 4 Scout - Final Fallback) --- | |
if _groq_enabled: | |
logger.info(f"[Summary Generation] Attempting 5: Groq ({GROQ_LLAMA4_MODEL})") | |
summary, errors[GROQ_LLAMA4_MODEL] = await _call_groq(text, summary_type) | |
if summary: | |
logger.info(f"[Summary Generation] Success with Groq ({GROQ_LLAMA4_MODEL}).") | |
return summary | |
else: | |
logger.error(f"[Summary Generation] Groq Llama 4 Scout (Final Fallback) also failed. Error: {errors[GROQ_LLAMA4_MODEL]}") | |
else: | |
logger.error("[Summary Generation] Groq (Llama 4 Scout) is disabled or unavailable. Cannot proceed.") | |
errors[GROQ_LLAMA4_MODEL] = "Service disabled/unavailable." | |
# --- All Attempts Failed --- | |
logger.error("[Summary Generation] All summarization models failed.") | |
# Use the model names as keys for clearer error reporting | |
error_details = "\n".join([f"- {model}: {err}" for model, err in errors.items() if err]) | |
return f"Sorry, I couldn't generate a summary after trying all available AI models.\nDetails:\n{error_details}" | |
# --- Main Processing Logic --- | |
async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None: | |
"""Handles the entire process: fetching content (with ALL fallbacks) and summarizing.""" | |
task_id = f"{user_id}-{message_id_to_edit or 'new'}"; logger.info(f"[Task {task_id}] Starting processing for URL: {url}") | |
background_request: Optional[BaseRequest] = None; bot: Optional[Bot] = None | |
try: background_request = HTTPXRequest( connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0 ); bot = Bot(token=bot_token, request=background_request) | |
except Exception as e: logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True); return | |
content: Optional[str] = None | |
user_feedback_message: Optional[str] = None | |
success: bool = False | |
status_message_id: Optional[int] = message_id_to_edit | |
message_to_delete_later_id : Optional[int] = None | |
try: | |
# --- 1. Initial User Feedback --- | |
processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (this might take a moment)..." | |
if status_message_id: | |
try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'") | |
except Exception as e: logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new."); status_message_id = None | |
if not status_message_id: | |
try: | |
status_message = await retry_bot_operation( bot.send_message, chat_id=chat_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN ) | |
if status_message: message_to_delete_later_id = status_message.message_id; logger.debug(f"[Task {task_id}] Sent new status message {message_to_delete_later_id}") | |
else: raise RuntimeError("Failed to send status message after retries.") | |
except Exception as e: logger.error(f"[Task {task_id}] CRITICAL: Failed to send new status message: {e}"); raise | |
try: | |
# --- 2. Content Fetching (Chain of methods) --- | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}") | |
if is_youtube: | |
video_id = extract_youtube_id(url) | |
if video_id: content = await get_youtube_transcript(video_id, url) | |
else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format." | |
if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)." | |
else: | |
# Website URL processing with NEW fallback order: 1 -> 5 -> 6 -> 3 -> 4 -> 2 | |
global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN, _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists | |
# --- Method 1: Direct Fetch + BS4 (Primary) --- | |
logger.info(f"[Task {task_id}] Trying Web Scrape Method 1 (Direct Fetch + BS4)..."); | |
content = await get_website_content(url) | |
# --- Method 5: Apify Crawler (Fallback 1) --- | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 5 (Apify Crawler)...") | |
if _apify_token_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_apify_crawler(url, APIFY_API_TOKEN) | |
else: | |
logger.warning("[Task {task_id}] Method 5 (Apify Crawler) APIFY_API_TOKEN unavailable. Skipping.") | |
# --- Method 6: Apify Text Scraper (Fallback 2) --- | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 5 failed. Trying Method 6 (Apify Text Scraper)...") | |
if _apify_token_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_apify_text_scraper(url, APIFY_API_TOKEN) | |
else: | |
logger.warning("[Task {task_id}] Method 6 (Apify Text Scraper) APIFY_API_TOKEN unavailable. Skipping.") | |
# --- Method 3: RapidAPI - Scraper's Proxy (Fallback 3) --- | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 6 failed. Trying Method 3 (Scraper's Proxy)...") | |
if _rapidapi_key_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_scrapers_proxy(url, RAPIDAPI_KEY) | |
else: | |
logger.warning("[Task {task_id}] Method 3 (Scraper's Proxy) RapidAPI key unavailable. Skipping.") | |
# --- Method 4: RapidAPI - AI Web Scraper (Fallback 4) --- | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 3 failed. Trying Method 4 (AI Web Scraper)...") | |
if _rapidapi_key_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_ai_web_scraper(url, RAPIDAPI_KEY) | |
else: | |
logger.warning("[Task {task_id}] Method 4 (AI Web Scraper) RapidAPI key unavailable. Skipping.") | |
# --- Method 2: urltotext.com API (Fallback 5) --- | |
if not content: | |
logger.warning(f"[Task {task_id}] Method 4 failed. Trying Method 2 (urltotext.com)...") | |
if _urltotext_key_exists: | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
content = await get_website_content_via_api(url, URLTOTEXT_API_KEY) | |
else: | |
logger.warning("[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.") | |
# --- Final check after all attempts --- | |
if not content and not user_feedback_message: | |
logger.error(f"[Task {task_id}] All website fetching methods (1, 5, 6, 3, 4, 2) failed for {url}") | |
user_feedback_message = "Sorry, I couldn't fetch readable content from that website using multiple methods (blocked/dynamic content/empty?)." | |
# --- 3. Summarization --- | |
if content: | |
logger.info(f"[Task {task_id}] Content fetched successfully (len:{len(content)}). Generating summary.") | |
try: | |
status_update_msg_id = message_to_delete_later_id or status_message_id | |
if status_update_msg_id: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_update_msg_id, text=f"Content fetched! Now generating '{summary_type}' summary...", parse_mode=ParseMode.MARKDOWN, reply_markup=None ) | |
except Exception as edit_e: logger.warning(f"[Task {task_id}] Failed to edit status message before summary: {edit_e}") | |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing') | |
final_summary = await generate_summary(content, summary_type) # Calls the updated function | |
if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"): | |
user_feedback_message = final_summary | |
logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}") | |
else: | |
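# Telegram caps a single message at 4096 characters, so split the summary into
# sequential chunks (split points are raw character offsets, so a chunk may end
# mid-word).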
max_length = 4096 | |
summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)] | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} ) | |
for part in summary_parts[1:]: await asyncio.sleep(0.5); await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} ) | |
success = True | |
logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).") | |
user_feedback_message = None | |
# --- 4. Handle Final Failure Feedback --- | |
if user_feedback_message: | |
logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}") | |
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} ) | |
except Exception as e: | |
logger.error(f"[Task {task_id}] Unexpected error during processing: {e}", exc_info=True) | |
user_feedback_message = "Oops! Something went really wrong during processing. Please try again later." | |
try: await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message ) | |
except Exception: logger.error(f"[Task {task_id}] Failed to send unexpected error feedback.") | |
except Exception as outer_e: | |
logger.critical(f"[Task {task_id}] Critical outer error: {outer_e}", exc_info=True) | |
try: | |
if bot: await retry_bot_operation( bot.send_message, chat_id=chat_id, text="❌ Critical internal error occurred. Could not start processing." ) | |
except Exception: logger.exception(f"[Task {task_id}] Failed even to send critical error message.") | |
finally: | |
# --- 5. Cleanup --- | |
delete_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id | |
if delete_target_id and bot: | |
try: await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=delete_target_id); logger.debug(f"[Task {task_id}] Deleted status/button message {delete_target_id}") | |
except Exception as del_e: logger.warning(f"[Task {task_id}] Failed to delete status/button message {delete_target_id}: {del_e}") | |
if background_request and hasattr(background_request, '_client') and background_request._client: | |
try: await background_request._client.aclose(); logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.") | |
except Exception as e: logger.warning(f"[Task {task_id}] Error closing background bot's client: {e}") | |
logger.info(f"[Task {task_id}] Task completed. Success: {success}") | |
# --- Telegram Handlers --- | |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
user = update.effective_user
if not user or not update.message: return
mention = user.mention_html()
logger.info(f"User {user.id} ({user.username or 'no_username'}) used /start.") | |
await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" ) | |
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
user = update.effective_user | |
if not user or not update.message: return | |
logger.info(f"User {user.id} ({user.username or 'no_username'}) used /help.") | |
# Updated help text reflecting the new model order | |
help_text = ( "🔍 **How to use this bot:**\n\n" | |
"1. Send me any YouTube video link or website URL.\n" | |
"2. I'll ask how you want it summarised (paragraph or points).\n" | |
"3. Click the button for your choice.\n" | |
"4. Wait while I fetch the content and generate the summary!\n\n" | |
"⚙️ I try multiple methods to get content, especially for tricky websites or YouTube videos without standard transcripts. I then use a sequence of AI models (Gemini 2.5 Flash Preview, Gemini 2.5 Pro, Gemini 2.0 Flash, DeepSeek V3, Llama 4 Scout) to summarise.\n\n" # Updated model list here | |
"**Commands:**\n" | |
"`/start` - Display the welcome message\n" | |
"`/help` - Show this help message" ) | |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN) | |
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
if not update.message or not update.message.text: return | |
url = update.message.text.strip(); user = update.effective_user | |
if not user: return | |
url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE) | |
match = url_pattern.search(url) | |
if match: | |
extracted_url = match.group(0) | |
logger.info(f"User {user.id} ({user.username or 'no_username'}) sent potential URL: {extracted_url}") | |
context.user_data['url_to_summarize'] = extracted_url | |
context.user_data['original_message_id'] = update.message.message_id | |
keyboard = [[ InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"), InlineKeyboardButton("Points Summary", callback_data="points") ]] | |
reply_markup = InlineKeyboardMarkup(keyboard) | |
try: | |
await update.message.reply_text( | |
f"Okay, I see this link:\n`{extracted_url}`\n\nHow would you like it summarised?", | |
reply_markup=reply_markup, | |
disable_web_page_preview=True, | |
parse_mode=ParseMode.MARKDOWN | |
) | |
except BadRequest as e: | |
if "chat not found" in str(e).lower() or "bot was blocked by the user" in str(e).lower(): logger.warning(f"Could not reply to user {user.id} (chat not found or blocked).") | |
else: logger.error(f"BadRequest replying to URL message from {user.id}: {e}") | |
except Exception as e: logger.error(f"Error replying to URL message from {user.id}: {e}", exc_info=True) | |
else: | |
logger.debug(f"Ignoring non-URL message from {user.id}: {url[:100]}") | |
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: | |
query = update.callback_query | |
if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return | |
user = query.from_user; summary_type = query.data; query_id = query.id | |
try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id} ({user.username or 'no_username'})") | |
except BadRequest as e: | |
if "query is too old" in str(e).lower(): logger.warning(f"Callback query {query_id} is too old to answer."); return | |
else: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True) | |
except Exception as e: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True) | |
url = context.user_data.get('url_to_summarize'); message_id_to_edit = query.message.message_id | |
logger.info(f"User {user.id} chose '{summary_type}' for msg {message_id_to_edit}. URL in context: {'Yes' if url else 'No'}") | |
if not url: | |
logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Might be an old button click.") | |
try: await query.edit_message_text(text="Sorry, I couldn't find the original URL for this request (it might be too old). Please send the link again.") | |
except BadRequest as e: | |
if "message is not modified" in str(e).lower() or "message to edit not found" in str(e).lower(): pass | |
else: logger.warning(f"Failed to edit 'URL not found' message {message_id_to_edit} for user {user.id}: {e}") | |
except Exception as e: logger.warning(f"Error editing 'URL not found' message {message_id_to_edit} for user {user.id}: {e}") | |
return | |
context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}") | |
# Check essential configurations - requires at least ONE summarizer to be enabled | |
global TELEGRAM_TOKEN, _groq_enabled, _gemini_api_enabled, _openrouter_fallback_enabled | |
if not TELEGRAM_TOKEN: | |
logger.critical("TELEGRAM_TOKEN missing in callback!") | |
try: await query.edit_message_text(text="❌ Bot configuration error (Token Missing). Cannot proceed.") | |
except Exception: pass | |
return | |
if not _groq_enabled and not _gemini_api_enabled and not _openrouter_fallback_enabled: | |
logger.critical("No summarization models are configured/valid in callback!") | |
try: await query.edit_message_text(text="❌ AI configuration error: No summarization models available.") | |
except Exception: pass | |
return | |
# Log warnings if specific models/APIs are unavailable but don't stop the process if fallbacks exist | |
if not _groq_enabled: logger.warning("Primary AI (Groq Llama 4 Scout) is unavailable.") | |
if not _gemini_api_enabled: logger.warning("Gemini API is unavailable (skipping 2.5 Pro Exp & 2.0 Flash).") | |
if not _openrouter_fallback_enabled: logger.warning("Final Fallback AI (OpenRouter DeepSeek V3) is unavailable.") | |
logger.info(f"Scheduling background task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit}") | |
asyncio.create_task( | |
process_summary_task( | |
user_id=user.id, | |
chat_id=query.message.chat_id, | |
message_id_to_edit=message_id_to_edit, | |
url=url, | |
summary_type=summary_type, | |
bot_token=TELEGRAM_TOKEN | |
), | |
name=f"SummaryTask-{user.id}-{message_id_to_edit}" | |
) | |
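# Note: asyncio only keeps weak references to tasks, so a fire-and-forget task
# like this can in principle be garbage-collected mid-flight; holding the
# returned task in a collection (e.g. a module-level set) is the usual guard.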
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None: | |
"""Log Errors caused by Updates.""" | |
logger.error("Exception while handling an update:", exc_info=context.error) | |
# --- Application Setup & Web Framework --- | |
async def setup_bot_config() -> Application: | |
logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN | |
if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.") | |
custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 ) | |
application = Application.builder().token(TELEGRAM_TOKEN).request(custom_request).build() | |
application.add_handler(CommandHandler("start", start)) | |
application.add_handler(CommandHandler("help", help_command)) | |
url_filter = filters.Entity("url") | filters.Entity("text_link") | |
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND & url_filter, handle_potential_url)) | |
application.add_handler(CallbackQueryHandler(handle_summary_type_callback)) | |
application.add_error_handler(error_handler) | |
logger.info("Telegram application handlers configured."); return application | |
@contextlib.asynccontextmanager # Starlette expects a context-manager factory; bare async generators are deprecated
async def lifespan(app: Starlette):
global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN | |
logger.info("ASGI Lifespan: Startup initiated..."); | |
if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.") | |
bot_setup_successful = False | |
try: | |
ptb_app = await setup_bot_config(); await ptb_app.initialize() | |
bot_info = await ptb_app.bot.get_me(); logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})") | |
bot_setup_successful = True | |
current_webhook_info = await ptb_app.bot.get_webhook_info() | |
if current_webhook_info and current_webhook_info.url: | |
logger.info(f"Found existing webhook: {current_webhook_info.url}. Attempting to delete...") | |
try: | |
if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Existing webhook deleted.") | |
else: logger.warning("Failed delete existing webhook (API returned False). Might be okay.") | |
except Exception as e: logger.warning(f"Could not delete existing webhook: {e}. Proceeding anyway."); await asyncio.sleep(1) | |
space_host = os.environ.get("SPACE_HOST"); webhook_path = "/webhook"; full_webhook_url = None | |
if space_host: | |
protocol = "https"; host = space_host.split('://')[-1]; full_webhook_url = f"{protocol}://{host.rstrip('/')}{webhook_path}" | |
if full_webhook_url: | |
logger.info(f"Setting webhook to: {full_webhook_url}") | |
set_webhook_args = { "url": full_webhook_url, "allowed_updates": Update.ALL_TYPES, "drop_pending_updates": True } | |
if WEBHOOK_SECRET: set_webhook_args["secret_token"] = WEBHOOK_SECRET; logger.info("Webhook secret token is configured.") | |
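# Telegram echoes this secret back on every webhook call in the
# X-Telegram-Bot-Api-Secret-Token header, which telegram_webhook() verifies below.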
await asyncio.sleep(1.5) | |
try: | |
await ptb_app.bot.set_webhook(**set_webhook_args) | |
webhook_info = await ptb_app.bot.get_webhook_info() | |
if webhook_info.url == full_webhook_url: logger.info(f"Webhook successfully set and verified: URL='{webhook_info.url}', Secret={bool(WEBHOOK_SECRET)}") | |
else: logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got '{webhook_info.url}'.") | |
await ptb_app.start(); logger.info("PTB Application started in webhook mode.") | |
except Exception as e: logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True); raise RuntimeError(f"Failed to set webhook: {e}") from e | |
else: logger.critical("Could not construct webhook URL from SPACE_HOST."); raise RuntimeError("Webhook URL could not be determined.") | |
else: logger.critical("SPACE_HOST environment variable not found. Cannot set webhook automatically."); raise RuntimeError("SPACE_HOST environment variable is missing.") | |
logger.info("ASGI Lifespan: Startup complete."); yield | |
except Exception as startup_err: | |
logger.critical(f"Application startup failed: {startup_err}", exc_info=True) | |
if ptb_app and bot_setup_successful: | |
if ptb_app.running: await ptb_app.stop() | |
await ptb_app.shutdown() | |
raise | |
finally: | |
logger.info("ASGI Lifespan: Shutdown initiated...") | |
if ptb_app and bot_setup_successful: | |
if ptb_app.running: logger.info("Stopping PTB Application..."); await ptb_app.stop() | |
logger.info("Shutting down PTB Application..."); await ptb_app.shutdown(); logger.info("PTB Application shut down.") | |
try: | |
logger.info("Attempting to delete webhook on shutdown...") | |
if ptb_app.bot and hasattr(ptb_app.bot, 'delete_webhook'): | |
if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Webhook deleted on shutdown.") | |
else: logger.warning("Failed to delete webhook on shutdown (API returned False).") | |
else: logger.warning("Cannot delete webhook: Bot object unavailable.") | |
except Exception as e: logger.warning(f"Could not delete webhook during shutdown: {e}") | |
else: logger.info("PTB application was not fully initialized or failed during startup. No shutdown actions needed.") | |
logger.info("ASGI Lifespan: Shutdown complete.") | |
async def health_check(request: Request) -> PlainTextResponse: | |
"""Simple health check endpoint.""" | |
# Include the new model variable | |
global GEMINI_FLASH_PREVIEW_MODEL, GROQ_LLAMA4_MODEL, GEMINI_PRO_EXP_MODEL, GEMINI_FLASH_MODEL, OPENROUTER_DEEPSEEK_MODEL | |
global APIFY_ACTOR_ID, APIFY_STRUCTURED_YT_ACTOR_ID | |
global _groq_enabled, _gemini_api_enabled, _openrouter_fallback_enabled | |
global _apify_token_exists, _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY | |
bot_status = "Not Initialized"; bot_username = "N/A" | |
if ptb_app: | |
try: | |
# Check if bot is running and get info | |
if ptb_app.running: | |
# Wrap get_me in a try-except in case the bot disconnects during check | |
try: | |
bot_info = await ptb_app.bot.get_me(); bot_username = f"@{bot_info.username}" if bot_info and bot_info.username else "Info Fetch Error"; bot_status = "Running" | |
except (TimedOut, NetworkError, TelegramError) as bot_err: | |
logger.warning(f"Health check: Error getting bot info while running: {bot_err}") | |
bot_status = "Running (Info Error)"; bot_username = "Fetch Error" | |
else: | |
bot_status = "Initialized but Not Running" | |
# Try getting info even if not 'running' (might be initializing/shutting down) | |
if ptb_app.bot: | |
try: bot_info = await ptb_app.bot.get_me(); bot_username = f"@{bot_info.username}" if bot_info and bot_info.username else "Info Fetch Error" | |
except (TimedOut, NetworkError, TelegramError) as bot_err: | |
logger.warning(f"Health check: Error getting bot info while not running: {bot_err}") | |
bot_username = "Info Fetch Error (Not Running)" | |
except Exception as e: # Catch broader exceptions during status check | |
bot_status = f"Error checking status: {type(e).__name__}"; logger.warning(f"Health check: General error getting bot status: {e}") | |
bot_username = "Error" | |
else: bot_status = "Not Initialized"; bot_username = "N/A" | |
# <<< Update response string with the NEW YT fallback order >>> | |
return PlainTextResponse( | |
f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n" | |
f"---\n" | |
f"Summarizer Priority (April 2025 - Updated):\n" | |
f"1. Gemini API: {GEMINI_FLASH_PREVIEW_MODEL if _gemini_api_enabled else 'DISABLED'} (NEW)\n" | |
f"2. Gemini API: {GEMINI_PRO_EXP_MODEL if _gemini_api_enabled else 'DISABLED'}\n" | |
f"3. Gemini API: {GEMINI_FLASH_MODEL if _gemini_api_enabled else 'DISABLED'}\n" | |
f"4. OpenRouter API: {OPENROUTER_DEEPSEEK_MODEL if _openrouter_fallback_enabled else 'DISABLED'}\n" | |
f"5. Groq API: {GROQ_LLAMA4_MODEL if _groq_enabled else 'DISABLED'} (Last Fallback)\n" | |
f"---\n" | |
f"Content Fetching Status:\n" | |
# --- YouTube transcript fallbacks ---
f"YT Primary (Lib): REMOVED\n" | |
f"YT Fallback 1 (Apify Default): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n" | |
f"YT Fallback 2 (Apify Structured): {APIFY_STRUCTURED_YT_ACTOR_ID if _apify_token_exists else 'DISABLED'}\n" | |
f"YT Fallback 3 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n" | |
# --- Website scrape fallbacks ---
f"Web Scrape 1 (Direct+BS4): Enabled\n" | |
f"Web Scrape 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n" | |
f"Web Scrape 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}\n" | |
f"Web Scrape 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled'}" | |
) | |
async def telegram_webhook(request: Request) -> Response: | |
"""Handles incoming updates from Telegram.""" | |
global WEBHOOK_SECRET, ptb_app # Ensure ptb_app is accessible | |
if not ptb_app: logger.error("Webhook received but PTB application not initialized."); return PlainTextResponse('Bot not initialized', status_code=503) | |
if not ptb_app.running: logger.warning("Webhook received but PTB application not running."); return PlainTextResponse('Bot not running, cannot process update', status_code=503) | |
if WEBHOOK_SECRET: | |
token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token") | |
if token_header != WEBHOOK_SECRET: logger.warning(f"Webhook received with invalid secret token. Header: '{token_header}'"); return Response(content="Invalid secret token", status_code=403) | |
try: | |
update_data = await request.json(); update = Update.de_json(data=update_data, bot=ptb_app.bot) | |
logger.debug(f"Processing update_id: {update.update_id} via webhook"); await ptb_app.process_update(update) | |
return Response(status_code=200) | |
except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400) | |
except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # Return 200 to TG even if processing fails, to avoid retries | |
# --- Starlette App Definition --- | |
app = Starlette( | |
debug=False, # Keep False for production/Hugging Face | |
lifespan=lifespan, | |
routes=[ | |
Route("/", endpoint=health_check, methods=["GET"]), | |
Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), | |
] | |
) | |
logger.info("Starlette ASGI application created with health check and webhook routes.") | |
# --- Development Server (if run directly) --- | |
if __name__ == '__main__': | |
import uvicorn | |
# google.api_core.exceptions is already imported (or flagged unavailable) at module top; no re-import needed here.
logger.warning("Running in development mode using Uvicorn directly - NOT for production!") | |
log_level = os.environ.get("LOGGING_LEVEL", "info").lower() | |
local_port = int(os.environ.get('PORT', 8080)) | |
uvicorn.run( "main:app", host='0.0.0.0', port=local_port, log_level=log_level, reload=True ) # "main:app" import string lets --reload re-import the module (this file is main.py); "__main__:app" breaks under the reloader
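# In production (e.g. behind an ASGI server on Hugging Face Spaces) the app
# would typically be served via an import string, for instance (command illustrative):
#   uvicorn main:app --host 0.0.0.0 --port 8080
# A quick smoke test once running: `curl http://localhost:8080/` returns the
# plain-text health summary defined in health_check().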