# NOTE(review): the three lines below were stray hosting-UI status text
# ("Spaces: / Running / Running") captured in a copy/paste; commented out
# so the module remains valid Python.
# Spaces:
# Running
# Running
# main.py (Modified to add crawl4ai and adjust fetching logic)

# --- Standard library ---
import asyncio
import contextlib
import html
import json
import logging
import os
import re
import traceback
from typing import Optional, Dict, Any, Tuple

# --- Frameworks ---
from starlette.applications import Starlette
from starlette.routing import Route
from starlette.responses import PlainTextResponse, JSONResponse, Response
from starlette.requests import Request

# --- Telegram Bot ---
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot
from telegram.ext import (
    Application,
    CommandHandler,
    MessageHandler,
    filters,
    ContextTypes,
    CallbackQueryHandler,
)
from telegram.constants import ParseMode
from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest, TelegramError
from telegram.request import HTTPXRequest, BaseRequest

# --- Other Libraries ---
import httpx
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, before_sleep_log
# Prefer the faster lxml parser for BeautifulSoup when it is installed;
# otherwise fall back to the stdlib html.parser.
try:
    import lxml  # probe only; the name itself is not used directly
except ImportError:
    DEFAULT_PARSER = 'html.parser'
else:
    DEFAULT_PARSER = 'lxml'
# --- Google Gemini --- | |
try: | |
import google.generativeai as genai | |
from google.generativeai.types import HarmCategory, HarmBlockThreshold | |
_gemini_available = True | |
except ImportError: | |
genai = None | |
HarmCategory = None | |
HarmBlockThreshold = None | |
_gemini_available = False | |
# logger will be defined later, log warning after logger setup | |
# --- Crawl4AI (New Primary Web Scraper) --- | |
try: | |
from crawl4ai import AsyncWebCrawler | |
_crawl4ai_available = True | |
except ImportError: | |
AsyncWebCrawler = None | |
_crawl4ai_available = False | |
# logger will be defined later | |
# --- Logging Setup ---
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Pin the verbosity of chatty third-party loggers in one place.
_LIB_LOG_LEVELS = {
    "httpx": logging.WARNING,
    "telegram.ext": logging.INFO,
    "telegram.bot": logging.INFO,
    "urllib3": logging.INFO,
    "gunicorn.error": logging.INFO,
    "uvicorn": logging.INFO,
    "starlette": logging.INFO,
}
for _lib_name, _lib_level in _LIB_LOG_LEVELS.items():
    logging.getLogger(_lib_name).setLevel(_lib_level)
if _gemini_available:
    logging.getLogger("google.ai.generativelanguage").setLevel(logging.WARNING)
# Reduce crawl4ai logging noise if needed
if _crawl4ai_available:
    logging.getLogger("crawl4ai").setLevel(logging.WARNING)  # Or INFO for more detail
    logging.getLogger("playwright").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.info(f"Logging configured. Using BS4 parser: {DEFAULT_PARSER}")
if not _gemini_available:
    logger.warning("google-generativeai library not found. Gemini functionality disabled.")
if not _crawl4ai_available:
    logger.warning("crawl4ai library not found. Primary website scraping will be disabled.")
# --- Global variable for PTB app ---
# Holds the python-telegram-bot Application once built; None until startup.
ptb_app: Optional[Application] = None
# --- Environment Variable Loading & Configuration ---
logger.info("Attempting to load secrets and configuration...")
def get_secret(secret_name: str) -> Optional[str]:
    """Read a secret from the environment and log its presence.

    Only the variable name and value length are logged -- the previous
    version logged the first 8 characters of each secret, leaking partial
    credentials into log output.

    Args:
        secret_name: Name of the environment variable to read.

    Returns:
        The value, or None when the variable is unset (an empty string is
        returned as-is but logged as "Not Found").
    """
    value = os.environ.get(secret_name)
    if value:
        logger.info(f"Secret '{secret_name}': Found (length: {len(value)})")
    else:
        logger.warning(f"Secret '{secret_name}': Not Found")
    return value
# Secrets; each get_secret call logs presence at import time.
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY') # Now Fallback
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY') # Fallback Web 2
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY') # Fallback YT 1
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN') # Fallback YT 2
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
GEMINI_API_KEY = get_secret('GEMINI_API_KEY') # Primary Summarizer
# Models (User can still configure via env vars)
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Summarizer Model
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001") # Primary Summarizer Model
# The Telegram token is the only hard requirement; everything else degrades
# to a fallback path or a disabled feature.
if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
if not GEMINI_API_KEY: logger.error("❌ ERROR: GEMINI_API_KEY not found. Primary summarization (Gemini) will fail.")
if not OPENROUTER_API_KEY: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback summarization will fail.")
# Feature flags derived from library availability plus key presence.
_gemini_primary_enabled = _gemini_available and bool(GEMINI_API_KEY)
if not _gemini_available: logger.warning("⚠️ WARNING: google-generativeai library missing. Gemini disabled.")
elif not GEMINI_API_KEY: logger.warning("⚠️ WARNING: GEMINI_API_KEY not found or empty. Gemini disabled.")
_openrouter_fallback_enabled = bool(OPENROUTER_API_KEY)
if not _openrouter_fallback_enabled: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback summarization disabled.")
_crawl4ai_primary_web_enabled = _crawl4ai_available
if not _crawl4ai_primary_web_enabled: logger.warning("⚠️ WARNING: crawl4ai library not found. Primary web scraping disabled.")
_bs4_fallback_web_enabled = True # Assumes bs4 is always available
_urltotext_fallback_web_enabled = bool(URLTOTEXT_API_KEY)
if not _urltotext_fallback_web_enabled: logger.info("ℹ️ INFO: URLTOTEXT_API_KEY not found. Secondary web fallback disabled.")
# Fallback YT checks
if not SUPADATA_API_KEY: logger.info("ℹ️ INFO: SUPADATA_API_KEY not found. First YT fallback disabled.")
_apify_token_exists = bool(APIFY_API_TOKEN)
if not _apify_token_exists: logger.info("ℹ️ INFO: APIFY_API_TOKEN not found. Second YT fallback disabled.")
if not WEBHOOK_SECRET: logger.info("Optional secret 'WEBHOOK_SECRET' not found. Webhook security disabled.")
logger.info("Secret loading and configuration check finished.")
# Startup summary of the effective scraping/summarization pipeline.
logger.info(f"Primary Web Scraper: {'Crawl4AI' if _crawl4ai_primary_web_enabled else 'DISABLED'}")
logger.info(f"Fallback Web Scraper 1: {'BeautifulSoup' if _bs4_fallback_web_enabled else 'DISABLED'}")
logger.info(f"Fallback Web Scraper 2: {'urltotext.com API' if _urltotext_fallback_web_enabled else 'DISABLED'}")
logger.info(f"Using Gemini Model (Primary Summarizer): {GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'}")
logger.info(f"Using OpenRouter Model (Fallback Summarizer): {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'}")
logger.info(f"Using Apify Actor (via REST): {APIFY_ACTOR_ID}")
# Configure the GenAI client eagerly; a failure demotes Gemini to disabled.
if _gemini_primary_enabled:
    try: genai.configure(api_key=GEMINI_API_KEY); logger.info("Google GenAI client configured successfully.")
    except Exception as e: logger.error(f"Failed to configure Google GenAI client: {e}"); _gemini_primary_enabled = False
# --- Bot Operation Error Wrapper (not a decorator; ignores benign Telegram errors) ---
async def retry_bot_operation(func, *args, **kwargs):
    """Await a bot API call, absorbing known-harmless BadRequest errors.

    Benign failures (stale callback queries, unmodified messages, blocked
    bots, missing chats/messages) are logged and converted to a None
    result; every other error is logged and re-raised for the caller.
    """
    try:
        return await func(*args, **kwargs)
    except BadRequest as e:
        ignore_errors = [
            "message is not modified",
            "query is too old",
            "message to edit not found",
            "chat not found",
            "bot was blocked by the user",
        ]
        lowered = str(e).lower()
        if any(err in lowered for err in ignore_errors):
            logger.warning(f"Ignoring non-critical BadRequest: {e}")
            return None
        logger.error(f"Potentially critical BadRequest: {e}")
        raise
    except TelegramError as e:
        logger.warning(f"TelegramError (will retry if applicable): {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error during bot operation: {e}", exc_info=True)
        raise
# --- Helper Functions --- | |
# Compiled once at import time instead of on every call (the pattern was
# previously rebuilt inside the function for each URL checked).
_IS_YT_URL_RE = re.compile(
    r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/'
    r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?'
    r'([\w-]{11})'
    r'(?:\S+)?',
    re.IGNORECASE,
)

def is_youtube_url(url):
    """Return True if *url* looks like a YouTube video URL (any host variant)."""
    match = _IS_YT_URL_RE.search(url)
    logger.debug(f"is_youtube_url '{url}': {bool(match)}")
    return bool(match)
# Compiled once at import time instead of on every call (the pattern was
# previously rebuilt inside the function for each URL processed).
_EXTRACT_YT_ID_RE = re.compile(
    r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/'
    r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?'
    r'([\w-]{11})'
    r'(?:\S+)?',
    re.IGNORECASE,
)

def extract_youtube_id(url):
    """Return the 11-character YouTube video ID from *url*, or None."""
    match = _EXTRACT_YT_ID_RE.search(url)
    if match:
        video_id = match.group(1)
        logger.debug(f"Extracted YT ID '{video_id}' from {url}")
        return video_id
    else:
        logger.warning(f"Could not extract YT ID from {url}")
        return None
# --- Content Fetching Functions --- | |
# --- YouTube Fetching (Unchanged) --- | |
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
    """Fetch a YouTube transcript from the Supadata API (Fallback YT 1).

    Args:
        video_id: 11-character YouTube video ID.
        api_key: Supadata API key, sent via the X-API-Key header.

    Returns:
        Whitespace-stripped transcript text, or None on any error
        (missing args, non-200 status, empty/invalid body, network failure).
    """
    if not video_id: logger.error("[Supadata] No video_id provided"); return None
    if not api_key: logger.error("[Supadata] API key missing."); return None
    logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}")
    api_endpoint = "https://api.supadata.ai/v1/youtube/transcript" # Corrected URL
    params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key}
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(api_endpoint, headers=headers, params=params)
            logger.debug(f"[Supadata] Status code {response.status_code} for {video_id}")
            if response.status_code == 200:
                try:
                    # The API may answer with JSON (under several possible
                    # keys) or with a plain-text body; probe both.
                    try: data = response.json()
                    except json.JSONDecodeError: data = None
                    content = None
                    if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
                    # Fall back to the raw body when JSON yielded nothing.
                    if not content and response.text: content = response.text
                    if content and isinstance(content, str): logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}"); return content.strip()
                    else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
                except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None
            elif response.status_code in [401, 403]: logger.error(f"[Supadata] Auth error ({response.status_code}). Check API key."); return None
            elif response.status_code == 404: logger.warning(f"[Supadata] Not found (404) for {video_id}."); return None
            else: logger.error(f"[Supadata] Unexpected status {response.status_code} for {video_id}. Resp: {response.text[:200]}"); return None
    except httpx.TimeoutException: logger.error(f"[Supadata] Timeout connecting for {video_id}"); return None
    except httpx.RequestError as e:
        # SSL verification failures surface as RequestError; log them distinctly.
        if "CERTIFICATE_VERIFY_FAILED" in str(e): logger.error(f"[Supadata] SSL Cert Verify Failed for {video_id}: {e}")
        else: logger.error(f"[Supadata] Request error for {video_id}: {e}")
        return None
    except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
    """Fetches YouTube transcript using Apify REST API (run-sync-get-dataset-items endpoint).

    Args:
        video_url: Full YouTube video URL handed to the actor.
        api_token: Apify API token, passed as the 'token' query parameter.

    Returns:
        Whitespace-stripped transcript text, or None on any error or when
        the actor's dataset is empty.
    """
    global APIFY_ACTOR_ID
    if not video_url: logger.error("[Apify SyncItems] No video_url provided"); return None
    if not api_token: logger.error("[Apify SyncItems] API token missing."); return None
    logger.info(f"[Apify SyncItems] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
    # This endpoint runs the actor synchronously and returns its dataset items
    # in a single HTTP round-trip.
    sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
    params = {"token": api_token}
    payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5, "channelHandleBoolean": False, "channelNameBoolean": False, "datePublishedBoolean": False, "relativeDateTextBoolean": False, }
    headers = {"Content-Type": "application/json"}
    try:
        # Generous timeout: the call blocks until the actor run completes.
        async with httpx.AsyncClient(timeout=120.0) as client:
            log_headers = {k: v for k, v in headers.items()}
            logger.debug(f"[Apify SyncItems] POST Request Details:\nURL: {sync_items_endpoint}\nParams: {params}\nHeaders: {log_headers}\nPayload: {json.dumps(payload)}")
            response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
            logger.debug(f"[Apify SyncItems] Received status code {response.status_code} for {video_url}")
            if response.status_code == 200:
                try:
                    results = response.json()
                    if isinstance(results, list) and len(results) > 0:
                        item = results[0]
                        content = None
                        # Actor versions expose the transcript under different
                        # keys; probe the known string-valued variants first.
                        if "captions" in item and isinstance(item["captions"], str): logger.info("[Apify SyncItems] Found 'captions' key with string content."); content = item["captions"]
                        elif "text" in item and isinstance(item["text"], str): logger.info("[Apify SyncItems] Found 'text' key with string content."); content = item["text"]
                        elif "transcript" in item and isinstance(item["transcript"], str): logger.info("[Apify SyncItems] Found 'transcript' key with string content."); content = item["transcript"]
                        elif "captions" in item and isinstance(item["captions"], list):
                            # Legacy list format: join dict 'text' fields or raw strings.
                            logger.warning("[Apify SyncItems] Received list format for 'captions' unexpectedly. Processing...")
                            if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
                            elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])
                        if content and isinstance(content, str): logger.info(f"[Apify SyncItems] Success via REST for {video_url}. Length: {len(content)}"); return content.strip()
                        else: logger.warning(f"[Apify SyncItems] Dataset item parsed but transcript content empty/invalid format for {video_url}. Item keys: {list(item.keys())}"); return None
                    else: logger.warning(f"[Apify SyncItems] Actor success but dataset was empty for {video_url}. Response: {results}"); return None
                except json.JSONDecodeError: logger.error(f"[Apify SyncItems] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}"); return None
                except Exception as e: logger.error(f"[Apify SyncItems] Error processing success response for {video_url}: {e}", exc_info=True); return None
            elif response.status_code == 400: logger.error(f"[Apify SyncItems] Bad Request (400) for {video_url}. Check payload. Resp:{response.text[:200]}"); return None
            elif response.status_code == 401: logger.error("[Apify SyncItems] Auth error (401). Check token."); return None
            elif response.status_code == 404:
                # Surface the API's own error message when present.
                error_info = "";
                try: error_info = response.json().get("error", {}).get("message", "")
                except Exception: pass
                logger.error(f"[Apify SyncItems] Endpoint/Actor Not Found (404). Error: '{error_info}' Resp:{response.text[:200]}");
                return None
            else: logger.error(f"[Apify SyncItems] Unexpected status {response.status_code} for {video_url}. Resp:{response.text[:200]}"); return None
    except httpx.TimeoutException as e: logger.error(f"[Apify SyncItems] Timeout during API interaction for {video_url}: {e}"); return None
    except httpx.HTTPStatusError as e: logger.error(f"[Apify SyncItems] HTTP Status Error during API interaction for {video_url}: {e}"); return None
    except httpx.RequestError as e: logger.error(f"[Apify SyncItems] Request error during API interaction for {video_url}: {e}"); return None
    except Exception as e: logger.error(f"[Apify SyncItems] Unexpected error during Apify SyncItems REST call for {video_url}: {e}", exc_info=True); return None
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
    """Fetch a YouTube transcript: library first, then Supadata, then Apify.

    Args:
        video_id: 11-character video ID (used by the library and Supadata).
        video_url: Full video URL (required by the Apify actor).

    Returns:
        The transcript text, or None when every method fails.
    """
    global SUPADATA_API_KEY, APIFY_API_TOKEN
    if not video_id:
        logger.error("get_youtube_transcript: No video_id")
        return None
    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
    transcript_text = None
    # --- Primary: youtube-transcript-api (no key required) ---
    logger.info("[Primary YT] Attempting youtube-transcript-api...")
    try:
        # get_transcript is blocking; run it off the event loop.
        transcript_list = await asyncio.to_thread(
            YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US']
        )
        if transcript_list:
            transcript_text = " ".join(item['text'] for item in transcript_list if 'text' in item)
        if transcript_text:
            logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})")
            return transcript_text
        logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}")
        transcript_text = None
    # Use the library's own exception types (imported at the top of the file)
    # instead of string-matching str(e), which broke on message changes.
    except NoTranscriptFound:
        logger.warning(f"[Primary YT] No transcript found for {video_id}.")
        transcript_text = None
    except TranscriptsDisabled:
        logger.warning(f"[Primary YT] Transcripts disabled for {video_id}.")
        transcript_text = None
    except Exception as e:
        logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}")
        transcript_text = None
    # --- Fallback 1: Supadata API (requires SUPADATA_API_KEY) ---
    if transcript_text is None:
        logger.info("[Fallback YT 1] Trying Supadata API...")
        if SUPADATA_API_KEY:
            transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
            if transcript_text:
                logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}")
                return transcript_text
            logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
        else:
            logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
    # --- Fallback 2: Apify actor via REST (requires APIFY_API_TOKEN) ---
    if transcript_text is None:
        logger.info("[Fallback YT 2] Trying Apify REST API (SyncItems)...")
        if APIFY_API_TOKEN:
            transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
            if transcript_text:
                logger.info(f"[Fallback YT 2] Success via Apify SyncItems REST for {video_url}")
                return transcript_text
            logger.warning(f"[Fallback YT 2] Apify SyncItems REST failed or no content for {video_url}.")
        else:
            logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
    if transcript_text is None:
        logger.error(f"All methods failed for YT transcript: {video_id}")
        return None
    return transcript_text
# --- Website Content Fetching --- | |
# NEW: Primary Method using Crawl4AI
async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
    """Fetches website content using Crawl4AI. Returns Markdown."""
    global _crawl4ai_primary_web_enabled
    if not _crawl4ai_primary_web_enabled:
        logger.warning("[Crawl4AI Primary] Called but disabled/unavailable.")
        return None
    if not url:
        logger.error("[Crawl4AI Primary] No URL provided")
        return None
    logger.info(f"[Crawl4AI Primary] Attempting fetch for: {url}")
    try:
        # Headless browser for server environments; 'async with' guarantees
        # the crawler's resources are released even on failure.
        async with AsyncWebCrawler(headless=True) as crawler:
            # A timeout could be passed here if needed, e.g.
            # crawler_params={"timeout": 60000} (milliseconds).
            result = await crawler.arun(url=url)
            if not result:
                logger.warning(f"[Crawl4AI Primary] Crawl4AI returned no result object for {url}.")
                return None
            if not result.markdown:
                logger.warning(f"[Crawl4AI Primary] Crawl4AI ran but returned empty markdown for {url}.")
                return None
            logger.info(f"[Crawl4AI Primary] Success via Crawl4AI for {url}. Markdown length: {len(result.markdown)}")
            return result.markdown.strip()
    except ImportError:
        # Library vanished at runtime; disable this path for the session.
        logger.error("[Crawl4AI Primary] Import Error - library might be missing.")
        _crawl4ai_primary_web_enabled = False
        return None
    except Exception as e:
        # Covers Playwright errors, timeouts, and anything else.
        logger.error(f"[Crawl4AI Primary] Error during Crawl4AI execution for {url}: {e}", exc_info=True)
        return None
# HELPER: Used by Fallback 1 (BS4)
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
    """Download *url* and return its HTML body text, or None on any failure.

    Sends browser-like headers, follows redirects, and rejects non-HTML
    content types so the BS4 fallback never parses binary responses.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
    }
    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
            logger.debug(f"[Web Scrape Helper] Sending request to {url}")
            response = await client.get(url)
            logger.debug(f"[Web Scrape Helper] Received response {response.status_code} from {url}")
            response.raise_for_status()
            content_type = response.headers.get('content-type', '').lower()
            if 'html' not in content_type:
                logger.warning(f"[Web Scrape Helper] Non-HTML content type from {url}: {content_type}")
                return None
            try:
                return response.text
            except Exception as e:
                logger.error(f"[Web Scrape Helper] Error decoding response for {url}: {e}")
                return None
    except httpx.HTTPStatusError as e:
        logger.error(f"[Web Scrape Helper] HTTP error {e.response.status_code} fetching {url}: {e}")
    except httpx.TimeoutException:
        logger.error(f"[Web Scrape Helper] Timeout error fetching {url}")
    except httpx.TooManyRedirects:
        logger.error(f"[Web Scrape Helper] Too many redirects fetching {url}")
    except httpx.RequestError as e:
        logger.error(f"[Web Scrape Helper] Request error fetching {url}: {e}")
    except Exception as e:
        logger.error(f"[Web Scrape Helper] Unexpected error fetching {url}: {e}", exc_info=True)
    return None
# Fallback 1: Direct BS4 Scraping (Renamed original function)
async def get_website_content_via_bs4(url: str) -> Optional[str]:
    """Fetches and extracts text content using BeautifulSoup (Fallback 1).

    Downloads the page via fetch_url_content_for_scrape, strips boilerplate
    elements, then returns the visible text of the main-content container
    (falling back to <body>). Returns None on any failure or when no text
    survives cleaning.
    """
    global _bs4_fallback_web_enabled
    if not _bs4_fallback_web_enabled:
        logger.warning("[BS4 Fallback] Called but disabled.") # Should not happen unless manually disabled
        return None
    if not url: logger.error("[BS4 Fallback] No URL provided"); return None
    logger.info(f"[BS4 Fallback] Fetching website content for: {url}")
    html_content = await fetch_url_content_for_scrape(url) # Use the helper
    if not html_content:
        logger.warning(f"[BS4 Fallback] fetch_url_content_for_scrape failed for {url}")
        return None
    try:
        def parse_html(content):
            # CPU-bound parse; executed via asyncio.to_thread below so the
            # event loop is not blocked.
            soup = BeautifulSoup(content, DEFAULT_PARSER)
            # Keep the existing cleaning logic
            for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "iframe", "img", "svg", "link", "meta", "noscript", "figure"]): element.extract()
            # Prefer a semantic main-content container; fall back to <body>.
            main_content = soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(class_='content') or soup.find(id='main-content') or soup.find(class_='main-content') or soup.find(role='main')
            target_element = main_content if main_content else soup.body
            if not target_element: logger.warning(f"[BS4 Fallback] Could not find body/main for parsing {url}"); return None
            # Collapse the visible text into one space-separated string.
            lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
            text = " ".join(lines)
            if not text: logger.warning(f"[BS4 Fallback] Extracted text empty after clean for {url}"); return None
            return text
        text_content = await asyncio.to_thread(parse_html, html_content)
        if text_content: logger.info(f"[BS4 Fallback] Success scrape for {url} (final len: {len(text_content)})"); return text_content
        else:
            logger.warning(f"[BS4 Fallback] parse_html returned None for {url}")
            return None
    except Exception as e: logger.error(f"[BS4 Fallback] Error scraping/parsing {url}: {e}", exc_info=True); return None
# Fallback 2: urltotext.com API (Unchanged function)
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
    """Fetches website content using urltotext.com API (Fallback 2).

    Args:
        url: Page to extract text from.
        api_key: urltotext.com token (sent as 'Authorization: Token ...').

    Returns:
        The extracted main-content text, or None on any error.
    """
    global _urltotext_fallback_web_enabled
    if not _urltotext_fallback_web_enabled:
        logger.warning("[urltotext API Fallback] Called but disabled (no API key).")
        return None
    if not url: logger.error("[urltotext API Fallback] No URL"); return None
    if not api_key: logger.error("[urltotext API Fallback] urltotext.com API key missing."); return None # Redundant check but safe
    logger.info(f"[urltotext API Fallback] Attempting fetch for: {url} using urltotext.com API")
    api_endpoint = "https://urltotext.com/api/v1/urltotext/"
    # render_javascript=True so JS-heavy pages are rendered server-side.
    payload = { "url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False }
    headers = { "Authorization": f"Token {api_key}", "Content-Type": "application/json" }
    try:
        async with httpx.AsyncClient(timeout=45.0) as client:
            logger.debug(f"[urltotext API Fallback] Sending request to urltotext.com API for {url}")
            response = await client.post(api_endpoint, headers=headers, json=payload)
            logger.debug(f"[urltotext API Fallback] Received status {response.status_code} from urltotext.com API for {url}")
            if response.status_code == 200:
                try:
                    data = response.json()
                    # Content lives under data.content; warnings/credits are informational.
                    content = data.get("data", {}).get("content"); credits = data.get("credits_used", "N/A"); warning = data.get("data", {}).get("warning")
                    if warning: logger.warning(f"[urltotext API Fallback] urltotext.com API Warning for {url}: {warning}")
                    if content: logger.info(f"[urltotext API Fallback] Success via urltotext.com API for {url}. Len: {len(content)}. Credits: {credits}"); return content.strip()
                    else: logger.warning(f"[urltotext API Fallback] urltotext.com API success but content empty for {url}. Resp: {data}"); return None
                except json.JSONDecodeError: logger.error(f"[urltotext API Fallback] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None
                except Exception as e: logger.error(f"[urltotext API Fallback] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None
            elif response.status_code in [400, 401, 402, 403, 422, 500]: logger.error(f"[urltotext API Fallback] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
            else: logger.error(f"[urltotext API Fallback] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
    except httpx.TimeoutException: logger.error(f"[urltotext API Fallback] Timeout connecting to urltotext.com API for {url}"); return None
    except httpx.RequestError as e: logger.error(f"[urltotext API Fallback] Request error connecting to urltotext.com API for {url}: {e}"); return None
    except Exception as e: logger.error(f"[urltotext API Fallback] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
# --- Summarization Functions (Unchanged) --- | |
async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
    """Internal function to call Gemini API. Returns (summary, error_message).

    Args:
        text: Source text to summarise (truncated when over the input limit).
        summary_type: "paragraph" for a single paragraph; any other value
            selects the headed bullet-point format.

    Returns:
        (summary, None) on success, or (None, user-facing error string)
        when Gemini is disabled, the request/response is blocked, or the
        call fails.
    """
    global GEMINI_MODEL, _gemini_primary_enabled
    if not _gemini_primary_enabled:
        logger.error("[Gemini Primary] Called but is disabled.");
        return None, "Error: Primary AI service (Gemini) not configured/available."
    logger.info(f"[Gemini Primary] Generating {summary_type} summary using {GEMINI_MODEL}. Input length: {len(text)}")
    # Define prompts (Keep existing prompts)
    if summary_type == "paragraph":
        prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n"
                  "• Clear and simple language suitable for someone unfamiliar with the topic.\n"
                  "• Uses British English spellings throughout.\n"
                  "• Straightforward and understandable vocabulary; avoid complex terms.\n"
                  "• Presented as ONE SINGLE PARAGRAPH.\n"
                  "• No more than 85 words maximum; but does not have to be exactly 85.\n"
                  "• Considers the entire text content equally.\n"
                  "• Uses semicolons (;) instead of em dashes (– or —).\n"
                  "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" # Added instruction
                  "Here is the text to summarise:")
    else: # points summary
        prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this format:\n\n"
                  "• For each distinct topic or section identified in the text, create a heading.\n"
                  "• Each heading MUST be plain text without any formatting (e.g., Section Title).\n"
                  "• Immediately following each heading, list the key points as a bulleted list.\n"
                  "• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n"
                  "• The text within each bullet point should NOT contain any bold formatting.\n"
                  "• IMPORTANT: Never use bold formatting (double asterisks) within the text of the bullet points themselves.\n"
                  "• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n"
                  "• Use British English spellings throughout.\n"
                  "• Avoid overly complex or advanced vocabulary.\n"
                  "• Keep bullet points concise.\n"
                  "• Ensure the entire summary takes no more than two minutes to read.\n"
                  "• Consider the entire text's content, not just the beginning or a few topics.\n"
                  "• Use semicolons (;) instead of em dashes (– or —).\n"
                  "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" # Added instruction
                  "Here is the text to summarise:")
    # Input Length Check (Gemini-specific limits if known, otherwise use a large default)
    MAX_INPUT_LENGTH_GEMINI = 1000000 # Default limit for Gemini
    if len(text) > MAX_INPUT_LENGTH_GEMINI:
        logger.warning(f"[Gemini Primary] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating.");
        text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)"
    full_prompt = f"{prompt}\n\n{text}"
    # Safety Settings (Block None)
    safety_settings = {
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        # CIVIC_INTEGRITY does not exist in all library versions; getattr keeps
        # this compatible (the None key is filtered out just below).
        getattr(HarmCategory, 'HARM_CATEGORY_CIVIC_INTEGRITY', None): HarmBlockThreshold.BLOCK_NONE
    }
    safety_settings = {k: v for k, v in safety_settings.items() if k is not None}
    logger.debug(f"[Gemini Primary] Using safety settings: {safety_settings}")
    try:
        logger.debug(f"[Gemini Primary] Initializing model {GEMINI_MODEL}")
        model = genai.GenerativeModel(GEMINI_MODEL)
        logger.info(f"[Gemini Primary] Sending request to Gemini ({GEMINI_MODEL})...")
        request_options = {"timeout": 120} # Generous timeout for Gemini
        response = await model.generate_content_async( full_prompt, safety_settings=safety_settings, request_options=request_options )
        logger.info("[Gemini Primary] Received response from Gemini.")
        # Check for immediate blocking reasons
        if response.prompt_feedback.block_reason:
            logger.warning(f"[Gemini Primary] Request blocked by API. Reason: {response.prompt_feedback.block_reason}");
            return None, f"Sorry, the primary AI model ({GEMINI_MODEL}) blocked the request (Reason: {response.prompt_feedback.block_reason})."
        # Check candidate-level blocking
        for cand in response.candidates:
            if cand.finish_reason == 'SAFETY':
                logger.warning(f"[Gemini Primary] Candidate blocked due to SAFETY. Ratings: {cand.safety_ratings}")
                return None, f"Sorry, the primary AI model ({GEMINI_MODEL}) blocked the response due to safety filters."
        # Try to get text, handle potential ValueError if blocked
        try:
            summary = response.text
        except ValueError as e:
            # response.text raises ValueError when content was blocked/empty.
            logger.warning(f"[Gemini Primary] Error accessing response text (likely blocked content): {e}");
            summary = None
        if summary:
            logger.info(f"[Gemini Primary] Success generating summary. Output len: {len(summary)}");
            return summary.strip(), None
        else:
            finish_reason = response.candidates[0].finish_reason if response.candidates else 'N/A'
            logger.warning(f"[Gemini Primary] Gemini returned empty summary or content was blocked. Finish reason: {finish_reason}");
            return None, f"Sorry, the primary AI model ({GEMINI_MODEL}) did not provide a summary (Finish Reason: {finish_reason})."
    except Exception as e:
        logger.error(f"[Gemini Primary] Unexpected error during Gemini API call: {e}", exc_info=True);
        return None, f"Sorry, an unexpected error occurred while using the primary AI service ({GEMINI_MODEL})."
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
    """Internal function to call OpenRouter API (Fallback). Returns (summary, error_message).

    Exactly one element of the returned tuple is non-None: the stripped summary
    text on success, or a user-facing error string on any failure (disabled
    service, HTTP error status, unparsable payload, timeout, connection error).
    """
    global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
    # Guard: caller should have checked the flag, but fail safe with a user-facing error.
    if not _openrouter_fallback_enabled:
        logger.error("[OpenRouter Fallback] Called but is disabled.");
        return None, "Error: Fallback AI service (OpenRouter) not configured/available."
    logger.info(f"[OpenRouter Fallback] Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
    # Define prompts (Keep existing prompts)
    if summary_type == "paragraph":
        prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n"
                  "• Clear and simple language suitable for someone unfamiliar with the topic.\n"
                  "• Uses British English spellings throughout.\n"
                  "• Straightforward and understandable vocabulary; avoid complex terms.\n"
                  "• Presented as ONE SINGLE PARAGRAPH.\n"
                  "• No more than 85 words maximum; but does not have to be exactly 85.\n"
                  "• Considers the entire text content equally.\n"
                  "• Uses semicolons (;) instead of em dashes (– or —).\n"
                  "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" # Added instruction
                  "Here is the text to summarise:")
    else: # points summary
        prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this format:\n\n"
                  "• For each distinct topic or section identified in the text, create a heading.\n"
                  "• Each heading MUST be plain text without any formatting (e.g., Section Title).\n"
                  "• Immediately following each heading, list the key points as a bulleted list.\n"
                  "• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n"
                  "• The text within each bullet point should NOT contain any bold formatting.\n"
                  "• IMPORTANT: Never use bold formatting (double asterisks) within the text of the bullet points themselves.\n"
                  "• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n"
                  "• Use British English spellings throughout.\n"
                  "• Avoid overly complex or advanced vocabulary.\n"
                  "• Keep bullet points concise.\n"
                  "• Ensure the entire summary takes no more than two minutes to read.\n"
                  "• Consider the entire text's content, not just the beginning or a few topics.\n"
                  "• Use semicolons (;) instead of em dashes (– or —).\n"
                  "• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n" # Added instruction
                  "Here is the text to summarise:")
    # Input Length Check (OpenRouter-specific limit)
    MAX_INPUT_LENGTH_OR = 500000
    if len(text) > MAX_INPUT_LENGTH_OR:
        logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_OR}). Truncating.");
        text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
    full_prompt = f"{prompt}\n\n{text}"
    headers = { "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json" }
    payload = { "model": OPENROUTER_MODEL, "messages": [{"role": "user", "content": full_prompt}] }
    openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
    # Use reasonable timeouts for fallback
    api_timeouts = httpx.Timeout(25.0, read=20.0, write=25.0, pool=60.0)
    response = None
    try:
        async with httpx.AsyncClient(timeout=api_timeouts) as client:
            logger.info(f"[OpenRouter Fallback] Sending request to OpenRouter ({OPENROUTER_MODEL}) with read timeout {api_timeouts.read}s...")
            response = await client.post(openrouter_api_endpoint, headers=headers, json=payload)
        # NOTE(review): `if response:` relies on httpx.Response truthiness — confirm it is
        # always truthy once assigned; if httpx ever bases __bool__ on status, 4xx/5xx
        # replies would be misreported as "No response" here.
        if response: logger.info(f"[OpenRouter Fallback] Received response from OpenRouter. Status code: {response.status_code}")
        else: logger.error("[OpenRouter Fallback] No response from OpenRouter (unexpected)."); return None, "Sorry, fallback AI service failed unexpectedly."
        if response.status_code == 200:
            try:
                data = response.json()
                # Expected shape: {"choices": [{"message": {"content": "..."}}]}
                if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0:
                    message = data["choices"][0].get("message")
                    if message and isinstance(message, dict):
                        summary = message.get("content")
                        if summary:
                            logger.info(f"[OpenRouter Fallback] Success via OpenRouter. Output len: {len(summary)}")
                            return summary.strip(), None
                        else:
                            logger.warning(f"[OpenRouter Fallback] OpenRouter success but content empty. Resp: {data}")
                            return None, "Sorry, the fallback AI model returned an empty summary."
                    else:
                        logger.error(f"[OpenRouter Fallback] Unexpected message structure: {message}. Full: {data}")
                        return None, "Sorry, could not parse fallback AI response (format)."
                else:
                    logger.error(f"[OpenRouter Fallback] Unexpected choices structure: {data.get('choices')}. Full: {data}")
                    return None, "Sorry, could not parse fallback AI response (choices)."
            except json.JSONDecodeError:
                logger.error(f"[OpenRouter Fallback] Failed JSON decode OpenRouter. Status:{response.status_code}. Resp:{response.text[:500]}")
                return None, "Sorry, failed to understand fallback AI response."
            except Exception as e:
                logger.error(f"[OpenRouter Fallback] Error processing OpenRouter success response: {e}", exc_info=True)
                return None, "Sorry, error processing fallback AI response."
        # Handle specific error codes
        elif response.status_code == 401: logger.error("[OpenRouter Fallback] API key invalid (401)."); return None, "Error: Fallback AI model configuration key is invalid."
        elif response.status_code == 402: logger.error("[OpenRouter Fallback] Payment Required (402)."); return None, "Sorry, fallback AI service limits/payment issue."
        elif response.status_code == 429: logger.warning("[OpenRouter Fallback] Rate Limit Exceeded (429)."); return None, "Sorry, fallback AI model is busy. Try again."
        elif response.status_code == 500: logger.error(f"[OpenRouter Fallback] Internal Server Error (500). Resp:{response.text[:500]}"); return None, "Sorry, fallback AI service internal error."
        else:
            # Any other status: best-effort extraction of the API's own error message for the log.
            error_info = "";
            try: error_info = response.json().get("error", {}).get("message", "")
            except Exception: pass
            logger.error(f"[OpenRouter Fallback] Unexpected status {response.status_code}. Error: '{error_info}' Resp:{response.text[:500]}");
            return None, f"Sorry, fallback AI service returned unexpected status ({response.status_code})."
    except httpx.TimeoutException as e:
        logger.error(f"[OpenRouter Fallback] Timeout error ({type(e)}) connecting/reading from OpenRouter API: {e}")
        return None, f"Sorry, the fallback AI service ({OPENROUTER_MODEL}) timed out."
    except httpx.RequestError as e:
        logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}")
        return None, "Sorry, there was an error connecting to the fallback AI model service."
    except Exception as e:
        logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter call: {e}", exc_info=True)
        return None, "Sorry, an unexpected error occurred while using the fallback AI service."
async def generate_summary(text: str, summary_type: str) -> str:
    """
    Generate a summary of `text`, trying Gemini first and OpenRouter second.

    Always returns a string: either the summary itself, or a user-facing
    error message describing why both services failed/were unavailable.
    """
    global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
    logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})")
    error_message: Optional[str] = None

    # --- Attempt 1: Gemini (Primary) ---
    if _gemini_primary_enabled:
        logger.info(f"[Summary Generation] Attempting primary AI: Gemini ({GEMINI_MODEL})")
        summary, error_message = await _call_gemini(text, summary_type)
        if summary:
            # Prompt handles formatting; nothing more to do.
            logger.info(f"[Summary Generation] Success with primary AI (Gemini).")
            return summary
        logger.warning(f"[Summary Generation] Primary AI (Gemini) failed or returned unusable result. Error: {error_message}. Proceeding to fallback.")
    else:
        logger.warning("[Summary Generation] Primary AI (Gemini) is disabled or unavailable. Proceeding directly to fallback.")
        error_message = "Primary AI (Gemini) unavailable."  # Set initial error message

    # --- Attempt 2: OpenRouter (Fallback) ---
    if not _openrouter_fallback_enabled:
        logger.error("[Summary Generation] Fallback AI (OpenRouter) is disabled or unavailable. Cannot proceed.")
        if error_message:
            return f"{error_message} Fallback AI is also unavailable."
        return "Error: Both primary and fallback AI services are unavailable."

    logger.info(f"[Summary Generation] Attempting fallback AI: OpenRouter ({OPENROUTER_MODEL})")
    fallback_summary, fallback_error = await _call_openrouter(text, summary_type)
    if fallback_summary:
        logger.info(f"[Summary Generation] Success with fallback AI (OpenRouter).")
        return fallback_summary

    logger.error(f"[Summary Generation] Fallback AI (OpenRouter) also failed. Error: {fallback_error}")
    # Combine the primary error (if any) with the fallback's failure reason.
    if error_message:
        return f"{error_message} Fallback AI ({OPENROUTER_MODEL}) also failed: {fallback_error}"
    return f"Fallback AI ({OPENROUTER_MODEL}) failed: {fallback_error}"
# --- Main Task Processing (Modified Web Fetching Logic) --- | |
async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
    """Background task: fetch content for `url` (YouTube transcript or web page),
    summarise it, and deliver the result to `chat_id`.

    Runs detached from the PTB update pipeline, so it builds its own `Bot`
    instance from `bot_token` with generous timeouts. Progress is surfaced by
    editing `message_id_to_edit` when possible, otherwise via a fresh status
    message; whichever status message exists is deleted in the `finally`
    block regardless of outcome.
    """
    task_id = f"{user_id}-{message_id_to_edit or 'new'}"; logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
    background_request: Optional[BaseRequest] = None; bot: Optional[Bot] = None
    try: background_request = HTTPXRequest( connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0 ); bot = Bot(token=bot_token, request=background_request)
    except Exception as e: logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True); return
    content: Optional[str] = None
    user_feedback_message: Optional[str] = None
    success = False
    status_message_id = message_id_to_edit
    message_to_delete_later_id : Optional[int] = None
    try:
        # Send initial "Processing..." message (or edit existing)
        processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nThis might take a moment..."
        if status_message_id:
            try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
            except Exception as e: logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new."); status_message_id = None
        if not status_message_id:
            try:
                status_message = await retry_bot_operation( bot.send_message, chat_id=chat_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN )
                if status_message: message_to_delete_later_id = status_message.message_id; logger.debug(f"[Task {task_id}] Sent new status message {message_to_delete_later_id}")
                else: raise RuntimeError("Failed to send status message after retries.")
            except Exception as e: logger.error(f"[Task {task_id}] CRITICAL: Failed to send new status message: {e}"); raise
        try:
            await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
            is_youtube = is_youtube_url(url)
            logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}")
            # --- YouTube Processing (Unchanged) ---
            if is_youtube:
                video_id = extract_youtube_id(url)
                if video_id:
                    content = await get_youtube_transcript(video_id, url) # Tries lib -> Supadata -> Apify
                else:
                    user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
                if not content and not user_feedback_message:
                    user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
            # --- Website Processing (NEW Logic) ---
            # Three-stage fallback chain: each stage runs only if `content` is still empty.
            else:
                # Method 1: Crawl4AI (Primary)
                if _crawl4ai_primary_web_enabled:
                    logger.info(f"[Task {task_id}] Trying primary web method: Crawl4AI")
                    await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
                    content = await get_website_content_via_crawl4ai(url)
                    if content:
                        logger.info(f"[Task {task_id}] Success via Crawl4AI for {url} (len: {len(content)})")
                    else:
                        logger.warning(f"[Task {task_id}] Crawl4AI failed or returned empty for {url}.")
                else:
                    logger.warning(f"[Task {task_id}] Crawl4AI is disabled. Skipping.")
                # Method 2: BeautifulSoup (Fallback 1)
                if not content and _bs4_fallback_web_enabled:
                    logger.warning(f"[Task {task_id}] Trying fallback web method 1: BeautifulSoup")
                    await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
                    content = await get_website_content_via_bs4(url) # Use the renamed function
                    if content:
                        logger.info(f"[Task {task_id}] Success via BS4 scrape for {url} (len: {len(content)})")
                    else:
                        logger.warning(f"[Task {task_id}] BS4 scrape failed or returned empty for {url}.")
                # Method 3: urltotext.com API (Fallback 2)
                if not content and _urltotext_fallback_web_enabled:
                    logger.warning(f"[Task {task_id}] Trying fallback web method 2: urltotext.com API")
                    await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
                    content = await get_website_content_via_api(url, URLTOTEXT_API_KEY) # API key checked inside function
                    if content:
                        logger.info(f"[Task {task_id}] Success via urltotext.com API for {url} (len: {len(content)})")
                    else:
                        logger.warning(f"[Task {task_id}] urltotext.com API failed or returned empty for {url}.")
                # Final check if any website method succeeded
                if not content:
                    methods_tried = []
                    if _crawl4ai_primary_web_enabled: methods_tried.append("Crawl4AI")
                    if _bs4_fallback_web_enabled: methods_tried.append("BS4")
                    if _urltotext_fallback_web_enabled: methods_tried.append("API")
                    tried_str = ", ".join(methods_tried) if methods_tried else "configured methods"
                    user_feedback_message = f"Sorry, I couldn't fetch content from that website using any available method ({tried_str}). It might be blocked, inaccessible, or empty."
            # --- Summarization ---
            if content:
                logger.info(f"[Task {task_id}] Content fetched (len:{len(content)}). Generating summary.")
                await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
                # NOTE: `content` might be Markdown (from Crawl4AI) or plain text (from others).
                # The LLM prompts should handle this reasonably well.
                final_summary = await generate_summary(content, summary_type) # Calls Gemini -> OpenRouter
                # generate_summary signals failure by returning a string that
                # begins with "Error:" or "Sorry," — forwarded to the user as-is.
                if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
                    user_feedback_message = final_summary # Pass AI error message to user
                    logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
                else:
                    # Split on Telegram's 4096-character message limit.
                    max_length = 4096; summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
                    # Send summary without explicit Markdown parsing, assuming LLM followed instructions
                    # for plain headings and standard bullet points. Using parse_mode=None.
                    await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} )
                    for part in summary_parts[1:]:
                        await asyncio.sleep(0.5)
                        await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} )
                    success = True
                    logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
                    user_feedback_message = None # Clear any previous fetch error if summary succeeded
            # --- Send Feedback if Fetching Failed ---
            elif user_feedback_message:
                logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}")
                await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} )
        except Exception as e:
            logger.error(f"[Task {task_id}] Unexpected error during processing: {e}", exc_info=True)
            user_feedback_message = "Oops! Something went really wrong while processing your request. Please try again later."
            try: await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message )
            except Exception: logger.error(f"[Task {task_id}] Failed to send unexpected error feedback.")
    except Exception as outer_e:
        logger.critical(f"[Task {task_id}] Critical outer error: {outer_e}", exc_info=True)
        try:
            if bot: await retry_bot_operation( bot.send_message, chat_id=chat_id, text="❌ Critical internal error occurred processing your request." )
        except Exception: logger.exception(f"[Task {task_id}] Failed even to send critical error message.")
    finally:
        # Delete the "Processing..." or button message
        delete_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id
        if delete_target_id and bot:
            try: await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=delete_target_id); logger.debug(f"[Task {task_id}] Deleted status/button message {delete_target_id}")
            except Exception as del_e: logger.warning(f"[Task {task_id}] Failed to delete status/button message {delete_target_id}: {del_e}")
        # Close the background bot's HTTP client
        # NOTE(review): reaches into HTTPXRequest's private `_client` attribute;
        # may break across PTB versions — confirm whether a public shutdown API exists.
        if background_request and hasattr(background_request, '_client') and background_request._client:
            try: await background_request._client.aclose(); logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.")
            except Exception as e: logger.warning(f"[Task {task_id}] Error closing background bot's client: {e}")
        logger.info(f"[Task {task_id}] Task completed. Success: {success}")
# --- Bot Handlers (Unchanged) --- | |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle /start: greet the user with an HTML mention.

    Guards against updates without an effective user or message before
    touching any user attributes.
    """
    user = update.effective_user
    # Bug fix: previously `user.mention_html()` was evaluated before this
    # guard, raising AttributeError whenever effective_user was None.
    if not user or not update.message:
        return
    logger.info(f"User {user.id} used /start.")
    await update.message.reply_html(
        f"👋 Hello {user.mention_html()}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!"
    )
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle /help: send the usage instructions as Markdown."""
    user = update.effective_user
    if user is None or update.message is None:
        return
    logger.info(f"User {user.id} used /help.")
    help_text = (
        "🔍 How to use this bot:\n\n"
        "1. Send me any YouTube video link or website URL.\n"
        "2. I'll ask you how you want it summarised (paragraph or points).\n"
        "3. Click the button for your choice.\n"
        "4. Wait for the summary!\n\n"
        "I'll try multiple methods to get content if the first one fails (especially for YouTube transcripts).\n\n"
        "Commands:\n"
        "`/start` - Display welcome message\n"
        "`/help` - Show this help message"
    )
    await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Treat an incoming text message as a candidate URL and offer summary-type buttons."""
    message = update.message
    if message is None or not message.text:
        return
    candidate = message.text.strip()
    user = update.effective_user
    if user is None:
        return
    # Minimal validation: http(s) scheme plus at least one dot past the scheme prefix.
    looks_like_url = candidate.startswith(('http://', 'https://')) and '.' in candidate[8:]
    if not looks_like_url:
        logger.debug(f"Ignoring non-URL message from {user.id}")
        return
    logger.info(f"User {user.id} sent potential URL: {candidate}")
    # Stash the URL (and originating message id) so the button callback can retrieve it.
    context.user_data['url_to_summarize'] = candidate
    context.user_data['original_message_id'] = message.message_id
    buttons = [[
        InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"),
        InlineKeyboardButton("Points Summary", callback_data="points"),
    ]]
    await message.reply_text(
        f"Okay, I see this link:\n{candidate}\n\nHow would you like it summarised?",
        reply_markup=InlineKeyboardMarkup(buttons),
        disable_web_page_preview=True,
    )
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle a summary-type button press: validate stored context, then spawn
    the background summarisation task.

    The URL is read (and then cleared) from `context.user_data`; if it is
    missing — e.g. the user pressed a stale button after a restart — the user
    is asked to resend the link. The work itself runs in
    `process_summary_task` via `asyncio.create_task`, so this handler returns
    immediately.
    """
    query = update.callback_query
    if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
    user = query.from_user; summary_type = query.data; query_id = query.id
    # Acknowledge the callback promptly so the client stops showing a spinner.
    try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id}")
    except Exception as e: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True) # Log but continue
    url = context.user_data.get('url_to_summarize')
    message_id_to_edit = query.message.message_id # The message with the buttons
    logger.info(f"User {user.id} chose '{summary_type}' for msg {message_id_to_edit}. URL in context: {'Yes' if url else 'No'}")
    if not url:
        logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Might be an old button.")
        try: await query.edit_message_text(text="Sorry, I couldn't find the URL associated with this request. Please send the link again.")
        except Exception as e:
            logger.error(f"Failed to edit 'URL not found' message: {e}")
            # Attempt to send a new message as a fallback
            try: await context.bot.send_message(chat_id=user.id, text="Sorry, the context for your previous request seems to have expired. Please send the link again.")
            except Exception as send_e: logger.error(f"Failed even to send new message about lost context: {send_e}")
        return
    # Clear context *after* checking it exists
    context.user_data.pop('url_to_summarize', None)
    context.user_data.pop('original_message_id', None) # Clear original ID too
    logger.debug(f"Cleared URL context for user {user.id}")
    # Check critical configurations before scheduling task
    global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
    if not TELEGRAM_TOKEN:
        logger.critical("TELEGRAM_TOKEN missing! Cannot schedule task.")
        try: await query.edit_message_text(text="❌ Bot configuration error (Token). Task cannot be started.")
        except Exception: pass # Ignore if edit fails
        return
    if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
        logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid! Cannot schedule task.")
        try: await query.edit_message_text(text="❌ AI configuration error: No summarization models available. Task cannot be started.")
        except Exception: pass # Ignore if edit fails
        return
    elif not _gemini_primary_enabled:
        logger.warning("Primary AI (Gemini) is unavailable, will rely on fallback for summarization.")
        # User will be notified by generate_summary if fallback also fails
    elif not _openrouter_fallback_enabled:
        logger.warning("Fallback AI (OpenRouter) is unavailable for summarization.")
        # User will be notified by generate_summary if primary fails
    # Schedule the background task
    logger.info(f"Scheduling task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit}")
    # NOTE(review): the Task reference is not stored; asyncio keeps only a weak
    # reference to running tasks, so consider retaining it to guarantee completion.
    asyncio.create_task(
        process_summary_task(
            user_id=user.id,
            chat_id=query.message.chat_id,
            message_id_to_edit=message_id_to_edit, # Pass the button message ID to edit/delete
            url=url,
            summary_type=summary_type,
            bot_token=TELEGRAM_TOKEN
        ),
        name=f"SummaryTask-{user.id}-{message_id_to_edit}"
    )
    # Don't edit the message here; the task will handle it immediately.
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Global PTB error handler: log unexpected exceptions from update processing.

    One known-noisy case (AttributeError mentioning a missing attribute) is
    demoted to debug level instead of being logged as an error.
    """
    err = context.error
    if isinstance(err, AttributeError) and "object has no attribute" in str(err):
        logger.debug(f"Ignoring known/handled error in error_handler: {err}")
        return
    logger.error("Exception while handling an update:", exc_info=err)
# --- Application Setup (Unchanged) --- | |
async def setup_bot_config() -> Application:
    """Build the PTB Application: custom HTTPX timeouts plus all handlers.

    Raises ValueError when TELEGRAM_TOKEN is not configured.
    """
    global TELEGRAM_TOKEN
    logger.info("Configuring Telegram Application...")
    if not TELEGRAM_TOKEN:
        raise ValueError("TELEGRAM_TOKEN missing.")
    # Give PTB's HTTP client explicit timeouts rather than library defaults.
    request = HTTPXRequest(connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0)
    application = Application.builder().token(TELEGRAM_TOKEN).request(request).build()
    # Handlers, registered in the original order: commands, then the
    # catch-all text handler, then the inline-button callback handler.
    for handler in (
        CommandHandler("start", start),
        CommandHandler("help", help_command),
        MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url),
        CallbackQueryHandler(handle_summary_type_callback),
    ):
        application.add_handler(handler)
    application.add_error_handler(error_handler)
    logger.info("Telegram application handlers configured.")
    return application
# --- ASGI Lifespan & Webhook (Unchanged) --- | |
async def lifespan(app: Starlette):
    """Starlette lifespan generator: initialise the bot and Telegram webhook on
    startup, yield while the app serves, and tear the bot down on shutdown.

    Startup order: build/initialise the PTB app, delete any stale webhook,
    derive the public URL from SPACE_HOST, set the new webhook (optionally
    with a secret token), verify it, then start PTB. Any failure is re-raised
    so the ASGI server aborts; cleanup happens in both except and finally.
    """
    global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN
    logger.info("ASGI Lifespan: Startup initiated...");
    if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.")
    try:
        ptb_app = await setup_bot_config(); await ptb_app.initialize(); bot_info = await ptb_app.bot.get_me(); logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})")
        # Webhook setup logic
        current_webhook_info = await ptb_app.bot.get_webhook_info()
        if current_webhook_info and current_webhook_info.url:
            logger.info(f"Found existing webhook: {current_webhook_info.url}. Deleting...")
            try:
                if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Webhook deleted.")
                else: logger.warning("Failed delete webhook (API returned False).")
            except Exception as e: logger.warning(f"Could not delete webhook: {e}"); await asyncio.sleep(1) # Brief pause before setting new one
        space_host = os.environ.get("SPACE_HOST") # Provided by Hugging Face Spaces
        webhook_path = "/webhook" # Matches the Starlette route below
        full_webhook_url = None
        if space_host:
            protocol = "https"
            host = space_host.split('://')[-1] # Remove potential protocol prefix from env var
            full_webhook_url = f"{protocol}://{host.rstrip('/')}{webhook_path}"
        if full_webhook_url:
            logger.info(f"Setting webhook: {full_webhook_url}")
            set_webhook_args: Dict[str, Any] = {
                "url": full_webhook_url,
                "allowed_updates": Update.ALL_TYPES,
                "drop_pending_updates": True
            }
            if WEBHOOK_SECRET:
                set_webhook_args["secret_token"] = WEBHOOK_SECRET
                logger.info("Using webhook secret token.")
            await asyncio.sleep(1.0) # Short delay before setting webhook
            try:
                await ptb_app.bot.set_webhook(**set_webhook_args)
                webhook_info = await ptb_app.bot.get_webhook_info() # Verify
                if webhook_info.url == full_webhook_url:
                    logger.info(f"Webhook set successfully: URL='{webhook_info.url}', Secret Configured={bool(WEBHOOK_SECRET)}")
                else:
                    logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got '{webhook_info.url}'. Check SPACE_HOST env var.")
                    raise RuntimeError("Webhook URL mismatch after setting.")
                await ptb_app.start() # Start listening for updates via webhook
                logger.info("PTB Application started (webhook mode).")
            except Exception as e:
                logger.critical(f"FATAL: Failed to set webhook: {e}", exc_info=True)
                raise RuntimeError(f"Failed to set webhook: {e}") from e
        else:
            logger.critical("Could not construct webhook URL. SPACE_HOST environment variable might be missing or invalid.")
            raise RuntimeError("Webhook URL could not be determined.")
        logger.info("ASGI Lifespan: Startup complete."); yield # Application runs here
    except Exception as startup_err:
        logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
        # Ensure cleanup even if startup fails midway
        if ptb_app:
            if ptb_app.running: await ptb_app.stop()
            await ptb_app.shutdown()
        raise # Reraise the exception to stop the ASGI server
    finally: # Shutdown phase
        logger.info("ASGI Lifespan: Shutdown initiated...")
        if ptb_app:
            if ptb_app.running:
                logger.info("Stopping PTB application...")
                await ptb_app.stop()
            logger.info("Shutting down PTB application...")
            await ptb_app.shutdown()
            logger.info("PTB Application shut down.")
        else:
            logger.info("PTB application was not initialized or failed during startup.")
        logger.info("ASGI Lifespan: Shutdown complete.")
async def health_check(request: Request) -> PlainTextResponse:
    """Report bot, web-scraper, summariser, and YouTube-fallback status as plain text."""
    bot_status = "Not Initialized"
    if ptb_app and ptb_app.bot:
        try:
            if ptb_app.running:
                me = await ptb_app.bot.get_me()
                bot_status = f"Running (@{me.username})"
            else:
                bot_status = "Initialized/Not running"
        except Exception as e:
            bot_status = f"Error checking status: {e}"
    # Assemble the report line by line; blank entries become the paragraph breaks.
    report = [
        f"TG Bot Summariser - Status: {bot_status}",
        "",
        f"Web Primary: {'Crawl4AI' if _crawl4ai_primary_web_enabled else 'DISABLED'}",
        f"Web Fallback 1: {'BS4' if _bs4_fallback_web_enabled else 'DISABLED'}",
        f"Web Fallback 2: {'API' if _urltotext_fallback_web_enabled else 'DISABLED'}",
        "",
        f"Summarizer Primary: {GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'}",
        f"Summarizer Fallback: {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'}",
        "",
        f"YT Fallback 1: {'Supadata' if SUPADATA_API_KEY else 'DISABLED'}",
        f"YT Fallback 2: {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED'}",
    ]
    return PlainTextResponse("\n".join(report))
async def telegram_webhook(request: Request) -> Response:
    """Receive a Telegram webhook POST and feed the update into PTB.

    Validates the optional ``X-Telegram-Bot-Api-Secret-Token`` header against
    WEBHOOK_SECRET (constant-time compare), parses the JSON payload into a
    ``telegram.Update``, and hands it to the PTB application.

    Args:
        request: Incoming Starlette request carrying the Telegram update.

    Returns:
        200 on success (or on processing errors — see below), 400 for bad
        JSON, 403 for a bad secret token, 503 if the bot isn't ready.
    """
    import hmac  # local import: constant-time secret comparison below

    # NOTE: no `global WEBHOOK_SECRET` needed — it is only read here.
    if not ptb_app:
        logger.error("Webhook received but PTB application is not initialized.")
        return PlainTextResponse('Bot not initialized', status_code=503)  # Service Unavailable
    if not ptb_app.running:
        logger.warning("Webhook received but PTB application is not running.")
        return PlainTextResponse('Bot not running', status_code=503)  # Service Unavailable
    try:
        # Validate secret token if configured. Use a constant-time comparison
        # so an attacker cannot derive the secret via response timing.
        if WEBHOOK_SECRET:
            token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
            if not hmac.compare_digest(token_header or "", WEBHOOK_SECRET):
                logger.warning(f"Webhook received with invalid secret token. Header: '{token_header}'")
                return Response(content="Invalid secret token", status_code=403)  # Forbidden
        # Process the update
        update_data = await request.json()
        update = Update.de_json(data=update_data, bot=ptb_app.bot)
        logger.debug(f"Processing update_id: {update.update_id} via webhook")
        await ptb_app.process_update(update)
        return Response(status_code=200)  # OK - Tell Telegram we received it
    except json.JSONDecodeError:
        logger.error("Webhook received invalid JSON payload.")
        return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
    except Exception as e:
        logger.error(f"Error processing webhook update: {e}", exc_info=True)
        # Still return OK to Telegram to prevent retries for potentially poison-pill updates,
        # but log the error for debugging.
        return Response(status_code=200)
# --- ASGI App Definition ---
# Two endpoints: a health probe at "/" and the Telegram webhook at "/webhook".
_routes = [
    Route("/", endpoint=health_check, methods=["GET"]),
    Route("/webhook", endpoint=telegram_webhook, methods=["POST"]),
]
app = Starlette(
    routes=_routes,
    lifespan=lifespan,
    debug=False,  # never enable debug in production
)
logger.info("Starlette ASGI application created with health check and webhook routes.")
# --- Direct Run (for local testing, unchanged) --- | |
# --- Direct Run (local testing only) ---
if __name__ == '__main__':
    import uvicorn
    logger.warning("Running in development mode using Uvicorn directly (not for production)")
    # PORT is supplied by hosting platforms (e.g. HF Spaces); fall back to 8080 locally.
    local_port = int(os.environ.get('PORT', 8080))
    log_level = os.environ.get("LOGGING_LEVEL", "info").lower()
    # Without SPACE_HOST set locally, the webhook setup in the lifespan will
    # just log errors; for real webhook tests, tunnel with ngrok (or similar)
    # and set SPACE_HOST manually, or temporarily disable the webhook logic.
    uvicorn.run(
        "__main__:app",
        host='0.0.0.0',
        port=local_port,
        log_level=log_level,
        reload=True,
    )