# (Hugging Face Spaces status banner "Spaces: Running" — UI residue, not part of the source)
# main.py (Revised for Hugging Face - Corrected Logs & Added Debugging)
import os
import re
import logging
import asyncio
import json
from flask import Flask, request, Response  # For web server
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
from telegram.ext import (
    Application,
    CommandHandler,
    MessageHandler,
    filters,
    ContextTypes,
    CallbackQueryHandler
)
from telegram.constants import ParseMode  # Import ParseMode explicitly
# Import specific libraries (Ensure these are covered in requirements.txt)
from youtube_transcript_api import YouTubeTranscriptApi
import requests
from bs4 import BeautifulSoup
# Only import ApifyClient if you might use it (i.e., have the secret)
# Check environment variable existence *before* conditional import
_apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN')) | |
if _apify_token_exists: | |
from apify_client import ApifyClient | |
else: | |
ApifyClient = None # Define it as None if not used, to avoid errors later | |
# Apply nest_asyncio early, can help prevent event loop conflicts in web frameworks | |
import nest_asyncio | |
nest_asyncio.apply() | |
# --- Logging Setup --- | |
logging.basicConfig( | |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', | |
level=logging.INFO # Set to INFO to see processing steps, DEBUG for finer detail | |
) | |
# Reduce noise from libraries | |
logging.getLogger("httpx").setLevel(logging.WARNING) | |
if ApifyClient: # Only set level if imported | |
logging.getLogger("apify_client").setLevel(logging.WARNING) | |
logging.getLogger("telegram.ext").setLevel(logging.INFO) | |
logging.getLogger('telegram.bot').setLevel(logging.INFO) | |
logging.getLogger("urllib3").setLevel(logging.INFO) # From requests | |
logging.getLogger('gunicorn.error').setLevel(logging.INFO) # Gunicorn logs | |
logger = logging.getLogger(__name__) | |
logger.info("Logging configured.") | |
# --- Environment Variable Loading & Debugging ---
logger.info("Attempting to load secrets from environment variables...")
def get_secret(secret_name): | |
"""Reads secret and logs the attempt and result.""" | |
logger.debug(f"Attempting to read secret: {secret_name}") | |
value = os.environ.get(secret_name) | |
if value: | |
logger.info(f"Secret '{secret_name}': Found (Value length: {len(value)})") # Log length, not value itself | |
else: | |
logger.warning(f"Secret '{secret_name}': Not Found") | |
return value | |
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN') | |
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY') | |
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY') # Will be None if not set in Secrets | |
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY') # Will be None if not set in Secrets | |
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN') # Will be None if not set in Secrets | |
logger.info("Secret loading attempt finished.") | |
# --- Initial Validation --- | |
# Perform validation *after* attempting to load all secrets | |
if not TELEGRAM_TOKEN: | |
logger.critical("FATAL: TELEGRAM_TOKEN environment variable not found or empty!") | |
# In a deployed environment, exiting might not be helpful. Log critical error. | |
# exit("Telegram Token Missing") | |
else: | |
logger.info("TELEGRAM_TOKEN seems present.") | |
if not OPENROUTER_API_KEY: | |
# Log error but allow running, summaries will just fail gracefully later | |
logger.error("OPENROUTER_API_KEY not found or empty! Summarization will fail.") | |
else: | |
logger.info("OPENROUTER_API_KEY seems present.") | |
# (Optional checks log warnings if keys were not found by get_secret) | |
# --- Bot Logic Functions (Simplified Version - No Crawl4AI) ---
# (Functions: is_youtube_url, extract_youtube_id, get_transcript_via_supadata,
# get_transcript_via_apify, get_youtube_transcript, get_website_content_via_requests,
# get_website_content_via_urltotext_api, generate_summary follow below.)
# Helper Functions
def is_youtube_url(url): | |
"""Checks if the URL is a valid YouTube video or shorts URL.""" | |
youtube_regex = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|shorts/)|youtu\.be/)([\w-]{11})' | |
return bool(re.search(youtube_regex, url)) | |
def extract_youtube_id(url): | |
"""Extracts the YouTube video ID from a URL.""" | |
youtube_id_regex = r'(?:youtube\.com/(?:watch\?v=|shorts/)|youtu\.be/)([\w-]{11})' | |
match = re.search(youtube_id_regex, url) | |
if match: | |
return match.group(1) | |
logger.warning(f"Could not extract YouTube ID from URL: {url}") | |
return None | |
# Supadata Transcript Fetching | |
async def get_transcript_via_supadata(video_id: str, api_key: str): | |
"""Fetches YouTube transcript via Supadata API.""" | |
if not video_id: logger.error("[Supadata] get_transcript_via_supadata called with no video_id"); return None | |
if not api_key: logger.error("[Supadata] API key is missing."); return None # Already checked before calling | |
logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}") | |
api_endpoint = f"https://api.supadata.net/v1/youtube/transcript" | |
params = {"videoId": video_id, "format": "text"} | |
headers = {"X-API-Key": api_key} | |
try: | |
# Use asyncio.to_thread to run blocking requests.get in a separate thread | |
response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30) | |
logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}") | |
if response.status_code == 200: | |
try: | |
data = response.json() | |
content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data") | |
if content and isinstance(content, str): | |
logger.info(f"[Supadata] Successfully fetched transcript for {video_id}. Length: {len(content)}") | |
return content.strip() | |
else: | |
logger.warning(f"[Supadata] API success but content empty/invalid for {video_id}. Response: {data}") | |
return None | |
except json.JSONDecodeError: # Handle cases where API might return plain text on success | |
if response.text: | |
logger.info(f"[Supadata] Successfully fetched transcript (plain text) for {video_id}. Length: {len(response.text)}") | |
return response.text.strip() | |
else: | |
logger.error(f"[Supadata] Failed to decode JSON response (and no text body) for {video_id}. Response: {response.text[:200]}...") | |
return None | |
except Exception as e: | |
logger.error(f"[Supadata] Error processing successful response for {video_id}: {e}", exc_info=True) | |
return None | |
elif response.status_code in [401, 403]: | |
logger.error(f"[Supadata] Authentication error ({response.status_code}). Check API key.") | |
return None # Don't retry if key is bad | |
elif response.status_code == 404: | |
logger.warning(f"[Supadata] Transcript not found ({response.status_code}) for {video_id}.") | |
return None | |
else: | |
logger.error(f"[Supadata] Unexpected status code {response.status_code} for {video_id}. Response: {response.text[:200]}...") | |
return None | |
except requests.exceptions.Timeout: | |
logger.error(f"[Supadata] Timeout error connecting to API for {video_id}") | |
return None | |
except requests.exceptions.RequestException as e: | |
logger.error(f"[Supadata] Request error connecting to API for {video_id}: {e}") | |
return None | |
except Exception as e: | |
logger.error(f"[Supadata] Unexpected error during API call for {video_id}: {e}", exc_info=True) | |
return None | |
# Apify Transcript Fetching | |
async def get_transcript_via_apify(video_url: str, api_token: str): | |
"""Fetches YouTube transcript via Apify API.""" | |
if not video_url: logger.error("[Apify] get_transcript_via_apify called with no video_url"); return None | |
if not api_token: logger.error("[Apify] API token is missing."); return None # Already checked | |
if not ApifyClient: logger.error("[Apify] ApifyClient not available/imported."); return None | |
logger.info(f"[Apify] Attempting fetch for URL: {video_url}") | |
actor_id = "karamelo~youtube-transcripts" | |
api_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items" | |
params = {"token": api_token} | |
payload = json.dumps({ | |
"urls": [video_url], | |
"outputFormat": "singleStringText", | |
"maxRetries": 5, | |
"channelHandleBoolean": False, | |
"channelNameBoolean": False, | |
"datePublishedBoolean": False, | |
"relativeDateTextBoolean": False, | |
}) | |
headers = {"Content-Type": "application/json"} | |
try: | |
logger.debug(f"[Apify] Sending request to run actor {actor_id} synchronously for {video_url}") | |
response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, params=params, data=payload, timeout=90) # Longer timeout for actor run | |
logger.debug(f"[Apify] Received status code {response.status_code} for {video_url}") | |
if response.status_code == 200: | |
try: | |
results = response.json() | |
if isinstance(results, list) and len(results) > 0: | |
item = results[0] | |
content = item.get("text") or item.get("transcript") or item.get("captions_concatenated") | |
if not content and item.get("captions") and isinstance(item["captions"], list): # Handle 'captions' format if primary keys fail | |
logger.info("[Apify] Processing 'captions' format.") | |
content = " ".join(cap.get("text", "") for cap in item["captions"] if cap.get("text")) | |
if content and isinstance(content, str): | |
logger.info(f"[Apify] Successfully fetched transcript for {video_url}. Length: {len(content)}") | |
return content.strip() | |
else: | |
logger.warning(f"[Apify] Actor run successful but transcript content not found/empty in result for {video_url}. Result item: {item}") | |
return None | |
else: | |
logger.warning(f"[Apify] Actor run successful but dataset was empty for {video_url}. Response: {results}") | |
return None | |
except json.JSONDecodeError: | |
logger.error(f"[Apify] Failed to decode JSON response for {video_url}. Status: {response.status_code}. Response text: {response.text[:200]}...") | |
return None | |
except Exception as e: | |
logger.error(f"[Apify] Error processing successful response for {video_url}: {e}", exc_info=True) | |
return None | |
elif response.status_code == 400: | |
logger.error(f"[Apify] Bad Request (400) for {video_url}. Check input payload. Response: {response.text[:200]}...") | |
return None | |
elif response.status_code == 401: | |
logger.error("[Apify] Authentication error (401). Check API token.") | |
return None # Don't retry if token is bad | |
else: | |
logger.error(f"[Apify] Unexpected status code {response.status_code} for {video_url}. Response: {response.text[:200]}...") | |
return None | |
except requests.exceptions.Timeout: | |
logger.error(f"[Apify] Timeout error running actor for {video_url}") | |
return None | |
except requests.exceptions.RequestException as e: | |
logger.error(f"[Apify] Request error running actor for {video_url}: {e}") | |
return None | |
except Exception as e: | |
logger.error(f"[Apify] Unexpected error during Apify call for {video_url}: {e}", exc_info=True) | |
return None | |
# Combined YouTube Transcript Function (with Fallbacks) | |
async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: str | None, apify_token: str | None): | |
"""Fetches YouTube transcript using library, then Supadata, then Apify.""" | |
if not video_id: logger.error("get_youtube_transcript called with no video_id"); return None | |
logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})") | |
transcript_text = None | |
# 1. Primary Method: youtube-transcript-api | |
logger.info("[Primary YT] Attempting youtube-transcript-api...") | |
try: | |
# Run synchronous library call in a thread | |
transcript_list = await asyncio.to_thread( | |
YouTubeTranscriptApi.get_transcript, | |
video_id, | |
languages=['en', 'en-GB', 'en-US'] # Prioritize English variations | |
) | |
if transcript_list: | |
transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item]) | |
if transcript_text: | |
logger.info(f"[Primary YT] Successfully fetched transcript via library for {video_id} (length: {len(transcript_text)})") | |
return transcript_text # Return immediately on success | |
else: | |
logger.warning(f"[Primary YT] Joined transcript text is empty for {video_id}") | |
transcript_text = None # Ensure it's None if empty after join | |
else: | |
logger.warning(f"[Primary YT] Transcript list empty for {video_id}") | |
transcript_text = None | |
except Exception as e: | |
logger.warning(f"[Primary YT] Error getting transcript via library for {video_id}: {e}") | |
if "No transcript found" in str(e): logger.warning(f"[Primary YT] No transcript found for {video_id}. May be unavailable/private.") | |
elif "disabled" in str(e): logger.warning(f"[Primary YT] Transcripts disabled for {video_id}.") | |
transcript_text = None # Ensure it's None on error | |
# 2. Fallback 1: Supadata API | |
if transcript_text is None: | |
logger.info("[Fallback YT 1] Primary method failed. Trying Supadata API...") | |
if supadata_key: | |
transcript_text = await get_transcript_via_supadata(video_id, supadata_key) | |
if transcript_text: | |
logger.info(f"[Fallback YT 1] Successfully fetched transcript via Supadata for {video_id}") | |
return transcript_text # Return on success | |
else: | |
logger.warning(f"[Fallback YT 1] Supadata API failed or returned no content for {video_id}.") | |
else: | |
logger.warning("[Fallback YT 1] Supadata API key not available. Skipping.") | |
# 3. Fallback 2: Apify API | |
if transcript_text is None: | |
logger.info("[Fallback YT 2] Primary & Supadata failed. Trying Apify API...") | |
if apify_token: | |
transcript_text = await get_transcript_via_apify(video_url, apify_token) | |
if transcript_text: | |
logger.info(f"[Fallback YT 2] Successfully fetched transcript via Apify for {video_url}") | |
return transcript_text # Return on success | |
else: | |
logger.warning(f"[Fallback YT 2] Apify API failed or returned no content for {video_url}.") | |
else: | |
logger.warning("[Fallback YT 2] Apify API token not available. Skipping.") | |
# If all methods failed | |
if transcript_text is None: | |
logger.error(f"All methods failed to fetch transcript for video ID: {video_id}") | |
return None | |
# Should not be reached if logic above is correct, but as a safeguard | |
return transcript_text | |
# Website Content via Requests/BS4 (Primary Method for Simplified Bot) | |
async def get_website_content_via_requests(url): | |
"""Attempts to scrape website content using requests/BeautifulSoup (Primary Method).""" | |
if not url: logger.error("[Web Scraper - Requests/BS4] called with no URL"); return None | |
logger.info(f"[Web Scraper - Requests/BS4] Fetching website content for: {url}") | |
try: | |
headers = { | |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', # Updated UA | |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', | |
'Accept-Language': 'en-US,en;q=0.9', | |
'Connection': 'keep-alive', | |
'DNT': '1', # Do Not Track | |
'Upgrade-Insecure-Requests': '1' | |
} | |
logger.debug(f"[Web Scraper - Requests/BS4] Sending request to {url}") | |
# Run blocking I/O in a separate thread | |
response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True) | |
response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx) | |
logger.debug(f"[Web Scraper - Requests/BS4] Received response {response.status_code} from {url}") | |
content_type = response.headers.get('content-type', '').lower() | |
if 'html' not in content_type: | |
logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received from {url}: {content_type}") | |
return None # Don't try to parse non-html | |
# Use html.parser, it's built-in | |
soup = BeautifulSoup(response.text, 'html.parser') | |
# Remove common unwanted tags more aggressively | |
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]): | |
element.extract() | |
# Try finding common main content containers | |
main_content = soup.find('main') or \ | |
soup.find('article') or \ | |
soup.find(id='content') or \ | |
soup.find(class_='content') or \ | |
soup.find(id='main-content') or \ | |
soup.find(class_='main-content') or \ | |
soup.find(role='main') | |
# Fallback to body if no specific container found | |
target_element = main_content if main_content else soup.body | |
if not target_element: | |
logger.warning(f"[Web Scraper - Requests/BS4] Could not find body or main content container for parsing {url}") | |
return None # Nothing to parse | |
# Get text, joining lines smartly | |
lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()] | |
text = "\n".join(lines) # Join with newlines to preserve some structure | |
# Basic length check | |
if not text or len(text) < 50: # Arbitrary short length check | |
logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is very short or empty after cleaning for {url} (Length: {len(text)})") | |
# Consider returning None if too short, depends on use case | |
# return None | |
logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped content for {url} (final length: {len(text)})") | |
return text | |
except requests.exceptions.Timeout: | |
logger.error(f"[Web Scraper - Requests/BS4] Timeout error scraping website: {url}") | |
return None | |
except requests.exceptions.TooManyRedirects: | |
logger.error(f"[Web Scraper - Requests/BS4] Too many redirects error scraping website: {url}") | |
return None | |
except requests.exceptions.RequestException as e: | |
# This catches ConnectTimeout, HTTPError, ConnectionError etc. | |
logger.error(f"[Web Scraper - Requests/BS4] Request error scraping website {url}: {e}") | |
return None | |
except Exception as e: | |
# Catch-all for unexpected errors during parsing etc. | |
logger.error(f"[Web Scraper - Requests/BS4] Error scraping or parsing website {url}: {e}", exc_info=True) | |
return None | |
# Website Content via URLToText API (Fallback Method) | |
async def get_website_content_via_urltotext_api(url: str, api_key: str): | |
"""Fetches website content using the URLToText API (Fallback).""" | |
if not url: logger.error("[Web Scraper - URLToText API] called with no URL"); return None | |
if not api_key: logger.error("[Web Scraper - URLToText API] API key is missing."); return None # Already checked | |
logger.info(f"[Web Scraper - URLToText API] Attempting to fetch content for: {url}") | |
api_endpoint = "https://urltotext.com/api/v1/urltotext/" | |
payload = json.dumps({ | |
"url": url, | |
"output_format": "text", | |
"extract_main_content": True, | |
"render_javascript": True, # Often needed for modern sites | |
"residential_proxy": False, # Start with standard | |
}) | |
headers = { | |
"Authorization": f"Token {api_key}", | |
"Content-Type": "application/json" | |
} | |
try: | |
logger.debug(f"[Web Scraper - URLToText API] Sending request for {url}") | |
response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=45) | |
logger.debug(f"[Web Scraper - URLToText API] Received status code {response.status_code} for {url}") | |
if response.status_code == 200: | |
try: | |
data = response.json() | |
content = data.get("data", {}).get("content") | |
credits = data.get("credits_used", "N/A") | |
warning = data.get("data", {}).get("warning") | |
if warning: logger.warning(f"[Web Scraper - URLToText API] Warning for {url}: {warning}") | |
if content: | |
logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API for {url}. Length: {len(content)}. Credits: {credits}") | |
return content.strip() | |
else: | |
logger.warning(f"[Web Scraper - URLToText API] API returned success but content was empty for {url}. Response: {data}") | |
return None | |
except json.JSONDecodeError: | |
logger.error(f"[Web Scraper - URLToText API] Failed to decode JSON response for {url}. Response: {response.text[:500]}...") | |
return None | |
except Exception as e: | |
logger.error(f"[Web Scraper - URLToText API] Error processing successful API response for {url}: {e}", exc_info=True) | |
return None | |
elif response.status_code in [400, 402, 422, 500]: # Known client/server errors | |
logger.error(f"[Web Scraper - URLToText API] Error {response.status_code} from API for {url}. Response: {response.text[:200]}...") | |
return None | |
else: # Other unexpected codes | |
logger.error(f"[Web Scraper - URLToText API] Unexpected status {response.status_code} from API for {url}. Response: {response.text[:200]}...") | |
return None | |
except requests.exceptions.Timeout: | |
logger.error(f"[Web Scraper - URLToText API] Timeout error connecting to API for {url}") | |
return None | |
except requests.exceptions.RequestException as e: | |
logger.error(f"[Web Scraper - URLToText API] Request error connecting to API for {url}: {e}") | |
return None | |
except Exception as e: | |
logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call for {url}: {e}", exc_info=True) | |
return None | |
# DeepSeek Summary Function (via OpenRouter)
async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
    """Generate a summary of *text* via DeepSeek on the OpenRouter API.

    Args:
        text: Source content to summarise; truncated when extremely long.
        summary_type: "paragraph" for a single paragraph, anything else
            yields the bulleted points format.
        api_key: OpenRouter API key.

    Returns:
        The summary text on success, or a human-readable error string
        intended to be shown directly to the Telegram user. This function
        never raises; every failure is mapped to a message string.
    """
    logger.info(f"Generating {summary_type} summary using DeepSeek/OpenRouter. Input text length: {len(text)}")
    if not api_key:
        logger.error("OpenRouter API key was not provided to generate_summary.")
        return "Error: AI model configuration key (OpenRouter) is missing."
    openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
    # Check OpenRouter docs for the latest recommended free/low-cost models
    model_name = "deepseek/deepseek-chat:free"
    if summary_type == "paragraph":
        prompt = "You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be: • Clear and simple language suitable for someone unfamiliar with the topic. • Uses British English spellings throughout. • Straightforward and understandable vocabulary; avoid complex terms. • Presented as ONE SINGLE PARAGRAPH. • No more than 85 words maximum; but does not have to be exactly 85. • Considers the entire text content equally. • Uses semicolons (;) instead of em dashes (– or —). Here is the text to summarise:"
    else:  # points summary
        prompt = """You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this Markdown format:
• For each distinct topic or section identified in the text, create a heading.
• Each heading MUST be enclosed in double asterisks for bolding (e.g., **Section Title**).
• Immediately following each heading, list the key points as a bulleted list.
• Each bullet point MUST start with a hyphen and a space (`- `) on a new line.
• The text within each bullet point should NOT contain any bold formatting.
• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.
• Use British English spellings throughout.
• Avoid overly complex or advanced vocabulary.
• Keep bullet points concise.
• Ensure the entire summary takes no more than two minutes to read.
• Consider the entire text's content, not just the beginning or a few topics.
• Use semicolons (;) instead of em dashes (– or —).
Here is the text to summarise:"""
    MAX_INPUT_LENGTH = 500000  # Truncate long inputs to avoid high costs/errors
    if len(text) > MAX_INPUT_LENGTH:
        logger.warning(f"Input text length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH}). Truncating.")
        text = text[:MAX_INPUT_LENGTH] + "... (Content truncated)"
    full_prompt = f"{prompt}\n\n{text}"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        # Recommended headers for OpenRouter identification
        "HTTP-Referer": "https://huggingface.co/spaces/",  # Identify source as HF Space
        "X-Title": "Telegram Summary Bot (HF Space)",  # Identify app
    }
    payload = json.dumps({
        "model": model_name,
        "messages": [
            {"role": "user", "content": full_prompt}
        ],
        # Optional: Add max_tokens if needed, check model defaults
        # "max_tokens": 1024,
    })
    try:
        logger.debug(f"Sending request to OpenRouter ({model_name})...")
        # Run blocking request in thread
        response = await asyncio.to_thread(requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=60)
        logger.debug(f"Received status code {response.status_code} from OpenRouter.")
        if response.status_code == 200:
            try:
                data = response.json()
                if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0:
                    message = data["choices"][0].get("message")
                    if message and isinstance(message, dict):
                        summary = message.get("content")
                        if summary:
                            logger.info(f"Successfully generated summary via OpenRouter. Output length: {len(summary)}")
                            return summary.strip()
                        else:
                            logger.warning(f"OpenRouter response successful, but content was empty. Response: {data}")
                            return "Sorry, the AI model returned an empty summary."
                    else:
                        logger.error(f"Unexpected message structure in OpenRouter response: {message}. Full response: {data}")
                        return "Sorry, could not parse the AI model's response (unexpected format)."
                else:
                    # Handle cases like moderation flags, empty choices list
                    if data.get("error"): logger.error(f"OpenRouter API Error: {data['error']}")
                    else: logger.error(f"Unexpected choices structure in OpenRouter response: {data.get('choices')}. Full response: {data}")
                    return "Sorry, could not parse the AI model's response (choices missing/invalid or API error)."
            except json.JSONDecodeError:
                logger.error(f"Failed to decode JSON response from OpenRouter. Status: {response.status_code}. Response text: {response.text[:500]}...")
                return "Sorry, failed to understand the response from the AI model."
            except Exception as e:
                logger.error(f"Error processing successful OpenRouter response: {e}", exc_info=True)
                return "Sorry, an error occurred while processing the AI model's response."
        elif response.status_code == 401:
            logger.error("OpenRouter API key is invalid (401 Unauthorized). Check HF Space Secrets.")
            return "Error: The AI model configuration key (OpenRouter) is invalid."
        elif response.status_code == 402:
            logger.error("OpenRouter Payment Required (402). Check credits/limits on OpenRouter.")
            return "Sorry, there might be an issue with the AI model service limits or payment. Please try again later or check OpenRouter account."
        elif response.status_code == 429:
            logger.warning("OpenRouter Rate Limit Exceeded (429).")
            return "Sorry, the AI model is currently busy due to high demand. Please try again in a moment."
        elif response.status_code == 500:
            logger.error(f"OpenRouter Internal Server Error (500). Response: {response.text[:500]}...")
            return "Sorry, the AI model service encountered an internal error. Please try again later."
        else:
            # Handle other potential errors (e.g., 400 Bad Request, 404 Not Found for model)
            logger.error(f"Unexpected status code {response.status_code} from OpenRouter. Response: {response.text[:500]}...")
            try:  # Try to parse error message from response body
                error_data = response.json()
                error_msg = error_data.get("error", {}).get("message", response.text[:100])
                return f"Sorry, the AI model service returned an error ({response.status_code}): {error_msg}"
            except Exception:  # FIX: was a bare 'except:', which would also trap SystemExit/KeyboardInterrupt
                return f"Sorry, the AI model service returned an unexpected status ({response.status_code})."
    except requests.exceptions.Timeout:
        logger.error("Timeout error connecting to OpenRouter API.")
        return "Sorry, the request to the AI model timed out. Please try again."
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error connecting to OpenRouter API: {e}")
        return "Sorry, there was an error connecting to the AI model service."
    except Exception as e:
        logger.error(f"Unexpected error in generate_summary (OpenRouter): {e}", exc_info=True)
        return "Sorry, an unexpected error occurred while generating the summary."
# --- Telegram Bot Handlers (Command, Message, CallbackQuery) ---
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Greet the user in response to the /start command."""
    user = update.effective_user
    logger.info(f"User {user.id} ({user.username or 'NoUsername'}) used /start.")
    # Prefer an HTML mention when the user has a username; fall back to first name.
    if user.username:
        mention = user.mention_html()
    else:
        mention = user.first_name
    greeting = (
        f"👋 Hello {mention}! I can summarize YouTube links or website URLs.\n\n"
        "Just send me a link anytime!"
    )
    await update.message.reply_html(greeting)
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Reply with usage instructions in response to the /help command."""
    logger.info(f"User {update.effective_user.id} used /help.")
    help_text = (
        "🔍 **How to use this bot:**\n\n"
        "1. Send me any YouTube video link or website URL.\n"
        "2. I'll ask how you want it summarized (paragraph or points).\n"
        "3. Click the button for your choice.\n"
        "4. Wait for the summary!\n\n"
        "I use multiple methods if the first fails (especially for YT transcripts & website content).\n\n"
        "**Commands:**\n"
        "/start - Display welcome message\n"
        "/help - Show this help message"
    )
    await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Inspect an incoming text message for a URL and offer summary-type buttons."""
    message = update.message
    if not message or not message.text:
        return  # Nothing to inspect
    url = message.text.strip()
    user = update.effective_user
    logger.info(f"User {user.id} ({user.username or 'NoUsername'}) sent potential URL: {url}")
    # Cheap sanity check: an http(s) scheme plus a dot somewhere after it.
    has_scheme = url.startswith('http://') or url.startswith('https://')
    if not has_scheme or '.' not in url[8:]:
        logger.debug(f"Ignoring non-URL message from user {user.id}: {url}")
        # Optional: reply here to guide the user toward sending a full URL.
        return
    # Remember the URL so the button callback can retrieve it later
    # (simple per-user state via user_data).
    context.user_data['url_to_summarize'] = url
    logger.debug(f"Stored URL '{url}' for user {user.id} in user_data")
    # Offer the two summary styles as inline buttons.
    buttons = [[
        InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"),
        InlineKeyboardButton("Points Summary", callback_data="points"),
    ]]
    await message.reply_text(
        f"Okay, I see this link:\n{url}\n\nHow would you like it summarized?",
        reply_markup=InlineKeyboardMarkup(buttons),
        disable_web_page_preview=True
    )
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handles button presses for summary type selection.

    Full pipeline for one summary request:
      1. Validate the callback and recover the URL stashed by
         handle_potential_url in ``context.user_data``.
      2. Re-read API keys from the environment so secret updates in the
         HF UI take effect without a restart; abort if the essential
         OpenRouter key is missing.
      3. Fetch content (YouTube transcript with fallbacks, or website
         scrape with a URLToText API fallback), generate the summary,
         and send it; on any failure send a user-facing error message.
      4. Always clean up the status/button message in ``finally``.

    NOTE(review): all outgoing messages target ``user.id`` (a private
    chat), not the chat the button lived in — confirm this is intended
    if the bot is ever used in group chats.
    """
    query = update.callback_query
    if not query: return
    await query.answer() # Acknowledge button press immediately
    # callback_data is "paragraph" or "points" (set by handle_potential_url)
    summary_type = query.data
    user = update.effective_user or query.from_user # Get user info
    url = context.user_data.get('url_to_summarize', None) # Retrieve stored URL
    logger.info(f"User {user.id} chose '{summary_type}' summary. Checking for URL '{url}' in context.")
    # Check if URL is still in context (it might expire or be lost)
    if not url:
        logger.warning(f"User {user.id} pressed button, but NO URL found in user_data context.")
        try:
            # Edit the message where the button was, informing the user
            await query.edit_message_text(text="Sorry, I couldn't find the URL associated with this request. Please send the link again.")
        except Exception as edit_err:
            # If editing fails (e.g., message too old), log it but don't crash
            logger.warning(f"Failed to edit message for missing URL context: {edit_err}")
            # Maybe send a new message as a fallback? Depends on desired behavior.
            # await context.bot.send_message(chat_id=user.id, text="Sorry, context lost. Please send link again.")
        return # Stop processing if URL is missing
    # Clear the URL from context now that we're processing it
    context.user_data.pop('url_to_summarize', None)
    logger.debug(f"Retrieved and cleared URL {url} from user_data for user {user.id}")
    # --- Get API Keys (Read fresh from environment - cheap operation) ---
    # This ensures if secrets are updated in HF UI, the next request uses them
    logger.debug("Reading API keys from environment variables within handler...")
    current_openrouter_key = os.environ.get('OPENROUTER_API_KEY')
    current_urltotext_key = os.environ.get('URLTOTEXT_API_KEY')
    current_supadata_key = os.environ.get('SUPADATA_API_KEY')
    current_apify_token = os.environ.get('APIFY_API_TOKEN')
    logger.debug(f"Keys read: OpenRouter={'Yes' if current_openrouter_key else 'No'}, URLToText={'Yes' if current_urltotext_key else 'No'}, Supadata={'Yes' if current_supadata_key else 'No'}, Apify={'Yes' if current_apify_token else 'No'}")
    # Check *essential* key for summarization
    if not current_openrouter_key:
        logger.error("OpenRouter API key missing in handler. Cannot generate summary.")
        # Inform user and clean up the button message
        await context.bot.send_message(chat_id=user.id, text="Error: AI model configuration key (OpenRouter) is missing. Cannot generate summary.")
        try: await query.delete_message() # Delete the message with buttons
        except Exception: pass
        return
    # --- Start Processing ---
    processing_message = f"Got it! Generating '{summary_type}' summary for:\n{url}\n\nThis might take a moment..."
    message_to_delete_later = None # In case editing fails
    try:
        # Edit the message to show processing status
        await query.edit_message_text(processing_message)
    except Exception as e:
        # If editing fails (e.g., message too old), send a new status message
        logger.warning(f"Could not edit original message: {e}, sending new status message.")
        try:
            message_to_delete_later = await context.bot.send_message(chat_id=user.id, text=processing_message)
        except Exception as send_err:
            # If even sending fails, log and give up on this request
            # NOTE(review): this return happens before the try/finally below,
            # so the original button message is NOT cleaned up in this path.
            logger.error(f"Failed to send status message after edit failure: {send_err}")
            return
    content = None
    user_feedback_message = None # Stores error messages for the user
    success = False
    is_youtube = is_youtube_url(url)
    try:
        # Show "typing..." status in Telegram chat
        await context.bot.send_chat_action(chat_id=user.id, action='typing')
        # --- Content Fetching Logic ---
        if is_youtube:
            video_id = extract_youtube_id(url)
            if video_id:
                # Fetch YT transcript using the function with fallbacks
                content = await get_youtube_transcript(
                    video_id,
                    url, # Pass full URL for Apify
                    current_supadata_key,
                    current_apify_token
                )
                # Set feedback message only if content fetching failed
                user_feedback_message = None if content else "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
            else:
                user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
        else: # Website Logic (Requests/BS4 -> URLToText API)
            logger.info(f"Attempting website scrape (Requests/BS4) for {url}")
            content = await get_website_content_via_requests(url)
            if content:
                logger.info("Primary website scraping (Requests/BS4) successful.")
                user_feedback_message = None
            else:
                logger.warning(f"Primary web scraping failed for {url}. Attempting fallback API (URLToText).")
                if current_urltotext_key:
                    await context.bot.send_chat_action(chat_id=user.id, action='typing') # Show activity for fallback
                    content = await get_website_content_via_urltotext_api(url, current_urltotext_key)
                    if content:
                        user_feedback_message = None
                        logger.info("Fallback URLToText API scraping successful.")
                    else:
                        user_feedback_message = "Sorry, I couldn't fetch content from that website using either method (blocked/inaccessible/empty?)."
                        logger.error(f"Both primary (Requests/BS4) and fallback API failed for website {url}.")
                else:
                    # Primary failed, and fallback key is missing
                    user_feedback_message = "Sorry, I couldn't fetch content from that website (blocked/inaccessible/empty?). The fallback method is not configured."
                    logger.warning(f"Primary web scraping failed for {url}, and fallback API key (URLToText) is missing.")
        # --- End Content Fetching ---
        # --- Generate Summary if Content was Fetched ---
        if content:
            logger.info("Content fetched successfully, proceeding to generate summary.")
            await context.bot.send_chat_action(chat_id=user.id, action='typing')
            # Pass the OpenRouter key to the summary function
            summary = await generate_summary(content, summary_type, current_openrouter_key)
            # Check if summary generation returned an error message
            # NOTE(review): errors from generate_summary are signalled by a
            # string-prefix convention ("Error:" / "Sorry,") — keep the two
            # functions in sync if those prefixes ever change.
            if summary.startswith("Error:") or summary.startswith("Sorry,"):
                user_feedback_message = summary # Use the error from the summary function
                success = False
                logger.warning(f"Summary generation failed or returned error: {summary}")
            else:
                # Send the successful summary
                await context.bot.send_message(
                    chat_id=user.id,
                    text=summary,
                    parse_mode=ParseMode.MARKDOWN,
                    disable_web_page_preview=True
                )
                success = True
                user_feedback_message = None # Clear any previous failure message from fetching stage
        elif not user_feedback_message:
            # If content is None but no specific error message was set during fetching
            user_feedback_message = "Sorry, couldn't retrieve any content to summarize from the provided link."
            logger.warning(f"Content fetching resulted in None for {url}, but no specific user feedback message was set.")
        # --- Send Feedback if any step failed ---
        if user_feedback_message and not success:
            await context.bot.send_message(chat_id=user.id, text=user_feedback_message)
    except Exception as e:
        # Catch unexpected errors during the whole process
        logger.error(f"Unexpected error during processing callback for {url}: {e}", exc_info=True)
        try:
            # Send a generic error message to the user
            await context.bot.send_message(chat_id=user.id, text="Oops! Something went really wrong while processing your request. Please try again later.")
        except Exception as final_err:
            # If even sending the error message fails... log it.
            logger.error(f"Failed to send final error message to user {user.id}: {final_err}")
    finally:
        # --- Cleanup ---
        # Delete the "Processing..." status message or the original message with buttons
        try:
            if message_to_delete_later: # If we sent a separate status message
                await context.bot.delete_message(chat_id=user.id, message_id=message_to_delete_later.message_id)
            elif query: # Otherwise, delete the original message with the buttons
                # We might have already edited it, but deleting ensures cleanup
                await query.delete_message()
        except Exception as del_e:
            # Log if deletion fails, but don't let it stop anything
            logger.warning(f"Could not delete status/button message: {del_e}")
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Log Errors caused by Updates."""
    # Record the full traceback of whatever exception PTB routed here.
    logger.error("Exception while handling an update:", exc_info=context.error)
    # NOTE: if desired, a developer/user notification could be sent from here,
    # e.g. await context.bot.send_message(chat_id=<dev_chat_id>, text=f"Error: {context.error}"),
    # guarded by its own try/except so a failed notification cannot raise again.
# --- Initialize Telegram Bot Application ---
# One-time setup executed at import/startup time.
logger.info("Initializing Telegram Application...")
if not TELEGRAM_TOKEN:  # Without a token there is nothing to build
    logger.critical("Cannot initialize PTB Application: TELEGRAM_TOKEN not found.")
    ptb_app = None
else:
    # PTB defaults are fine alongside Flask + asyncio.create_task; no extra
    # concurrency settings needed on the builder.
    ptb_app = Application.builder().token(TELEGRAM_TOKEN).build()
    # Wire up every update handler in one pass
    for _handler in (
        CommandHandler("start", start),
        CommandHandler("help", help_command),
        MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url),
        CallbackQueryHandler(handle_summary_type_callback),
    ):
        ptb_app.add_handler(_handler)
    # Add the error handler
    ptb_app.add_error_handler(error_handler)
    logger.info("Telegram handlers registered.")
# --- Flask App Setup ---
app = Flask(__name__)  # WSGI entry point (imported directly by Gunicorn)
logger.info("Flask app created.")
# --- Webhook Endpoint --- | |
async def webhook() -> Response:
    """Webhook endpoint to receive updates from Telegram.

    Deserializes the incoming JSON into a PTB Update, schedules its
    processing as a background task, and replies 200 immediately so
    Telegram does not retry.

    NOTE(review): no @app.route/@app.post decorator is visible on this
    function — confirm the route is registered elsewhere (e.g. via
    app.add_url_rule) or was lost; otherwise Flask never dispatches to it.
    """
    logger.info("Webhook request received...")
    if not ptb_app: # Check if PTB initialization failed
        logger.error("Telegram application not initialized. Cannot process update.")
        return Response('Bot not configured properly', status=500)
    if request.is_json:
        try:
            update_data = request.get_json()
            # Use PTB's built-in deserialization
            update = Update.de_json(update_data, ptb_app.bot)
            logger.debug(f"Processing update ID: {update.update_id}")
            # Process the update using PTB's internal dispatcher in a background task
            # NOTE(review): no reference to the task is kept; fire-and-forget
            # tasks can be garbage-collected before completing — consider
            # storing the handle if updates are observed to be dropped.
            asyncio.create_task(ptb_app.process_update(update))
            # Respond quickly to Telegram that we received the update
            return Response('ok', status=200)
        except json.JSONDecodeError:
            logger.error("Failed to decode JSON from Telegram webhook.")
            return Response('Bad Request - Invalid JSON', status=400)
        except Exception as e:
            logger.error(f"Error processing update in webhook handler: {e}", exc_info=True)
            return Response('Internal Server Error', status=500)
    else:
        logger.warning("Received non-JSON request to webhook endpoint.")
        return Response('Bad Request - Expected JSON', status=400)
def index():
    """A simple health check endpoint for the web server"""
    logger.debug("Health check endpoint '/' accessed.")
    # Report whether the Telegram application came up during startup
    if ptb_app:
        bot_status = "PTB App Initialized"
    else:
        bot_status = "PTB App FAILED Initialization"
    return f"Hello! Telegram Bot Webhook Listener ({bot_status}) is running."
# --- Main Execution Block ---
# Runs only when the file is executed directly; under the deployed Gunicorn
# setup the 'app' object is imported from this module and this guard is skipped.
if __name__ == '__main__':
    if ptb_app:
        logger.info("Starting Flask web server directly (for local testing?)...")
        # PORT env var wins; 5000 is the local-testing default
        port = int(os.environ.get('PORT', 5000))
        # debug=True is acceptable ONLY for local testing, never in deployment
        app.run(host='0.0.0.0', port=port, debug=True)
    else:
        logger.critical("Aborting Flask server start (local test?) because Telegram App failed initialization.")