|
|
|
|
|
import os |
|
|
import re |
|
|
import logging |
|
|
import asyncio |
|
|
import json |
|
|
import html |
|
|
import contextlib |
|
|
import traceback |
|
|
from typing import Optional, Dict, Any |
|
|
|
|
|
|
|
|
from starlette.applications import Starlette |
|
|
from starlette.routing import Route |
|
|
from starlette.responses import PlainTextResponse, JSONResponse, Response |
|
|
from starlette.requests import Request |
|
|
|
|
|
|
|
|
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot |
|
|
from telegram.ext import ( |
|
|
Application, |
|
|
CommandHandler, |
|
|
MessageHandler, |
|
|
filters, |
|
|
ContextTypes, |
|
|
CallbackQueryHandler, |
|
|
) |
|
|
from telegram.constants import ParseMode |
|
|
from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest, TelegramError |
|
|
from telegram.request import HTTPXRequest, BaseRequest |
|
|
|
|
|
|
|
|
import httpx |
|
|
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound |
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup |
|
|
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, before_sleep_log |
|
|
|
|
|
# Prefer the faster C-based lxml parser for BeautifulSoup when it is
# installed; otherwise fall back to the stdlib html.parser.
try:
    import lxml
    DEFAULT_PARSER = 'lxml'
except ImportError:
    DEFAULT_PARSER = 'html.parser'
|
|
|
|
|
|
|
|
# Apify support is optional: only import the client when an API token is
# present in the environment, so the package is not required otherwise.
_apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
if _apify_token_exists:
    from apify_client import ApifyClient
    from apify_client.consts import ActorJobStatus
    from apify_client.errors import ApifyApiError
else:
    # Placeholders so later references to these names don't raise NameError
    # (code paths that use them are guarded by `if ApifyClient`-style checks).
    ApifyClient = None
    ApifyApiError = None
|
|
|
|
|
|
|
|
|
|
|
# --- Logging configuration ---------------------------------------------
# Root config plus per-library levels: noisy HTTP/client libraries are
# capped at WARNING; framework loggers stay at INFO for visibility.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logging.getLogger("httpx").setLevel(logging.WARNING)
if ApifyClient: logging.getLogger("apify_client").setLevel(logging.WARNING)
logging.getLogger("telegram.ext").setLevel(logging.INFO)
logging.getLogger('telegram.bot').setLevel(logging.INFO)
logging.getLogger("urllib3").setLevel(logging.INFO)
logging.getLogger('gunicorn.error').setLevel(logging.INFO)
logging.getLogger('uvicorn').setLevel(logging.INFO)
logging.getLogger('starlette').setLevel(logging.INFO)
logger = logging.getLogger(__name__)
logger.info(f"Logging configured. Using BS4 parser: {DEFAULT_PARSER}")

# Global PTB Application instance; populated during ASGI lifespan startup.
ptb_app: Optional[Application] = None

logger.info("Attempting to load secrets...")
|
|
def get_secret(secret_name: str) -> Optional[str]:
    """Read a secret from the environment, logging only whether it was found.

    Args:
        secret_name: Name of the environment variable holding the secret.

    Returns:
        The secret's value, or None when the variable is unset/empty.

    Security fix: the previous version logged the first 8 characters of each
    secret value. Even a prefix of an API token in application logs is a
    credential-exposure risk, so no part of the value is logged anymore.
    """
    value = os.environ.get(secret_name)
    status = "Found" if value else "Not Found"
    logger.info(f"Secret '{secret_name}': {status}")
    return value
|
|
|
|
|
# --- Configuration from environment ------------------------------------
# TELEGRAM_TOKEN is mandatory (setup_bot_config raises without it);
# the remaining keys enable optional providers and degrade gracefully.
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY')
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
# Model/actor choices are overridable via environment, with sane defaults.
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "anthropic/claude-3.5-sonnet")
APIFY_ACTOR_NAME = os.environ.get("APIFY_ACTOR_NAME", "pocesar/youtube-scraper")
logger.info("Secret loading attempt finished.")
logger.info(f"Using OpenRouter Model: {OPENROUTER_MODEL}")
if _apify_token_exists:
    logger.info(f"Using Apify Actor: {APIFY_ACTOR_NAME}")
|
|
|
|
|
|
|
|
|
|
|
@retry(
    stop=stop_after_attempt(4),
    wait=wait_exponential(multiplier=1, min=2, max=15),
    retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    reraise=True
)
async def retry_bot_operation(func, *args, **kwargs):
    """Wrapper to retry bot operations with exponential backoff.

    Awaits ``func(*args, **kwargs)``; transient Telegram errors are re-raised
    so the tenacity decorator retries them. Known-harmless BadRequest
    messages are swallowed and mapped to None instead.
    """
    # BadRequest message fragments that indicate a harmless condition.
    ignorable_fragments = (
        "message is not modified",
        "query is too old",
        "message to edit not found",
        "chat not found",
        "bot was blocked by the user",
    )
    try:
        return await func(*args, **kwargs)
    except BadRequest as e:
        lowered = str(e).lower()
        if any(fragment in lowered for fragment in ignorable_fragments):
            logger.warning(f"Ignoring non-critical BadRequest during bot operation: {e}")
            return None
        logger.error(f"Potentially critical BadRequest during bot operation: {e}")
        raise
    except TelegramError as e:
        # Propagate so tenacity can decide whether this type is retryable.
        logger.warning(f"TelegramError during bot operation (will retry if applicable): {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error during bot operation: {e}", exc_info=True)
        raise
|
|
|
|
|
|
|
|
|
|
|
def is_youtube_url(url):
    """Checks if the URL is a valid YouTube video or shorts URL."""
    pattern = (
        r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/'
        r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?'
        r'([\w-]{11})'
        r'(?:\S+)?'
    )
    found = re.search(pattern, url, re.IGNORECASE) is not None
    logger.debug(f"is_youtube_url check for '{url}': {'Match found' if found else 'No match'}")
    return found
|
|
|
|
|
def extract_youtube_id(url):
    """Extracts the YouTube video ID from a URL.

    Returns the 11-character video ID, or None when no ID can be found.
    """
    pattern = re.compile(
        r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/'
        r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?'
        r'([\w-]{11})'
        r'(?:\S+)?',
        re.IGNORECASE)
    found = pattern.search(url)
    if not found:
        logger.warning(f"Could not extract YouTube ID from URL: {url}")
        return None
    video_id = found.group(1)
    logger.debug(f"Extracted YouTube ID '{video_id}' from URL: {url}")
    return video_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def fetch_url_content(url: str, timeout: int = 20) -> Optional[str]:
    """Fetches content from a URL using httpx asynchronously.

    Args:
        url: Absolute URL to fetch (redirects are followed).
        timeout: Per-request timeout in seconds.

    Returns:
        The decoded response body, or None on any network/HTTP/decoding error.
    """
    # Browser-like headers: some sites reject requests with a default UA.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
    }
    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers, http2=True) as client:
            response = await client.get(url)
            response.raise_for_status()  # turn 4xx/5xx into HTTPStatusError
            try:
                # Let httpx decode using its detected charset.
                content = response.text
                logger.debug(f"Detected encoding for {url}: {response.encoding}")
                return content
            except UnicodeDecodeError:
                logger.warning(f"UnicodeDecodeError for {url} with encoding {response.encoding}. Trying raw bytes with utf-8.")
                # Last resort: lossy UTF-8 decode of the raw bytes.
                return response.content.decode('utf-8', errors='ignore')
            except Exception as e:
                logger.error(f"Error decoding response for {url}: {e}")
                return None

    except httpx.HTTPStatusError as e:
        logger.error(f"HTTP error fetching {url}: {e.response.status_code} - {e}")
    except httpx.ConnectError as e:
        logger.error(f"Connection error fetching {url}: {e}")
    except httpx.TimeoutException as e:
        logger.error(f"Timeout error fetching {url}: {e}")
    except httpx.RequestError as e:
        logger.error(f"Request error fetching {url}: {e}")
    except Exception as e:
        logger.error(f"Unexpected error fetching {url}: {e}", exc_info=True)
    return None
|
|
|
|
|
|
|
|
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
    """Fetches YouTube transcript using Supadata API.

    Returns the transcript segments joined into a single string, or None
    when the key is missing, the response shape is unexpected, or any
    request error occurs (all errors are logged, never raised).
    """
    if not api_key: return None
    api_url = f"https://api.supadata.net/youtube/transcript?video_id={video_id}"
    headers = {'X-API-Key': api_key, 'Accept': 'application/json'}
    logger.info(f"Attempting transcript fetch via Supadata for {video_id}")
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(api_url, headers=headers)
            response.raise_for_status()
            data = response.json()
            # Expected shape: a non-empty list of segment dicts with "text" keys.
            if data and isinstance(data, list) and data[0].get("text"):
                transcript = " ".join([item["text"] for item in data if "text" in item])
                logger.info(f"Supadata transcript fetched successfully for {video_id} (length: {len(transcript)})")
                return transcript
            else:
                logger.warning(f"Supadata response format unexpected or empty for {video_id}: {data}")
                return None
    except httpx.ConnectError as e:
        # Separate TLS-verification failures from ordinary connection errors.
        if "CERTIFICATE_VERIFY_FAILED" in str(e):
            logger.error(f"Supadata API SSL certificate verification failed for {video_id}: {e}. This is likely an issue with api.supadata.net's certificate.")
        else:
            logger.error(f"Supadata API connection error for {video_id}: {e}")
    except httpx.HTTPStatusError as e:
        logger.error(f"Supadata API HTTP error for {video_id}: {e.response.status_code} - {e}")
    except Exception as e:
        logger.error(f"Error fetching transcript via Supadata for {video_id}: {e}", exc_info=True)
    return None
|
|
|
|
|
async def get_transcript_via_apify(video_id: str, api_token: str) -> Optional[str]:
    """Fetches YouTube transcript using Apify YouTube Scraper Actor.

    The blocking apify_client calls are run in worker threads via
    asyncio.to_thread so the event loop is not stalled. Returns the
    transcript text, or None on any failure (all errors are logged).
    """
    global APIFY_ACTOR_NAME
    if not ApifyClient or not api_token: return None
    logger.info(f"Attempting transcript fetch via Apify (Actor: {APIFY_ACTOR_NAME}) for {video_id}")
    try:
        client = ApifyClient(api_token)
        actor = client.actor(APIFY_ACTOR_NAME)
        if not actor:
            logger.error(f"Could not find Apify actor: {APIFY_ACTOR_NAME}")
            return None

        # Start the actor and wait (up to 120s) for the run to finish.
        actor_run = await asyncio.to_thread(
            actor.call,
            run_input={
                "startUrls": [{"url": f"https://www.youtube.com/watch?v={video_id}"}],
                "maxResultStreams": 0,
                "maxResults": 1,
                "maxResultCommentStreams": 0,
                "proxyConfiguration": {"useApifyProxy": True},
                "subtitles": True,  # transcript is delivered via subtitles
                "maxDurationMinutes": 0,
                "skipComments": True,
            },
            timeout_secs=120,
            wait_secs=120
        )

        if not actor_run or 'defaultDatasetId' not in actor_run:
            logger.warning(f"Apify actor run did not return expected dataset ID for {video_id}. Run details: {actor_run}")
            return None

        logger.info(f"Apify actor run started/retrieved for {video_id}. Dataset ID: {actor_run['defaultDatasetId']}")

        dataset = client.dataset(actor_run["defaultDatasetId"])
        dataset_page = await asyncio.to_thread(dataset.list_items, limit=5)

        if dataset_page and dataset_page.items:
            for item in dataset_page.items:
                # Some actor outputs expose a plain 'transcript' field; others
                # nest the text under 'subtitles' (list-of-lines or raw string).
                transcript_text = item.get('transcript')
                if not transcript_text and 'subtitles' in item:
                    if isinstance(item['subtitles'], list) and len(item['subtitles']) > 0:
                        transcript_text = " ".join(line.get('text', '') for line in item['subtitles'][0].get('lines', []))
                    elif isinstance(item['subtitles'], str):
                        transcript_text = item['subtitles']

                if transcript_text and isinstance(transcript_text, str) and transcript_text.strip():
                    logger.info(f"Apify transcript fetched successfully for {video_id} (length: {len(transcript_text)})")
                    return transcript_text.strip()

            logger.warning(f"Apify run completed for {video_id}, but no transcript found in dataset items.")
        else:
            logger.warning(f"Apify run completed for {video_id}, but dataset was empty or inaccessible.")

    except ApifyApiError as e:
        logger.error(f"Apify API error fetching transcript for {video_id} (Actor: {APIFY_ACTOR_NAME}): {e}")
    except Exception as e:
        logger.error(f"Unexpected error fetching transcript via Apify for {video_id}: {e}", exc_info=True)
    return None
|
|
|
|
|
|
|
|
async def get_youtube_transcript(video_id: str, url: str, supadata_key: Optional[str], apify_token: Optional[str]) -> Optional[str]:
    """Tries different methods to get a YouTube transcript.

    Order of attempts: Supadata API (when a key is configured), then the
    youtube-transcript-api library, then the Apify actor (when a token is
    configured). Returns the first transcript found, or None.
    """
    # 1) Supadata API.
    if supadata_key:
        result = await get_transcript_via_supadata(video_id, supadata_key)
        if result:
            return result

    # 2) youtube-transcript-api; it is blocking, so run it in a thread.
    logger.info(f"Attempting transcript fetch via youtube-transcript-api for {video_id}")
    try:
        segments = await asyncio.to_thread(YouTubeTranscriptApi.get_transcript, video_id)
        joined = " ".join(segment['text'] for segment in segments)
        logger.info(f"youtube-transcript-api transcript fetched successfully for {video_id} (length: {len(joined)})")
        return joined
    except (TranscriptsDisabled, NoTranscriptFound):
        logger.warning(f"Transcripts disabled or unavailable via youtube-transcript-api for {video_id}.")
    except Exception as e:
        logger.error(f"Error using youtube-transcript-api for {video_id}: {e}")

    # 3) Apify actor as a last resort.
    if apify_token:
        result = await get_transcript_via_apify(video_id, apify_token)
        if result:
            return result

    logger.warning(f"Failed to retrieve transcript for YouTube video {video_id} using all available methods.")
    return None
|
|
|
|
|
async def get_website_content_via_requests(url: str) -> Optional[str]:
    """Fetches and extracts main text content from a website using BeautifulSoup.

    Returns the extracted text when it looks substantive (> 100 chars);
    otherwise None so callers can fall back to another extractor.
    """
    logger.info(f"Attempting website scrape via requests/BeautifulSoup for: {url}")
    html_content = await fetch_url_content(url)
    if not html_content:
        return None

    try:
        # Parsing is CPU-bound; defined here so it can run in a worker thread.
        def parse_html(content):
            soup = BeautifulSoup(content, DEFAULT_PARSER)
            # Drop boilerplate / non-content elements before extracting text.
            for script_or_style in soup(["script", "style", "nav", "footer", "aside", "header", "form", "button", "iframe"]):
                script_or_style.decompose()

            # Prefer semantic containers; fall back to <body>, then whole doc.
            main_content = soup.find('article') or soup.find('main') or soup.body
            if not main_content: main_content = soup

            text = main_content.get_text(separator='\n', strip=True)
            # NOTE(review): splitting on a single space puts each word on its
            # own line; the classic BeautifulSoup cleanup recipe splits on a
            # double space ("  ") — confirm which is intended here.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            text = '\n'.join(chunk for chunk in chunks if chunk)
            return text

        text_content = await asyncio.to_thread(parse_html, html_content)

        # Heuristic: under ~100 chars is treated as a failed extraction.
        if text_content and len(text_content) > 100:
            logger.info(f"Successfully scraped content via requests/BeautifulSoup for {url} (length: {len(text_content)})")
            return text_content
        else:
            logger.warning(f"Scraping via requests/BeautifulSoup for {url} yielded minimal content (length: {len(text_content) if text_content else 0}).")
            return None
    except Exception as e:
        logger.error(f"Error parsing website content with BeautifulSoup for {url}: {e}", exc_info=True)
        return None
|
|
|
|
|
async def get_website_content_via_urltotext_api(url: str, api_key: str) -> Optional[str]:
    """Fetches website content using the UrlToText API.

    Returns the extracted text on success, or None when the key is absent,
    the API reports no text, or any request error occurs.
    """
    if not api_key:
        return None

    api_endpoint = "https://api.urltotext.ai/text"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {"url": url, "text_only": True}
    logger.info(f"Attempting website content fetch via UrlToText API for: {url}")

    try:
        async with httpx.AsyncClient(timeout=45.0) as client:
            response = await client.post(api_endpoint, headers=headers, json=payload)
            response.raise_for_status()
            data = response.json()
            content = data.get("text")
            if content:
                logger.info(f"Successfully fetched content via UrlToText API for {url} (length: {len(content)})")
                return content
            logger.warning(f"UrlToText API response did not contain text for {url}. Response: {data}")
            return None
    except httpx.ConnectError as e:
        logger.error(f"UrlToText API connection error for {url}: {e}. Check network/DNS.")
    except httpx.HTTPStatusError as e:
        logger.error(f"UrlToText API HTTP error for {url}: {e.response.status_code} - {e}")
    except Exception as e:
        logger.error(f"Error fetching content via UrlToText API for {url}: {e}", exc_info=True)
    return None
|
|
|
|
|
|
|
|
async def generate_summary(content: str, summary_type: str, api_key: Optional[str]) -> str:
    """Generates a summary using OpenRouter API.

    Args:
        content: Text to summarize (truncated to a safe length).
        summary_type: "paragraph" for prose; anything else yields bullets.
        api_key: OpenRouter API key.

    Returns:
        The Markdown-escaped summary, or a human-readable failure message
        prefixed with "Error:" / "Sorry," — callers test for those prefixes.
    """
    global OPENROUTER_MODEL
    if not api_key:
        return "Error: OpenRouter API key is not configured."
    if not content:
        return "Error: No content provided to summarize."

    if len(content) < 50:
        return "The provided content is too short to summarize effectively."

    # Cap the prompt size; very long inputs are truncated, not rejected.
    max_chars = 100000
    if len(content) > max_chars:
        logger.warning(f"Content length ({len(content)}) exceeds max_chars ({max_chars}), truncating.")
        content = content[:max_chars]

    prompt_template = """
Please summarize the following text. The summary should capture the key points and main ideas accurately and concisely.
Provide the summary in {format_style} format.

Text to summarize:
---
{text}
---

Summary ({format_style}):
"""
    format_style = "a concise paragraph" if summary_type == "paragraph" else "bullet points (using * or - for each point)"
    prompt = prompt_template.format(text=content, format_style=format_style)

    logger.info(f"Sending request to OpenRouter (Model: {OPENROUTER_MODEL}) for {summary_type} summary.")

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                url="https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": OPENROUTER_MODEL,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 1024,
                },
            )
            response.raise_for_status()
            data = response.json()

            if data.get("choices") and len(data["choices"]) > 0:
                summary = data["choices"][0].get("message", {}).get("content", "").strip()
                if summary:
                    logger.info(f"Summary generated successfully (length: {len(summary)})")
                    # Escape Markdown control characters so Telegram's parser
                    # doesn't choke on model output.
                    # NOTE(review): this also escapes '*', which neutralises the
                    # bullet markers the prompt requests — confirm intended.
                    summary = summary.replace('_', r'\_').replace('*', r'\*').replace('[', r'\[').replace('`', r'\`')
                    return summary
                else:
                    logger.error("OpenRouter response successful, but summary content is empty.")
                    return "Sorry, the AI generated an empty summary. Please try again."
            else:
                # No choices: surface the API's error object if present.
                error_details = data.get("error")
                logger.error(f"OpenRouter response format unexpected or error: {error_details or data}")
                return f"Sorry, I received an unexpected response or error from the summarization service: {error_details}"

    except httpx.HTTPStatusError as e:
        error_body = ""
        try: error_body = e.response.text
        except Exception: pass
        logger.error(f"OpenRouter API HTTP error: {e.response.status_code} - {e}. Response body: {error_body}")
        return f"Sorry, there was an error communicating with the summarization service (HTTP {e.response.status_code})."
    except Exception as e:
        logger.error(f"Error generating summary via OpenRouter: {e}", exc_info=True)
        return "Sorry, an unexpected error occurred while generating the summary."
|
|
|
|
|
|
|
|
|
|
|
async def process_summary_task(
    user_id: int,
    chat_id: int,
    message_id_to_edit: Optional[int],
    url: str,
    summary_type: str,
    bot_token: str
) -> None:
    """Handles the actual fetching and summarization in a background task.

    Runs detached from the webhook request (see handle_summary_type_callback),
    so it builds its own Bot instance with its own HTTPX request object.
    All outcomes — summary, user-facing error, or critical failure — are
    delivered to the chat; nothing is raised to the caller.
    """
    task_id = f"{user_id}-{message_id_to_edit or 'new'}"
    logger.info(f"[Task {task_id}] Starting processing for URL: {url}")

    # Dedicated Bot + request object for this task; the webhook handler's
    # bot cannot be shared with a detached task's lifetime.
    background_request: Optional[BaseRequest] = None
    bot: Optional[Bot] = None
    try:
        background_request = HTTPXRequest(
            connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0, http_version="1.1"
        )
        bot = Bot(token=bot_token, request=background_request)
    except Exception as e:
        logger.critical(f"[Task {task_id}] Failed to create background bot instance: {e}", exc_info=True)
        # Without a bot there is no way to notify the user; just abort.
        return

    content = None
    user_feedback_message = None
    success = False
    final_summary = ""
    status_message_id = message_id_to_edit  # message showing "working..." status

    try:
        # --- Stage 1: show a status message (edit the button message, or send new) ---
        processing_message_text = f"⏳ Working on your '{summary_type}' summary for:\n`{url}`\n\n_(Fetching & summarizing...)_"
        if status_message_id:
            try:
                await retry_bot_operation(
                    bot.edit_message_text, chat_id=chat_id, message_id=status_message_id,
                    text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None
                )
                logger.debug(f"[Task {task_id}] Successfully edited message {status_message_id} to 'Processing'")
            except Exception as e:
                logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Will send a new status message.")
                status_message_id = None
        if not status_message_id:
            try:
                status_message = await retry_bot_operation(
                    bot.send_message, chat_id=chat_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN
                )
                if status_message:
                    status_message_id = status_message.message_id
                    logger.debug(f"[Task {task_id}] Sent new status message {status_message_id}")
                else:
                    logger.error(f"[Task {task_id}] Failed to send new status message after retries.")
                    raise RuntimeError("Failed to send initial status message")
            except Exception as e:
                logger.error(f"[Task {task_id}] Failed to send new status message: {e}")
                raise RuntimeError("Failed to send initial status message") from e

        # --- Stage 2: fetch content (YouTube transcript or website text) ---
        try:
            await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
            is_yt = is_youtube_url(url)
            logger.debug(f"[Task {task_id}] URL is YouTube: {is_yt}")
            if is_yt:
                video_id = extract_youtube_id(url)
                if video_id:
                    logger.info(f"[Task {task_id}] Fetching YouTube transcript for {video_id}")
                    content = await get_youtube_transcript(video_id, url, SUPADATA_API_KEY, APIFY_API_TOKEN)
                    if not content: user_feedback_message = "⚠️ Sorry, I couldn't retrieve the transcript for that YouTube video. It might be disabled or unavailable."
                else: user_feedback_message = "⚠️ Couldn't extract a valid YouTube video ID from the link."
            else:
                logger.info(f"[Task {task_id}] Attempting website scrape for: {url}")
                content = await get_website_content_via_requests(url)
                # Fall back to the UrlToText API when basic scraping fails.
                if not content and URLTOTEXT_API_KEY:
                    logger.info(f"[Task {task_id}] Basic scrape failed/insufficient, trying UrlToText API...")
                    await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
                    content = await get_website_content_via_urltotext_api(url, URLTOTEXT_API_KEY)
                if not content: user_feedback_message = "⚠️ Sorry, I couldn't fetch or extract meaningful content from that website."

            # --- Stage 3: summarize ---
            if content:
                logger.info(f"[Task {task_id}] Content fetched (length: {len(content)}). Generating '{summary_type}' summary.")
                await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
                final_summary = await generate_summary(content, summary_type, OPENROUTER_API_KEY)
                # generate_summary signals failure via these message prefixes.
                if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
                    user_feedback_message = f"⚠️ {final_summary}"
                else: success = True

        except Exception as e:
            logger.error(f"[Task {task_id}] Error during content fetching or summarization: {e}", exc_info=True)
            user_feedback_message = "❌ An unexpected error occurred while processing your request."

        # --- Stage 4: deliver result (split to respect Telegram's 4096-char limit) ---
        if success and final_summary:
            max_length = 4096
            summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
            await retry_bot_operation(
                bot.send_message, chat_id=chat_id, text=summary_parts[0],
                parse_mode=ParseMode.MARKDOWN, link_preview_options={'is_disabled': True}
            )
            for part in summary_parts[1:]:
                await asyncio.sleep(0.5)  # small delay between chunks
                await retry_bot_operation(
                    bot.send_message, chat_id=chat_id, text=part,
                    parse_mode=ParseMode.MARKDOWN, link_preview_options={'is_disabled': True}
                )
            logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
        elif user_feedback_message:
            logger.warning(f"[Task {task_id}] Sending feedback/error message: {user_feedback_message}")
            await retry_bot_operation(
                bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True}
            )
        else:
            # Should be unreachable; report rather than stay silent.
            logger.error(f"[Task {task_id}] Reached end of task without success or specific error message.")
            await retry_bot_operation(
                bot.send_message, chat_id=chat_id, text="❓ Something went wrong, but no specific error was identified.",
                link_preview_options={'is_disabled': True}
            )

    except Exception as e:
        logger.critical(f"[Task {task_id}] Critical error within task processing: {e}", exc_info=True)
        try:
            if bot:
                await retry_bot_operation(
                    bot.send_message, chat_id=chat_id,
                    text="❌ A critical internal error occurred. Please report this if it persists."
                )
            else:
                logger.error("[Task ??] Cannot send critical error message: Bot instance not available.")
        except Exception:
            logger.exception(f"[Task {task_id}] Failed even to send critical error message.")
    finally:
        # Best-effort cleanup: remove the "working..." status message.
        if status_message_id and bot:
            try:
                await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=status_message_id)
                logger.debug(f"[Task {task_id}] Deleted status message {status_message_id}")
            except Exception as e:
                logger.warning(f"[Task {task_id}] Failed to delete status message {status_message_id}: {e}")

        # Close this task's private HTTPX client to avoid leaking connections.
        # NOTE(review): reaches into HTTPXRequest's private `_client` — works
        # with the current python-telegram-bot version but is fragile.
        if background_request and hasattr(background_request, '_client') and background_request._client:
            try:
                await background_request._client.aclose()
                logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.")
            except Exception as e:
                logger.warning(f"[Task {task_id}] Error closing background bot's HTTPX client: {e}")
        else:
            logger.debug(f"[Task {task_id}] Background bot's HTTPX client already closed or not found.")

        logger.info(f"[Task {task_id}] Task completed. Success: {success}")
|
|
|
|
|
|
|
|
|
|
|
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handles the /start command by greeting the user with usage hints."""
    user = update.effective_user
    if not user or not update.message:
        return
    logger.info(f"User {user.id} initiated /start.")
    greeting_parts = [
        f"👋 Hello {user.mention_html()}!\n\n",
        "I can summarise YouTube videos or web articles for you.\n\n",
        "Just send me a link (URL) and I'll ask you whether you want the summary as a paragraph or bullet points.\n\n",
        "Type /help for more details.",
    ]
    await update.message.reply_html("".join(greeting_parts))
|
|
|
|
|
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handles the /help command by sending usage instructions."""
    user = update.effective_user
    if not user or not update.message:
        return
    logger.info(f"User {user.id} requested /help.")
    # Built from sections, then the active model name is interpolated.
    sections = [
        "**How to Use Me:**\n",
        "1. Send me a direct link (URL) to a YouTube video or a web article.\n",
        "2. I will ask you to choose the summary format: `Paragraph` or `Points`.\n",
        "3. Click the button for your preferred format.\n",
        "4. I'll fetch the content, summarise it using AI, and send it back to you!\n\n",
        "**Important Notes:**\n",
        "- **YouTube:** Transcript availability varies. I try multiple methods.\n",
        "- **Websites:** I attempt basic scraping and can use UrlToText API (if configured) for complex sites.\n",
        "- **AI Summaries:** Provided by OpenRouter (using model: `{model}`). Accuracy may vary.\n",
        "- **Length Limits:** Very long content might be truncated.\n\n",
        "Just send a link to get started!",
    ]
    help_text = "".join(sections).format(model=OPENROUTER_MODEL)
    await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
|
|
|
|
|
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handles messages containing potential URLs.

    When a URL is found it is stashed in user_data and the user is shown
    format-selection buttons; otherwise link-like text gets a gentle hint.
    """
    if not update.message or not update.message.text:
        return
    message_text = update.message.text.strip()
    user = update.effective_user
    if not user:
        return

    match = re.search(r'https?://[^\s<>"]+|www\.[^\s<>"]+', message_text)
    if match:
        # Trim common trailing punctuation users attach to pasted links.
        url = re.sub(r'[.,!?)\]>]+$', '', match.group(0))
        logger.info(f"User {user.id} sent potential URL: {url}")

        # Remember the link so the button callback can retrieve it later.
        context.user_data['url_to_summarize'] = url
        context.user_data['original_message_id'] = update.message.message_id

        format_buttons = InlineKeyboardMarkup([[
            InlineKeyboardButton("📜 Paragraph", callback_data="paragraph"),
            InlineKeyboardButton("🔹 Bullet Points", callback_data="points"),
        ]])
        await update.message.reply_text(
            f"✅ Link received:\n`{url}`\n\nChoose your desired summary format:",
            reply_markup=format_buttons,
            parse_mode=ParseMode.MARKDOWN,
            link_preview_options={'is_disabled': True}
        )
        return

    if message_text.startswith('/'):
        return
    logger.debug(f"User {user.id} sent non-URL, non-command text: '{message_text[:50]}...'")
    # Looks vaguely link-like but didn't match the URL pattern: nudge the user.
    if "http" in message_text or "www." in message_text or ".com" in message_text or ".org" in message_text or ".net" in message_text:
        await update.message.reply_text("Hmm, that looks like it might be a link, but please ensure it starts with `http://` or `https://` and is a valid URL.")
|
|
|
|
|
|
|
|
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handles button presses for summary type selection.

    Validates the callback, retrieves the URL stored by handle_potential_url,
    and schedules process_summary_task as a detached asyncio task so the
    webhook request can return immediately.
    """
    query = update.callback_query
    if not query or not query.message or not query.from_user:
        logger.warning("Callback query received without essential data.")
        if query: await query.answer()
        return

    user = query.from_user
    summary_type = query.data  # "paragraph" or "points" (set in handle_potential_url)
    query_id = query.id

    # Acknowledge promptly so Telegram stops showing the loading spinner.
    try:
        await query.answer()
        logger.debug(f"Acknowledged callback query {query_id} from user {user.id}")
    except Exception as e:
        logger.error(f"Error answering callback query {query_id} from user {user.id}: {e}", exc_info=True)

    url = context.user_data.get('url_to_summarize')
    message_id_to_edit = query.message.message_id

    logger.info(f"User {user.id} chose summary type '{summary_type}' for URL associated with message {message_id_to_edit}")

    # Stale button press (e.g. after a restart): the stored URL is gone.
    if not url:
        logger.warning(f"No URL found in user_data for user {user.id} (callback query {query_id}). Editing message.")
        try:
            await query.edit_message_text(text="⚠️ Oops! I couldn't find the link associated with this request. Please send the link again.")
        except Exception as e:
            logger.error(f"Failed to edit message to show 'URL not found' error: {e}")
        return

    # Consume the stored state so repeated presses can't reuse it.
    context.user_data.pop('url_to_summarize', None)
    context.user_data.pop('original_message_id', None)

    if not TELEGRAM_TOKEN:
        logger.critical("TELEGRAM_TOKEN is missing, cannot start background task!")
        try:
            await query.edit_message_text(text="❌ Internal configuration error. Cannot process request.")
        except Exception: pass
        return

    logger.info(f"Scheduling background task for user {user.id}, chat {query.message.chat_id}, message {message_id_to_edit}, type {summary_type}")
    # Fire-and-forget: the heavy lifting happens off the webhook request path.
    asyncio.create_task(
        process_summary_task(
            user_id=user.id,
            chat_id=query.message.chat_id,
            message_id_to_edit=message_id_to_edit,
            url=url,
            summary_type=summary_type,
            bot_token=TELEGRAM_TOKEN
        ),
        name=f"SummaryTask-{user.id}-{message_id_to_edit}"
    )
|
|
|
|
|
|
|
|
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Top-level PTB error handler: log exceptions raised while handling updates."""
    err = context.error

    # A known, harmless AttributeError surfaces during bot cleanup; suppress
    # it at debug level rather than spamming the error log with a traceback.
    known_cleanup_noise = (
        isinstance(err, AttributeError)
        and "'Bot' object has no attribute 'session'" in str(err)
    )
    if known_cleanup_noise:
        logger.debug(f"Ignoring known cleanup error in error_handler: {err}")
        return

    logger.error("Exception while handling an update:", exc_info=err)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def setup_bot_config() -> Application:
    """Build and return the fully configured PTB Application.

    Returns:
        The built ``Application`` with all command/message/callback handlers
        and the error handler registered.

    Raises:
        ValueError: if the TELEGRAM_TOKEN environment variable is unset.
    """
    logger.info("Configuring Telegram Application...")
    if not TELEGRAM_TOKEN:
        raise ValueError("TELEGRAM_TOKEN environment variable not set.")

    # Custom HTTP client: generous pool timeout, HTTP/1.1 for compatibility.
    request = HTTPXRequest(
        connect_timeout=10.0,
        read_timeout=30.0,
        write_timeout=30.0,
        pool_timeout=60.0,
        http_version="1.1",
    )

    application = Application.builder().token(TELEGRAM_TOKEN).request(request).build()

    # Register handlers: explicit commands first, then the generic text
    # handler for candidate URLs, then inline-keyboard callback queries.
    for handler in (
        CommandHandler("start", start),
        CommandHandler("help", help_command),
        MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url),
        CallbackQueryHandler(handle_summary_type_callback),
    ):
        application.add_handler(handler)
    application.add_error_handler(error_handler)

    logger.info("Telegram application handlers configured.")
    return application
|
|
|
|
|
|
|
|
@contextlib.asynccontextmanager
async def lifespan(app: Starlette):
    """Handles PTB startup and shutdown during ASGI lifespan.

    Startup: builds and initializes the PTB Application, deletes any stale
    Telegram webhook, registers a fresh webhook pointing at this deployment
    (derived from SPACE_HOST), then starts update processing. Shutdown (the
    ``finally`` block): stops and shuts down the PTB Application.

    Raises:
        RuntimeError: if the token or SPACE_HOST is missing, or webhook
            registration fails/mismatches.
    """
    global ptb_app, WEBHOOK_SECRET
    logger.info("ASGI Lifespan: Startup sequence initiated...")

    # Fail fast: nothing below can work without a bot token.
    if not TELEGRAM_TOKEN:
        logger.critical("TELEGRAM_TOKEN is not set. Bot cannot start.")
        raise RuntimeError("Telegram token missing.")

    bot_info_text = "Bot info not available yet."
    try:
        ptb_app = await setup_bot_config()
        await ptb_app.initialize()
        # get_me() doubles as a token/connectivity sanity check.
        bot_info = await ptb_app.bot.get_me()
        bot_info_text = f"@{bot_info.username} (ID: {bot_info.id})"
        logger.info(f"Bot initialized: {bot_info_text}")

        # Remove any webhook left over from a previous deployment so the
        # registration below starts from a clean slate.
        current_webhook_info = await ptb_app.bot.get_webhook_info()
        if current_webhook_info and current_webhook_info.url:
            logger.info(f"Found existing webhook: {current_webhook_info.url}. Attempting to delete it.")
            try:
                # drop_pending_updates: discard updates queued for the old URL.
                if await ptb_app.bot.delete_webhook(drop_pending_updates=True):
                    logger.info("Existing webhook deleted successfully.")
                else:
                    logger.warning("Failed to delete existing webhook (API returned False).")
            except Exception as e:
                # Best effort: a failed delete is not fatal, set_webhook below
                # will overwrite the registration anyway.
                logger.warning(f"Could not delete existing webhook: {e}")
            # Brief pause between delete and set — presumably to let the
            # Telegram side settle; TODO confirm whether this is required.
            await asyncio.sleep(1)

        # Derive the public webhook URL from the hosting platform's
        # SPACE_HOST env var (Hugging Face Spaces convention).
        space_host = os.environ.get("SPACE_HOST")
        webhook_path = "/webhook"
        full_webhook_url = None
        if space_host:
            protocol = "https://"
            # SPACE_HOST may or may not include a scheme; strip it if present.
            host = space_host.split('://')[-1]
            full_webhook_url = f"{protocol}{host.rstrip('/')}{webhook_path}"

            if full_webhook_url:
                logger.info(f"Attempting to set webhook to: {full_webhook_url}")

                set_webhook_args = {
                    "url": full_webhook_url,
                    "allowed_updates": Update.ALL_TYPES,
                    "drop_pending_updates": True,
                }
                # Optional shared secret: Telegram echoes it back in the
                # X-Telegram-Bot-Api-Secret-Token header of each webhook call.
                if WEBHOOK_SECRET:
                    set_webhook_args["secret_token"] = WEBHOOK_SECRET
                    logger.info("Webhook will be set with a secret token.")

                await asyncio.sleep(1.0)
                try:
                    await ptb_app.bot.set_webhook(**set_webhook_args)
                    # Read back the registration to verify it actually stuck.
                    webhook_info = await ptb_app.bot.get_webhook_info()

                    if webhook_info.url == full_webhook_url:
                        logger.info(f"Webhook successfully set: URL='{webhook_info.url}', Pending={webhook_info.pending_update_count}, Secret={bool(WEBHOOK_SECRET)}")
                    else:
                        logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got '{webhook_info.url}'")
                        raise RuntimeError("Webhook URL mismatch after setting.")

                    # Start PTB's internal machinery; updates arrive via the
                    # /webhook route, not via polling.
                    await ptb_app.start()
                    logger.info("PTB Application started (webhook mode). Ready for updates.")
                except Exception as e:
                    logger.error(f"FATAL: Failed to set webhook to {full_webhook_url}: {e}", exc_info=True)
                    raise RuntimeError(f"Failed to set webhook: {e}") from e
            else:
                # NOTE(review): unreachable in practice — full_webhook_url is
                # always truthy when space_host is set; kept for safety.
                logger.critical("Could not construct valid HTTPS webhook URL from SPACE_HOST.")
                raise RuntimeError("Webhook URL could not be determined.")
        else:
            logger.critical("SPACE_HOST environment variable not found. Cannot set webhook for HF Space.")
            raise RuntimeError("SPACE_HOST env var missing, cannot run in webhook mode.")

        logger.info("ASGI Lifespan: Startup complete.")
        # Hand control to the ASGI server; execution resumes on shutdown.
        yield

    except Exception as startup_err:
        logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
        # NOTE(review): on startup failure, stop()/shutdown() run here AND
        # again in the `finally` block below — confirm PTB tolerates a second
        # shutdown() call on an already-shut-down Application.
        if ptb_app:
            if ptb_app.running: await ptb_app.stop()
            await ptb_app.shutdown()
        raise
    finally:
        # Runs on both normal shutdown and startup failure.
        logger.info("ASGI Lifespan: Shutdown sequence initiated...")
        if ptb_app:
            if ptb_app.running:
                logger.info("Stopping PTB application...")
                await ptb_app.stop()
            logger.info("Shutting down PTB application...")
            await ptb_app.shutdown()
            logger.info("PTB Application shut down gracefully.")
        else:
            logger.info("PTB application was not initialized or startup failed.")
        logger.info("ASGI Lifespan: Shutdown complete.")
|
|
|
|
|
|
|
|
|
|
|
async def health_check(request: Request) -> PlainTextResponse:
    """Report bot liveness plus the configured model/actor as plain text."""
    status = "Not Initialized"
    if ptb_app and ptb_app.bot:
        try:
            if not ptb_app.running:
                status = "Initialized but not running"
            else:
                # Live round-trip to Telegram confirms the bot is reachable.
                me = await ptb_app.bot.get_me()
                status = f"Running (@{me.username})"
        except Exception as e:
            status = f"Error checking status: {e}"

    actor = APIFY_ACTOR_NAME if _apify_token_exists else 'N/A'
    body = f"Telegram Bot Summarizer - Status: {status}\nModel: {OPENROUTER_MODEL}\nApify Actor: {actor}"
    return PlainTextResponse(body)
|
|
|
|
|
|
|
|
async def telegram_webhook(request: Request) -> Response:
    """Receive a Telegram update via webhook and hand it to PTB."""
    global WEBHOOK_SECRET

    # Guard clauses: refuse traffic until the bot is fully up.
    if not ptb_app:
        logger.error("Webhook received but PTB application not initialized.")
        return PlainTextResponse('Bot not initialized', status_code=503)
    if not ptb_app.running:
        logger.warning("Webhook received but PTB application not running.")
        return PlainTextResponse('Bot initialized but not running', status_code=503)

    try:
        # Verify Telegram's shared secret when one was configured at startup.
        if WEBHOOK_SECRET:
            provided = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
            if provided != WEBHOOK_SECRET:
                logger.warning(f"Webhook received with invalid secret token. Header: '{provided}'")
                return Response(content="Invalid secret token", status_code=403)

        payload = await request.json()
        update = Update.de_json(data=payload, bot=ptb_app.bot)
        logger.debug(f"Processing update_id: {update.update_id} via webhook")
        await ptb_app.process_update(update)
        return Response(status_code=200)
    except json.JSONDecodeError:
        logger.error("Webhook received invalid JSON.")
        return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
    except Exception as e:
        # Still acknowledge with 200 so Telegram does not keep redelivering
        # an update we will never be able to process.
        logger.error(f"Error processing webhook update: {e}", exc_info=True)
        return Response(status_code=200)
|
|
|
|
|
|
|
|
# Wire the ASGI application: "/" serves the health probe, "/webhook" receives
# Telegram updates; `lifespan` starts/stops the PTB application with the server.
app = Starlette(
    debug=False,
    lifespan=lifespan,
    routes=[
        Route("/", endpoint=health_check, methods=["GET"]),
        Route("/webhook", endpoint=telegram_webhook, methods=["POST"]),
    ]
)
logger.info("Starlette ASGI application created with native routes.")
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Local development entry point only; in production an external ASGI
    # server should import `app` from this module instead.
    import uvicorn
    logger.warning("Running in development mode using Uvicorn directly (not for production)")

    # Both knobs are overridable via the environment for local testing.
    log_level = os.environ.get("LOGGING_LEVEL", "info").lower()
    local_port = int(os.environ.get('PORT', 8080))

    # NOTE(review): `reload=True` combined with the "__main__:app" import
    # string can misbehave — uvicorn's reloader re-imports the target in a
    # child process, where "__main__" may not resolve to this file. Confirm,
    # or use the real module name (e.g. "mymodule:app") here.
    uvicorn.run("__main__:app", host='0.0.0.0', port=local_port, log_level=log_level, reload=True)