Update main.py
Browse files
main.py
CHANGED
|
@@ -1,21 +1,22 @@
|
|
| 1 |
-
# main.py (Revised
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
import logging
|
| 5 |
import asyncio
|
| 6 |
import json
|
| 7 |
-
import html
|
| 8 |
-
import contextlib
|
| 9 |
-
import traceback
|
|
|
|
| 10 |
|
| 11 |
# --- Frameworks ---
|
| 12 |
-
from flask import Flask, request, Response
|
| 13 |
-
from starlette.applications import Starlette
|
| 14 |
-
from starlette.routing import Mount
|
| 15 |
-
from starlette.middleware.wsgi import WSGIMiddleware
|
| 16 |
|
| 17 |
# --- Telegram Bot ---
|
| 18 |
-
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot
|
| 19 |
from telegram.ext import (
|
| 20 |
Application,
|
| 21 |
CommandHandler,
|
|
@@ -25,7 +26,7 @@ from telegram.ext import (
|
|
| 25 |
CallbackQueryHandler,
|
| 26 |
)
|
| 27 |
from telegram.constants import ParseMode
|
| 28 |
-
from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest
|
| 29 |
from telegram.request import HTTPXRequest
|
| 30 |
|
| 31 |
# --- Other Libraries ---
|
|
@@ -33,6 +34,8 @@ import httpx
|
|
| 33 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 34 |
import requests
|
| 35 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
| 36 |
_apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
|
| 37 |
if _apify_token_exists:
|
| 38 |
from apify_client import ApifyClient
|
|
@@ -55,8 +58,8 @@ logging.getLogger('starlette').setLevel(logging.INFO)
|
|
| 55 |
logger = logging.getLogger(__name__)
|
| 56 |
logger.info("Logging configured.")
|
| 57 |
|
| 58 |
-
# --- Global variable for PTB app
|
| 59 |
-
ptb_app: Application
|
| 60 |
|
| 61 |
# --- Environment Variable Loading ---
|
| 62 |
logger.info("Attempting to load secrets...")
|
|
@@ -73,14 +76,26 @@ SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
|
|
| 73 |
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
|
| 74 |
logger.info("Secret loading attempt finished.")
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
# ---
|
| 78 |
-
# (Keep ALL your functions: is_youtube_url, extract_youtube_id,
|
| 79 |
-
# get_transcript_via_supadata, get_transcript_via_apify,
|
| 80 |
-
# get_youtube_transcript, get_website_content_via_requests,
|
| 81 |
-
# get_website_content_via_urltotext_api, generate_summary - unchanged)
|
| 82 |
-
|
| 83 |
-
# Helper Functions
|
| 84 |
def is_youtube_url(url):
|
| 85 |
"""Checks if the URL is a valid YouTube video or shorts URL."""
|
| 86 |
youtube_regex = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
|
|
@@ -100,608 +115,166 @@ def extract_youtube_id(url):
|
|
| 100 |
logger.warning(f"Could not extract YouTube ID from URL: {url}")
|
| 101 |
return None
|
| 102 |
|
| 103 |
-
#
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
if not api_key: logger.error("[Supadata] API key is missing."); return None
|
| 108 |
-
logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}")
|
| 109 |
-
api_endpoint = f"https://api.supadata.net/v1/youtube/transcript"
|
| 110 |
-
params = {"videoId": video_id, "format": "text"}
|
| 111 |
-
headers = {"X-API-Key": api_key}
|
| 112 |
-
try:
|
| 113 |
-
logger.warning("[Supadata] Making request with verify=False (Attempting to bypass SSL verification - Potential Security Risk)")
|
| 114 |
-
response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30, verify=False)
|
| 115 |
-
logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
|
| 116 |
-
if response.status_code == 200:
|
| 117 |
-
try:
|
| 118 |
-
data = response.json()
|
| 119 |
-
content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
|
| 120 |
-
if content and isinstance(content, str):
|
| 121 |
-
logger.info(f"[Supadata] Successfully fetched transcript for {video_id}. Length: {len(content)}")
|
| 122 |
-
return content.strip()
|
| 123 |
-
else:
|
| 124 |
-
logger.warning(f"[Supadata] API success but content empty/invalid for {video_id}. Response: {data}")
|
| 125 |
-
return None
|
| 126 |
-
except json.JSONDecodeError:
|
| 127 |
-
if response.text:
|
| 128 |
-
logger.info(f"[Supadata] Successfully fetched transcript (plain text) for {video_id}. Length: {len(response.text)}")
|
| 129 |
-
return response.text.strip()
|
| 130 |
-
else:
|
| 131 |
-
logger.error(f"[Supadata] Failed to decode JSON response (and no text body) for {video_id}. Response: {response.text[:200]}...")
|
| 132 |
-
return None
|
| 133 |
-
except Exception as e:
|
| 134 |
-
logger.error(f"[Supadata] Error processing successful response for {video_id}: {e}", exc_info=True)
|
| 135 |
-
return None
|
| 136 |
-
elif response.status_code in [401, 403]:
|
| 137 |
-
logger.error(f"[Supadata] Authentication error ({response.status_code}). Check API key.")
|
| 138 |
-
return None
|
| 139 |
-
elif response.status_code == 404:
|
| 140 |
-
logger.warning(f"[Supadata] Transcript not found ({response.status_code}) for {video_id}.")
|
| 141 |
-
return None
|
| 142 |
-
else:
|
| 143 |
-
logger.error(f"[Supadata] Unexpected status code {response.status_code} for {video_id}. Response: {response.text[:200]}...")
|
| 144 |
-
return None
|
| 145 |
-
except requests.exceptions.Timeout:
|
| 146 |
-
logger.error(f"[Supadata] Timeout error connecting to API for {video_id}")
|
| 147 |
-
return None
|
| 148 |
-
except requests.exceptions.RequestException as e:
|
| 149 |
-
logger.error(f"[Supadata] Request error connecting to API for {video_id}: {e}")
|
| 150 |
-
if isinstance(e, requests.exceptions.SSLError):
|
| 151 |
-
logger.error(f"[Supadata] SSL Error occurred despite using verify=False. Details: {e}")
|
| 152 |
-
return None
|
| 153 |
-
except Exception as e:
|
| 154 |
-
logger.error(f"[Supadata] Unexpected error during API call for {video_id}: {e}", exc_info=True)
|
| 155 |
-
return None
|
| 156 |
-
|
| 157 |
-
# Apify Transcript Fetching (with fixed fallback parsing)
|
| 158 |
-
def _extract_apify_transcript(item):
    """Return the transcript text carried by one Apify dataset item, or None.

    The "text", "transcript" and "captions_concatenated" fields are tried
    first. If none of them is set, the "captions" field is used as a fallback;
    it is accepted either as a plain string or as a list of dicts that each
    carry a "text" key. HTML entities in the result are unescaped.
    """
    content = item.get("text") or item.get("transcript") or item.get("captions_concatenated")

    if not content and item.get("captions"):
        captions_data = item["captions"]
        if isinstance(captions_data, str):
            logger.info("[Apify] Processing 'captions' string format as fallback.")
            content = captions_data.strip()
            # A very short string mentioning "error" is treated as an error
            # message rather than as a real transcript.
            if len(content) < 50 and "error" in content.lower():
                logger.warning(f"[Apify] 'captions' string looks like an error: {content}")
                content = None
        elif isinstance(captions_data, list):
            logger.info("[Apify] Processing 'captions' list format as fallback.")
            texts = [cap.get("text", "") for cap in captions_data if isinstance(cap, dict) and cap.get("text")]
            content = " ".join(texts).strip()
        else:
            logger.warning(f"[Apify] 'captions' field found but is neither string nor list: {type(captions_data)}")
            content = None

    if content:
        try:
            content = html.unescape(content)
        except Exception as unescape_err:
            # Best effort: keep the un-decoded text if unescaping fails.
            logger.warning(f"[Apify] Error during html unescaping: {unescape_err}")

    if content and isinstance(content, str):
        return content
    return None


async def get_transcript_via_apify(video_url: str, api_token: str):
    """Fetches YouTube transcript via Apify API.

    Runs the "karamelo~youtube-transcripts" actor through the synchronous
    run-sync-get-dataset-items endpoint and extracts the transcript from the
    first dataset item.

    Args:
        video_url: URL of the video, sent to the actor in its "urls" input.
        api_token: Apify API token.

    Returns:
        The transcript text, or None on any failure (missing arguments,
        non-success status, unparsable or empty response, network error).
        Failures are logged, never raised.
    """
    if not video_url:
        logger.error("[Apify] get_transcript_via_apify called with no video_url")
        return None
    if not api_token:
        logger.error("[Apify] API token is missing.")
        return None
    # NOTE(review): the request below goes through `requests`, not ApifyClient;
    # the guard is kept so behaviour is unchanged when the client is unavailable.
    if not ApifyClient:
        logger.error("[Apify] ApifyClient not available/imported.")
        return None

    logger.info(f"[Apify] Attempting fetch for URL: {video_url}")
    actor_id = "karamelo~youtube-transcripts"
    api_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
    payload = json.dumps({
        "urls": [video_url],
        "outputFormat": "singleStringText",
        "maxRetries": 3,
        "channelHandleBoolean": False,
        "channelNameBoolean": False,
        "datePublishedBoolean": False,
        "relativeDateTextBoolean": False,
    })
    # The token travels in the Authorization header instead of a "?token="
    # query parameter: the request URL is part of the exception messages that
    # are logged below, so a query-string token would leak into the logs.
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_token}",
    }
    try:
        logger.debug(f"[Apify] Sending request to run actor {actor_id} synchronously for {video_url}")
        # requests is blocking, so run it in a worker thread.
        response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=90)
        logger.debug(f"[Apify] Received status code {response.status_code} for {video_url}")

        if response.status_code in [200, 201]:
            try:
                results = response.json()
                if not (isinstance(results, list) and len(results) > 0):
                    logger.warning(f"[Apify] Actor success ({response.status_code}) but dataset result list empty for {video_url}. Response: {results}")
                    return None

                item = results[0]
                content = _extract_apify_transcript(item)
                if content is not None:
                    logger.info(f"[Apify] Successfully fetched transcript via run-sync for {video_url} (Status: {response.status_code}). Length: {len(content)}")
                    return content

                # No usable text: log which stage came up empty.
                if item.get("text") or item.get("transcript") or item.get("captions_concatenated"):
                    logger.warning(f"[Apify] Actor success ({response.status_code}) but primary fields empty for {video_url}.")
                elif not item.get("captions"):
                    logger.warning(f"[Apify] Actor success ({response.status_code}) but no relevant fields found for {video_url}. Item: {item}")
                else:
                    logger.warning(f"[Apify] Actor success ({response.status_code}), 'captions' found but fallback parsing failed for {video_url}.")
                return None
            except json.JSONDecodeError:
                logger.error(f"[Apify] Failed JSON decode for {video_url}. Status: {response.status_code}. Resp: {response.text[:200]}...")
                return None
            except Exception as e:
                logger.error(f"[Apify] Error processing successful response ({response.status_code}) for {video_url}: {e}", exc_info=True)
                return None
        elif response.status_code == 400:
            logger.error(f"[Apify] Bad Request (400) for {video_url}. Resp: {response.text[:200]}...")
            return None
        elif response.status_code == 401:
            logger.error("[Apify] Auth error (401). Check token.")
            return None
        else:
            logger.error(f"[Apify] Unexpected status {response.status_code} for {video_url}. Resp: {response.text[:200]}...")
            return None

    except requests.exceptions.Timeout:
        logger.error(f"[Apify] Timeout error running actor for {video_url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Apify] Request error running actor for {video_url}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Apify] Unexpected error during Apify call for {video_url}: {e}", exc_info=True)
        return None
|
| 236 |
-
|
| 237 |
-
# Combined YouTube Transcript Function
|
| 238 |
-
async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: str | None, apify_token: str | None):
    """Fetches YouTube transcript using library, then Supadata, then Apify.

    Each source is tried in turn and the first non-empty transcript wins.

    Args:
        video_id: YouTube video ID, used by the library and by Supadata.
        video_url: Full video URL, used by Apify.
        supadata_key: Supadata API key; the Supadata step is skipped if falsy.
        apify_token: Apify API token; the Apify step is skipped if falsy.

    Returns:
        The transcript text, or None when video_id is missing or every source
        failed. Errors are logged, never raised.
    """
    if not video_id:
        logger.error("get_youtube_transcript called with no video_id")
        return None
    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")

    # --- Primary: youtube-transcript-api library ---
    try:
        logger.info("[Primary YT] Attempting youtube-transcript-api...")
        transcript_list = await asyncio.to_thread(YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'])
        if transcript_list:
            joined = " ".join(segment['text'] for segment in transcript_list if 'text' in segment)
            cleaned = re.sub(r'\s+', ' ', joined).strip()
            if cleaned:
                logger.info(f"[Primary YT] Success via library. Length: {len(cleaned)}")
                return cleaned
            logger.warning("[Primary YT] Joined text empty after cleaning.")
        else:
            logger.warning("[Primary YT] Transcript list empty.")
    except Exception as e:
        # Any library failure falls through to the fallbacks below.
        logger.warning(f"[Primary YT] Error via library: {type(e).__name__} - {e}")
        error_text = str(e)
        if "YouTube is blocking requests" in error_text or "HTTP Error 429" in error_text:
            logger.warning("[Primary YT] IP likely blocked.")
        elif "No transcript found" in error_text:
            logger.warning("[Primary YT] No transcript in specified languages.")
        elif "TranscriptsDisabled" in error_text or "disabled" in error_text:
            logger.warning("[Primary YT] Transcripts disabled for this video.")

    # --- Fallback 1: Supadata ---
    logger.info("[Fallback YT 1] Trying Supadata API...")
    if supadata_key:
        transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
        if transcript_text:
            logger.info(f"[Fallback YT 1] Success via Supadata. Length: {len(transcript_text)}")
            return transcript_text
        # An empty string counts as a failure as well, so the Apify fallback
        # still runs (previously only a literal None let it through).
        logger.warning("[Fallback YT 1] Supadata failed or no content found.")
    else:
        logger.warning("[Fallback YT 1] Supadata key not available, skipping.")

    # --- Fallback 2: Apify ---
    logger.info("[Fallback YT 2] Trying Apify API...")
    if apify_token:
        transcript_text = await get_transcript_via_apify(video_url, apify_token)
        if transcript_text:
            logger.info(f"[Fallback YT 2] Success via Apify. Length: {len(transcript_text)}")
            return transcript_text
        logger.warning("[Fallback YT 2] Apify failed or no content found.")
    else:
        logger.warning("[Fallback YT 2] Apify token not available, skipping.")

    logger.error(f"All methods failed to fetch transcript for video ID: {video_id}")
    return None
|
| 277 |
-
|
| 278 |
-
# Website Content via Requests/BS4
|
| 279 |
-
async def get_website_content_via_requests(url):
    """Attempts to scrape website content using requests/BeautifulSoup.

    Downloads the page with browser-like headers, removes non-content tags
    and common ad/navigation/sidebar selectors, then returns the text of the
    main content element (falling back to <body>).

    Args:
        url: Address of the page to fetch.

    Returns:
        The cleaned text with lines separated by blank lines, the stripped
        body for a text/plain response, or None when the fetch fails, the
        content type is unsuitable, or fewer than 100 characters remain.
        Errors are logged, never raised.
    """
    if not url:
        logger.error("[Web Scraper - Requests/BS4] get_website_content_via_requests called with no URL")
        return None
    logger.info(f"[Web Scraper - Requests/BS4] Attempting fetch: {url}")
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
            'DNT': '1',
            'Upgrade-Insecure-Requests': '1',
        }
        # requests is blocking, so run it in a worker thread.
        response = await asyncio.to_thread(requests.get, url, headers=headers, timeout=25, allow_redirects=True)
        response.raise_for_status()
        logger.debug(f"[Web Scraper - Requests/BS4] Status {response.status_code} for {url}")

        content_type = response.headers.get('content-type', '').lower()
        if 'html' not in content_type:
            logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received: {content_type}. Attempting plain text extraction.")
            if 'text/plain' in content_type and response.text:
                logger.info(f"[Web Scraper - Requests/BS4] Extracted plain text content. Length: {len(response.text.strip())}")
                return response.text.strip()
            logger.warning(f"[Web Scraper - Requests/BS4] Content type '{content_type}' not suitable for parsing. Aborting.")
            return None

        # Parse the raw bytes rather than response.text: when the header has
        # no charset, requests decodes text/* as ISO-8859-1, which garbles
        # UTF-8 pages. Given bytes, BeautifulSoup detects the encoding itself;
        # a charset declared in the header is passed on as a hint.
        declared_encoding = response.encoding if 'charset' in content_type else None
        soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)

        tags_to_remove = ["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "label", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio", "picture", "source"]
        selectors_to_remove = ['.ad', '#ad', '.ads', '#ads', '.advertisement', '#advertisement', '.banner', '#banner', '.menu', '#menu', '.navigation', '#navigation', '.sidebar', '#sidebar', '.social', '#social', '.share', '#share', '.related', '#related', '.comments', '#comments', '.cookie-consent', '#cookie-consent']
        for tag in soup(tags_to_remove):
            tag.decompose()
        for selector in selectors_to_remove:
            for element in soup.select(selector):
                element.decompose()

        # Prefer a dedicated main-content container; otherwise use <body>.
        main_content = (
            soup.find('main')
            or soup.find('article')
            or soup.find(id='content')
            or soup.find(class_='content')
            or soup.find(id='main-content')
            or soup.find(class_='main-content')
            or soup.find(role='main')
        )
        target_element = main_content if main_content else soup.body
        if not target_element:
            logger.warning(f"[Web Scraper - Requests/BS4] Could not find a suitable target element (main, article, body) for {url}")
            return None

        lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
        text = "\n\n".join(lines)

        MIN_TEXT_LENGTH = 100
        if not text or len(text) < MIN_TEXT_LENGTH:
            logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is too short (<{MIN_TEXT_LENGTH} chars) after cleaning for {url}. Length: {len(text)}. Content might be JS-rendered or blocked.")
            return None

        logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped and cleaned content from {url}. Final Length: {len(text)}")
        return text

    except requests.exceptions.Timeout:
        logger.error(f"[Web Scraper - Requests/BS4] Timeout error fetching {url}")
        return None
    except requests.exceptions.TooManyRedirects:
        logger.error(f"[Web Scraper - Requests/BS4] Too many redirects error for {url}")
        return None
    except requests.exceptions.HTTPError as e:
        logger.error(f"[Web Scraper - Requests/BS4] HTTP error {e.response.status_code} for {url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Web Scraper - Requests/BS4] General request error for {url}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Web Scraper - Requests/BS4] Error during parsing or processing {url}: {e}", exc_info=True)
        return None
|
| 334 |
-
|
| 335 |
-
# Website Content via URLToText API
|
| 336 |
-
async def get_website_content_via_urltotext_api(url: str, api_key: str):
    """Fetches website content using the URLToText API.

    Posts the URL to the urltotext.com endpoint, asking for main-content
    extraction as text with JavaScript rendering enabled.

    Args:
        url: Address of the page to convert to text.
        api_key: URLToText API key, sent as a "Token" Authorization header.

    Returns:
        The stripped page content, or None on any failure (missing arguments,
        non-200 status, API-reported error, empty content, network error).
        Failures are logged, never raised.
    """
    if not url:
        logger.error("[Web Scraper - URLToText API] get_website_content_via_urltotext_api called with no URL")
        return None
    if not api_key:
        logger.error("[Web Scraper - URLToText API] API key is missing.")
        return None

    logger.info(f"[Web Scraper - URLToText API] Attempting fetch via API: {url}")
    api_endpoint = "https://urltotext.com/api/v1/urltotext/"
    request_options = {
        "url": url,
        "output_format": "text",
        "extract_main_content": True,
        "render_javascript": True,
        "residential_proxy": False,
        "timeout_render": 20000,
    }
    payload = json.dumps(request_options)
    headers = {"Authorization": f"Token {api_key}", "Content-Type": "application/json"}

    try:
        # requests is blocking, so run it in a worker thread.
        response = await asyncio.to_thread(requests.post, api_endpoint, headers=headers, data=payload, timeout=60)
        logger.debug(f"[Web Scraper - URLToText API] Received status code {response.status_code} for {url}")

        # Every non-200 status is logged at a fitting level and yields None.
        if response.status_code != 200:
            if response.status_code == 400:
                logger.error(f"[Web Scraper - URLToText API] Bad Request (400) to API. Check payload/URL. Response: {response.text[:200]}...")
            elif response.status_code == 401:
                logger.error(f"[Web Scraper - URLToText API] Unauthorized (401). Check API Key. Response: {response.text[:200]}...")
            elif response.status_code == 402:
                logger.error(f"[Web Scraper - URLToText API] Payment Required (402). Check API credits/plan. Response: {response.text[:200]}...")
            elif response.status_code == 422:
                logger.warning(f"[Web Scraper - URLToText API] Unprocessable URL / Fetch Error (422) reported by API for {url}. Response: {response.text[:200]}...")
            elif response.status_code == 429:
                logger.warning(f"[Web Scraper - URLToText API] Rate Limit Hit (429). Response: {response.text[:200]}...")
            elif response.status_code >= 500:
                logger.error(f"[Web Scraper - URLToText API] API Server Error ({response.status_code}). Response: {response.text[:200]}...")
            else:
                logger.error(f"[Web Scraper - URLToText API] Unexpected status code {response.status_code} from API. Response: {response.text[:200]}...")
            return None

        try:
            data = response.json()
            content_data = data.get("data", {})
            content = content_data.get("content")
            credits_used = data.get("credits_used", "N/A")
            warning = content_data.get("warning")
            error_msg = content_data.get("error")

            if warning:
                logger.warning(f"[Web Scraper - URLToText API] API Warning for {url}: {warning}")
            if error_msg:
                logger.error(f"[Web Scraper - URLToText API] API Error reported for {url}: {error_msg}")
                return None

            if not (content and isinstance(content, str)):
                logger.warning(f"[Web Scraper - URLToText API] API returned status 200 but content is empty or invalid for {url}. Response: {data}")
                return None

            logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API. Length: {len(content.strip())}. Credits Used: {credits_used}")
            return content.strip()
        except json.JSONDecodeError:
            logger.error(f"[Web Scraper - URLToText API] Failed to decode JSON response from API. Status: {response.status_code}. Response Text: {response.text[:500]}...")
            return None
        except Exception as e:
            logger.error(f"[Web Scraper - URLToText API] Error processing successful API response: {e}", exc_info=True)
            return None

    except requests.exceptions.Timeout:
        logger.error(f"[Web Scraper - URLToText API] Timeout connecting to API for {url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Web Scraper - URLToText API] Request error connecting to API: {e}")
        return None
    except Exception as e:
        logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call: {e}", exc_info=True)
        return None
|
| 385 |
-
|
| 386 |
-
# DeepSeek Summary Function (with updated prompts)
|
| 387 |
-
def _summary_prompts(summary_type: str):
    """Return (system_message, user_prompt_instruction) for a summary type.

    Returns None when summary_type is neither "paragraph" nor "points".
    """
    if summary_type == "paragraph":
        system_message = (
            "You are an expert summarization AI. Your goal is to provide a concise, easy-to-understand summary of the provided text. "
            "Follow these instructions precisely:\n"
            "1. **Language and Spelling:** Use simple British English. Ensure all spellings conform to British English (e.g., 'summarise', 'centre', 'realise').\n"
            "2. **Clarity:** Write clearly so someone unfamiliar with the topic can understand.\n"
            "3. **Format:** Output a single paragraph only.\n"
            "4. **Conciseness:** The summary must be **no more than 85 words** long.\n"
            "5. **Completeness:** Cover the main points from the entire text, not just the start.\n"
            "6. **Punctuation:** Do NOT use em dashes (– or —). Use semicolons (;) if needed for complex sentence structure, but prefer simpler sentences.\n"
            "7. **Tone:** Maintain a neutral and informative tone.\n"
            "8. **Focus:** Extract factual information and key topics. Do not add opinions or information not present in the text."
        )
        user_prompt_instruction = "Summarize the following text into a single paragraph adhering strictly to the rules outlined in the system message:"
        return system_message, user_prompt_instruction

    if summary_type == "points":
        system_message = (
            "You are an expert summarization AI. Your goal is to extract the key points from the provided text and present them as a bulleted list. "
            "Follow these instructions precisely:\n"
            "1. **Language and Spelling:** Use simple British English. Ensure all spellings conform to British English (e.g., 'summarise', 'centre', 'realise').\n"
            "2. **Clarity:** Write clearly so someone unfamiliar with the topic can understand.\n"
            "3. **Format:** Output as a bulleted list. Start each point with a standard bullet character ('*' or '-'). Each point should be distinct and on a new line.\n"
            "4. **Content:** Each bullet point should represent a single key finding, main topic, or significant piece of information from the text.\n"
            "5. **Conciseness:** Keep each bullet point brief and to the point.\n"
            "6. **Completeness:** Cover the main points from the entire text, not just the start.\n"
            "7. **Punctuation:** Do NOT use em dashes (– or —) within bullet points.\n"
            "8. **Tone:** Maintain a neutral and informative tone.\n"
            "9. **Focus:** Extract factual information and key topics. Do not add opinions or information not present in the text."
        )
        user_prompt_instruction = "Summarize the following text into a bulleted list adhering strictly to the rules outlined in the system message:"
        return system_message, user_prompt_instruction

    return None


async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
    """Generates summary using DeepSeek via OpenRouter API.

    Args:
        text: Content to summarise. Input longer than about 112,000 characters
            is truncated and a truncation marker is appended.
        summary_type: "paragraph" or "points".
        api_key: OpenRouter API key, sent as a Bearer token.

    Returns:
        The summary on success. On any failure a human-readable message
        starting with "Error:" or "Sorry," is returned instead; this function
        does not raise.
    """
    # Measure defensively: a None/empty text must reach the guard below
    # instead of raising TypeError from len() before any handler is active.
    input_length = len(text) if text else 0
    logger.info(f"Generating '{summary_type}' summary. Input length: {input_length}")
    if not api_key:
        logger.error("OpenRouter API key missing.")
        return "Error: AI service configuration key is missing."
    if not text or not text.strip():
        logger.warning("generate_summary called with empty or whitespace-only text.")
        return "Error: No content was provided to summarize."

    openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
    model_name = "deepseek/deepseek-chat:free"

    prompts = _summary_prompts(summary_type)
    if prompts is None:
        logger.error(f"Invalid summary_type '{summary_type}' requested.")
        return f"Error: Invalid summary type ('{summary_type}') requested. Please choose 'paragraph' or 'points'."
    system_message, user_prompt_instruction = prompts

    # Rough size cap: estimated token budget times average characters per token.
    MAX_INPUT_TOKENS_ESTIMATE = 28000
    AVG_CHARS_PER_TOKEN = 4
    MAX_INPUT_LENGTH = MAX_INPUT_TOKENS_ESTIMATE * AVG_CHARS_PER_TOKEN

    if len(text) > MAX_INPUT_LENGTH:
        logger.warning(f"Input text length ({len(text)} chars) exceeds estimated limit ({MAX_INPUT_LENGTH}). Truncating.")
        truncation_marker = "\n\n[... Text truncated due to length ...]"
        text = text[:MAX_INPUT_LENGTH - len(truncation_marker)] + truncation_marker

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": f"{user_prompt_instruction}\n\n--- TEXT TO SUMMARIZE ---\n\n{text}\n\n--- END OF TEXT ---"}
    ]

    # The Referer is built from SPACE_HOST, with a scheme added when missing.
    space_host = os.environ.get("SPACE_HOST", "huggingface.co/spaces/YOUR_SPACE_NAME")
    if space_host and not space_host.startswith("http"):
        referer_url = f"https://{space_host}"
    else:
        referer_url = space_host or "https://huggingface.co"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": referer_url,
        "X-Title": "Telegram URL Summarizer Bot"
    }
    payload = json.dumps({"model": model_name, "messages": messages})

    try:
        logger.debug(f"Sending request to OpenRouter (Model: {model_name}). Prompt length approx: {len(text)} chars.")
        # requests is blocking, so run it in a worker thread.
        response = await asyncio.to_thread(requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=120)
        logger.debug(f"Received status {response.status_code} from OpenRouter.")

        if response.status_code == 200:
            try:
                data = response.json()
                choice = data.get("choices", [{}])[0]
                message = choice.get("message", {})
                summary = message.get("content")
                finish_reason = choice.get("finish_reason")

                if summary and isinstance(summary, str) and summary.strip():
                    summary = summary.strip()
                    logger.info(f"Successfully generated summary. Finish Reason: {finish_reason}. Length: {len(summary)}")
                    word_count = len(summary.split())
                    # Only logged, not enforced: the prompt asks for 85 words.
                    if summary_type == "paragraph" and word_count > 95:
                        logger.warning(f"Generated paragraph summary slightly longer than target word count ({word_count} words).")
                    return summary

                logger.warning(f"OpenRouter returned status 200 but summary content is missing or empty. Response data: {data}")
                return "Sorry, the AI model returned an empty summary. The content might have been unsuitable."

            except (json.JSONDecodeError, IndexError, KeyError, AttributeError) as e:
                logger.error(f"Failed to parse successful (200) response from OpenRouter. Error: {e}. Response Text: {response.text[:500]}...", exc_info=True)
                return "Sorry, there was an issue parsing the response from the AI service."
            except Exception as e:
                logger.error(f"Unexpected error processing OpenRouter success response: {e}", exc_info=True)
                return "Sorry, an unexpected error occurred while processing the AI response."

        elif response.status_code == 401:
            logger.error("OpenRouter API key is invalid (Unauthorized - 401).")
            return "Error: AI service authentication failed. Please check the configuration."
        elif response.status_code == 402:
            logger.error("OpenRouter Payment Required (402). Check credits/limits.")
            return "Sorry, there's an issue with the AI service account limits or payment."
        elif response.status_code == 429:
            logger.warning("OpenRouter Rate Limit Hit (429).")
            return "Sorry, the AI model is currently busy due to high demand. Please try again in a moment."
        elif response.status_code == 400:
            logger.error(f"OpenRouter Bad Request (400). Likely prompt issue. Response: {response.text[:500]}...")
            return "Sorry, the request to the AI service was invalid (possibly due to the content or prompt)."
        elif response.status_code >= 500:
            logger.error(f"OpenRouter Server Error ({response.status_code}). Response: {response.text[:500]}...")
            return "Sorry, the AI service is experiencing internal issues. Please try again later."
        else:
            logger.error(f"Unexpected HTTP status {response.status_code} from OpenRouter. Response: {response.text[:500]}...")
            try:
                error_data = response.json()
                error_msg = error_data.get("error", {}).get("message", response.text[:100])
                return f"Sorry, the AI service returned an error ({response.status_code}): {error_msg}"
            except (ValueError, AttributeError):
                # ValueError covers json.JSONDecodeError (its subclass);
                # AttributeError covers a body whose "error" is not a dict.
                return f"Sorry, the AI service returned an unexpected error (Status: {response.status_code})."

    except requests.exceptions.Timeout:
        logger.error("Timeout connecting to OpenRouter API.")
        return "Sorry, the request to the AI model timed out. Please try again."
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error connecting to OpenRouter API: {e}")
        return "Sorry, there was a network error connecting to the AI model service."
    except Exception as e:
        logger.error(f"Unexpected error occurred within generate_summary function: {e}", exc_info=True)
        return "Sorry, an unexpected internal error occurred while generating the summary."
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
# --- Background Task Processing ---
|
| 504 |
|
|
|
|
| 505 |
async def process_summary_task(
|
| 506 |
user_id: int,
|
| 507 |
chat_id: int,
|
| 508 |
message_id_to_edit: int,
|
| 509 |
url: str,
|
| 510 |
summary_type: str,
|
| 511 |
-
|
| 512 |
) -> None:
|
| 513 |
"""Handles the actual fetching and summarization in a background task."""
|
| 514 |
-
task_id =
|
| 515 |
-
logger.info(f"[Task {task_id}] Starting processing for
|
| 516 |
-
|
| 517 |
-
#
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
current_supadata_key = os.environ.get('SUPADATA_API_KEY')
|
| 521 |
-
current_apify_token = os.environ.get('APIFY_API_TOKEN')
|
| 522 |
-
# Keys check (already done in handler, but good for task log context)
|
| 523 |
-
keys_present = f"OR={'Y' if current_openrouter_key else 'N'}, UTT={'Y' if current_urltotext_key else 'N'}, SD={'Y' if current_supadata_key else 'N'}, AP={'Y' if current_apify_token else 'N'}"
|
| 524 |
-
logger.debug(f"[Task {task_id}] API Key check: {keys_present}")
|
| 525 |
-
|
| 526 |
-
if not current_openrouter_key:
|
| 527 |
-
logger.error(f"[Task {task_id}] CRITICAL: OpenRouter API key is missing. Cannot generate summary.")
|
| 528 |
-
try:
|
| 529 |
-
# Edit the original message to show the config error
|
| 530 |
-
await bot.edit_message_text(
|
| 531 |
-
chat_id=chat_id,
|
| 532 |
-
message_id=message_id_to_edit,
|
| 533 |
-
text="❌ Configuration Error: The AI summarization service is not configured correctly. Please contact the administrator."
|
| 534 |
-
)
|
| 535 |
-
except Exception as edit_err:
|
| 536 |
-
logger.error(f"[Task {task_id}] Failed to edit message for missing AI key: {edit_err}")
|
| 537 |
-
return # Stop task
|
| 538 |
-
|
| 539 |
-
# --- Inform User Processing Has Started ---
|
| 540 |
-
processing_message_text = f"⏳ Working on your '{summary_type}' summary for the link...\n_(This might take up to a minute depending on the content)_"
|
| 541 |
-
status_message_sent_id = None # Track if we sent a separate message
|
| 542 |
-
|
| 543 |
try:
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
content = None
|
| 568 |
-
user_feedback_message = None
|
| 569 |
-
success = False
|
| 570 |
|
| 571 |
-
|
| 572 |
-
# Send 'typing' action to indicate activity
|
| 573 |
-
try:
|
| 574 |
-
logger.debug(f"[Task {task_id}] Sending 'typing' chat action to chat {chat_id}")
|
| 575 |
-
await bot.send_chat_action(chat_id=chat_id, action='typing')
|
| 576 |
-
except Exception as ca_err:
|
| 577 |
-
logger.warning(f"[Task {task_id}] Failed sending 'typing' action: {ca_err}")
|
| 578 |
|
| 579 |
-
# ---
|
| 580 |
-
|
| 581 |
-
|
|
|
|
| 582 |
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
else:
|
| 594 |
-
logger.
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
|
|
|
|
|
|
|
|
|
| 600 |
if content:
|
| 601 |
-
logger.info(f"[Task {task_id}]
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
try: await bot.send_chat_action(chat_id=chat_id, action='typing'); logger.debug("[Task {task_id}] Sent typing before fallback scrape.")
|
| 606 |
-
except: pass
|
| 607 |
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
if content:
|
| 611 |
-
logger.info(f"[Task {task_id}] Website scrape successful via URLToText API. Length: {len(content)}")
|
| 612 |
-
else:
|
| 613 |
-
logger.warning(f"[Task {task_id}] Fallback website scrape (URLToText API) also failed for {url}.")
|
| 614 |
-
user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website using available methods. It might be protected or structured in a way I can't parse."
|
| 615 |
else:
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
if summary.startswith("Error:") or summary.startswith("Sorry,"):
|
| 628 |
-
logger.warning(f"[Task {task_id}] AI summary generation failed. Reason: {summary}")
|
| 629 |
-
user_feedback_message = f"⚠️ {summary}"
|
| 630 |
-
else:
|
| 631 |
-
# --- Summary Success - Send to User ---
|
| 632 |
-
logger.info(f"[Task {task_id}] Summary generated successfully. Length: {len(summary)}. Sending result.")
|
| 633 |
-
try:
|
| 634 |
-
await bot.send_message(
|
| 635 |
-
chat_id=chat_id,
|
| 636 |
-
text=summary,
|
| 637 |
-
parse_mode=ParseMode.MARKDOWN,
|
| 638 |
-
link_preview_options={'is_disabled': True}
|
| 639 |
-
)
|
| 640 |
success = True
|
| 641 |
-
user_feedback_message = None
|
| 642 |
-
logger.info(f"[Task {task_id}] Successfully sent summary to chat {chat_id}.")
|
| 643 |
-
except Exception as send_final_err:
|
| 644 |
-
logger.error(f"[Task {task_id}] Failed sending final summary to chat {chat_id}: {send_final_err}", exc_info=True)
|
| 645 |
-
user_feedback_message = "⚠️ Sorry, an unexpected error occurred while sending the final summary."
|
| 646 |
-
success = False
|
| 647 |
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
|
| 652 |
# --- Send Final Feedback Message if Processing Failed ---
|
| 653 |
if user_feedback_message and not success:
|
| 654 |
-
|
| 655 |
-
|
| 656 |
await bot.send_message(chat_id=chat_id, text=user_feedback_message)
|
| 657 |
-
|
| 658 |
-
|
| 659 |
|
| 660 |
except Exception as e:
|
| 661 |
-
|
| 662 |
-
logger.error(f"[Task {task_id}] Unexpected critical error during task processing for user {user_id}, URL {url}: {e}", exc_info=True)
|
| 663 |
try:
|
| 664 |
-
await bot.send_message(
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
|
|
|
| 669 |
finally:
|
| 670 |
# --- Clean up Status Message(s) ---
|
| 671 |
-
logger.debug(f"[Task {task_id}] Cleaning up status message(s). Success={success}")
|
| 672 |
try:
|
| 673 |
if status_message_sent_id:
|
| 674 |
-
# If we sent a separate "Working..." message, delete it.
|
| 675 |
await bot.delete_message(chat_id=chat_id, message_id=status_message_sent_id)
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
except Exception as del_e:
|
| 694 |
-
logger.warning(f"[Task {task_id}] Could not delete status/button message during cleanup: {del_e!r}")
|
| 695 |
|
| 696 |
-
logger.info(f"[Task {task_id}]
|
| 697 |
|
| 698 |
# --- Telegram Bot Handlers ---
|
| 699 |
-
|
| 700 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
| 701 |
"""Handles the /start command."""
|
| 702 |
user = update.effective_user
|
| 703 |
if not user: return
|
| 704 |
-
logger.info(f"User {user.id}
|
| 705 |
mention = user.mention_html() if user.username else user.first_name
|
| 706 |
start_message = (
|
| 707 |
f"👋 Hello {mention}!\n\n"
|
|
@@ -717,15 +290,14 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
|
|
| 717 |
logger.info(f"User {user.id if user else '?'} requested /help.")
|
| 718 |
help_text = (
|
| 719 |
"**How to Use Me:**\n"
|
| 720 |
-
"1.
|
| 721 |
-
"2.
|
| 722 |
-
"3.
|
| 723 |
-
"4.
|
| 724 |
"**Important Notes:**\n"
|
| 725 |
-
"- **YouTube:** Getting transcripts can sometimes fail if they are disabled
|
| 726 |
-
"- **Websites:**
|
| 727 |
-
"- **AI Summaries:** The AI tries its best to be accurate
|
| 728 |
-
"- **Length:** Very long articles or videos might be truncated before summarization to fit within processing limits.\n\n"
|
| 729 |
"Just send a link to get started!"
|
| 730 |
)
|
| 731 |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
|
|
@@ -742,9 +314,8 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
|
|
| 742 |
|
| 743 |
if match:
|
| 744 |
url = match.group(0)
|
| 745 |
-
logger.info(f"User {user.id} sent
|
| 746 |
context.user_data['url_to_summarize'] = url
|
| 747 |
-
logger.debug(f"Stored URL '{url}' in user_data for user {user.id}")
|
| 748 |
|
| 749 |
keyboard = [
|
| 750 |
[
|
|
@@ -759,123 +330,83 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
|
|
| 759 |
parse_mode=ParseMode.MARKDOWN,
|
| 760 |
link_preview_options={'is_disabled': True}
|
| 761 |
)
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
await update.message.reply_text("Please send me a valid URL (starting with http:// or https://) to summarize.")
|
| 765 |
-
|
| 766 |
|
| 767 |
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
| 768 |
-
"""Handles button presses
|
| 769 |
query = update.callback_query
|
| 770 |
if not query or not query.from_user or not query.message:
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
except Exception: pass
|
| 776 |
return
|
| 777 |
-
user = query.from_user
|
| 778 |
-
|
| 779 |
-
# We skip query.answer() here to avoid potential connection issues.
|
| 780 |
-
# The button might stay loading visually for the user.
|
| 781 |
|
|
|
|
|
|
|
|
|
|
| 782 |
summary_type = query.data
|
| 783 |
url = context.user_data.get('url_to_summarize')
|
| 784 |
-
query_id = query.id
|
| 785 |
|
| 786 |
-
logger.info(f"User {user.id} chose summary type '{summary_type}'
|
| 787 |
|
| 788 |
if not url:
|
| 789 |
-
logger.warning(f"
|
| 790 |
try:
|
| 791 |
await query.edit_message_text(text="⚠️ Oops! I lost the context for that link. Please send the link again.")
|
| 792 |
-
except Exception as
|
| 793 |
-
logger.error(f"Failed to edit message
|
| 794 |
-
# Still try to answer the query if editing failed
|
| 795 |
-
try: await query.answer("Error processing request.")
|
| 796 |
-
except Exception: pass
|
| 797 |
return
|
| 798 |
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
# Extract necessary IDs before clearing data
|
| 802 |
-
user_id = user.id
|
| 803 |
-
chat_id = query.message.chat_id
|
| 804 |
-
message_id_to_edit = query.message.message_id
|
| 805 |
-
bot_instance = context.bot # Get the bot instance from context
|
| 806 |
-
|
| 807 |
-
# Clear the URL from context *before* scheduling the task
|
| 808 |
context.user_data.pop('url_to_summarize', None)
|
| 809 |
-
logger.debug(f"Cleared URL from user_data for user {user_id} (Query {query_id})")
|
| 810 |
|
| 811 |
-
# Schedule
|
| 812 |
-
# Pass all required data explicitly
|
| 813 |
asyncio.create_task(
|
| 814 |
process_summary_task(
|
| 815 |
-
user_id=
|
| 816 |
-
chat_id=chat_id,
|
| 817 |
-
message_id_to_edit=
|
| 818 |
url=url,
|
| 819 |
summary_type=summary_type,
|
| 820 |
-
|
| 821 |
),
|
| 822 |
-
name=f"SummaryTask-{
|
| 823 |
)
|
| 824 |
|
| 825 |
-
# Log that the task was scheduled and the handler is returning.
|
| 826 |
-
logger.debug(f"Callback handler for Query {query_id} finished after scheduling task.")
|
| 827 |
-
# DO NOT await the task here. Let the handler return immediately.
|
| 828 |
-
|
| 829 |
-
|
| 830 |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Log errors reported to the application's error handler.

    Args:
        update: The update being processed when the error occurred; it is only
            included in the log output of the "unknown error" branch.
        context: Callback context; ``context.error`` holds the exception, if any.
    """
    err = context.error

    # Guard clause: the handler was invoked without an attached exception.
    if not err:
        logger.error(f"Unknown error occurred. Update: {update} | Context: {context}")
        return

    # Always record the exception together with its traceback.
    logger.error(f"Exception while handling an update: {err}", exc_info=err)

    # Add a short, greppable hint for the common connectivity failures.
    # TimedOut is tested first; in python-telegram-bot it derives from
    # NetworkError, so the reverse order would never reach the timeout branch.
    if isinstance(err, TimedOut):
        logger.warning("A timeout error occurred in PTB communication.")
    elif isinstance(err, NetworkError):
        logger.warning(f"A network error occurred: {err}")
|
| 843 |
-
|
| 844 |
|
| 845 |
# --- Bot Setup Function ---
|
| 846 |
async def setup_bot_config() -> Application:
|
| 847 |
-
"""Configures the PTB Application
|
| 848 |
logger.info("Configuring Telegram Application...")
|
| 849 |
if not TELEGRAM_TOKEN:
|
| 850 |
-
logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable not found.")
|
| 851 |
raise ValueError("TELEGRAM_TOKEN environment variable not set.")
|
| 852 |
|
| 853 |
-
connect_timeout = 10.0
|
| 854 |
-
read_timeout = 30.0
|
| 855 |
-
write_timeout = 30.0
|
| 856 |
-
pool_timeout = 30.0
|
| 857 |
-
|
| 858 |
-
logger.info(f"Creating PTB HTTPXRequest (v20 compatible) with settings: "
|
| 859 |
-
f"connect_timeout={connect_timeout}, read_timeout={read_timeout}, "
|
| 860 |
-
f"write_timeout={write_timeout}, pool_timeout={pool_timeout}. "
|
| 861 |
-
f"(Pool size uses httpx default)")
|
| 862 |
-
|
| 863 |
custom_request = HTTPXRequest(
|
| 864 |
-
connect_timeout=
|
| 865 |
-
read_timeout=
|
| 866 |
-
write_timeout=
|
| 867 |
-
pool_timeout=
|
| 868 |
http_version="1.1"
|
| 869 |
)
|
| 870 |
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
|
|
|
|
|
|
|
|
|
| 874 |
|
| 875 |
application.add_handler(CommandHandler("start", start))
|
| 876 |
application.add_handler(CommandHandler("help", help_command))
|
| 877 |
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
|
| 878 |
-
application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
|
| 879 |
application.add_error_handler(error_handler)
|
| 880 |
|
| 881 |
logger.info("Telegram application handlers configured.")
|
|
@@ -890,140 +421,78 @@ async def lifespan(app: Starlette):
|
|
| 890 |
|
| 891 |
try:
|
| 892 |
ptb_app = await setup_bot_config()
|
| 893 |
-
logger.info("PTB Application object configured. Initializing...")
|
| 894 |
await ptb_app.initialize()
|
| 895 |
-
|
| 896 |
-
await ptb_app.start() # Starts dispatcher, job queue, etc. but NOT polling
|
| 897 |
|
| 898 |
-
|
| 899 |
-
|
| 900 |
-
logger.info(f"PTB Application started successfully. Bot ID: {bot_info.id}, Username: @{bot_info.username}")
|
| 901 |
|
| 902 |
WEBHOOK_URL_BASE = os.environ.get("SPACE_HOST")
|
| 903 |
if WEBHOOK_URL_BASE:
|
| 904 |
-
if not WEBHOOK_URL_BASE.startswith("https://"):
|
|
|
|
| 905 |
webhook_path = "/webhook"
|
| 906 |
full_webhook_url = f"{WEBHOOK_URL_BASE.rstrip('/')}{webhook_path}"
|
| 907 |
|
| 908 |
-
logger.info(f"
|
| 909 |
-
await asyncio.sleep(2.0)
|
| 910 |
try:
|
| 911 |
-
await
|
| 912 |
url=full_webhook_url,
|
| 913 |
allowed_updates=Update.ALL_TYPES,
|
| 914 |
-
|
| 915 |
)
|
| 916 |
-
webhook_info = await
|
| 917 |
-
|
| 918 |
-
logger.info(f"Telegram webhook set successfully! Current info: {webhook_info}")
|
| 919 |
-
elif webhook_info:
|
| 920 |
-
logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got: {webhook_info.url}. Info: {webhook_info}")
|
| 921 |
-
else:
|
| 922 |
-
logger.error("Failed to get webhook info after setting webhook.")
|
| 923 |
-
except RetryAfter as e:
|
| 924 |
-
logger.warning(f"Webhook setting throttled by Telegram (RetryAfter: {e.retry_after}s). Another instance likely succeeded or try again later.")
|
| 925 |
-
await asyncio.sleep(e.retry_after or 2)
|
| 926 |
-
try:
|
| 927 |
-
webhook_info = await bot_instance.get_webhook_info()
|
| 928 |
-
logger.info(f"Webhook info after RetryAfter delay: {webhook_info}")
|
| 929 |
-
except Exception as get_info_err:
|
| 930 |
-
logger.error(f"Failed to get webhook info after RetryAfter delay: {get_info_err}")
|
| 931 |
except Exception as e:
|
| 932 |
-
|
| 933 |
-
else:
|
| 934 |
-
logger.warning("SPACE_HOST environment variable not found. Cannot set webhook automatically. Bot will not receive updates via webhook.")
|
| 935 |
|
| 936 |
-
logger.info("ASGI Lifespan: Startup complete.
|
| 937 |
-
yield
|
| 938 |
|
| 939 |
except Exception as startup_err:
|
| 940 |
-
logger.critical(f"
|
| 941 |
-
# Log traceback explicitly before raising might help in some environments
|
| 942 |
-
traceback.print_exc()
|
| 943 |
raise
|
| 944 |
finally:
|
| 945 |
-
# --- Shutdown Sequence ---
|
| 946 |
logger.info("ASGI Lifespan: Shutdown sequence initiated...")
|
| 947 |
if ptb_app:
|
| 948 |
-
|
| 949 |
-
|
| 950 |
-
|
| 951 |
-
|
| 952 |
-
|
| 953 |
-
|
| 954 |
-
await ptb_app.stop()
|
| 955 |
-
logger.info("Shutting down PTB Application connections and resources...")
|
| 956 |
-
await ptb_app.shutdown()
|
| 957 |
-
logger.info("PTB Application shut down gracefully.")
|
| 958 |
-
except Exception as shutdown_err:
|
| 959 |
-
logger.error(f"Error during PTB Application shutdown: {shutdown_err}", exc_info=True)
|
| 960 |
-
else:
|
| 961 |
-
logger.warning("PTB Application instance exists but was not marked as running at shutdown.")
|
| 962 |
-
try:
|
| 963 |
-
await ptb_app.shutdown()
|
| 964 |
-
logger.info("Attempted shutdown of non-running PTB app completed.")
|
| 965 |
-
except Exception as shutdown_err:
|
| 966 |
-
logger.error(f"Error during shutdown of non-running PTB app: {shutdown_err}", exc_info=True)
|
| 967 |
-
else:
|
| 968 |
-
logger.warning("No PTB Application instance (ptb_app) found during ASGI shutdown.")
|
| 969 |
logger.info("ASGI Lifespan: Shutdown complete.")
|
| 970 |
|
| 971 |
-
|
| 972 |
-
# --- Flask App Setup (for Webhook Route) ---
|
| 973 |
flask_core_app = Flask(__name__)
|
| 974 |
-
logger.info("Core Flask app instance created (used by Starlette for routing).")
|
| 975 |
|
| 976 |
-
# --- Define Flask Routes ---
|
| 977 |
@flask_core_app.route('/')
|
| 978 |
def index():
|
| 979 |
"""Basic health check endpoint."""
|
| 980 |
-
|
| 981 |
-
bot_status = "Unknown / Not Initialized"
|
| 982 |
if ptb_app and ptb_app.bot:
|
| 983 |
-
|
| 984 |
-
|
| 985 |
-
return f"Telegram Bot Summarizer - Status: {bot_status} - Listening via Starlette/Uvicorn."
|
| 986 |
|
| 987 |
@flask_core_app.route('/webhook', methods=['POST'])
|
| 988 |
async def webhook() -> Response:
|
| 989 |
"""Webhook endpoint called by Telegram."""
|
| 990 |
-
global ptb_app
|
| 991 |
-
|
| 992 |
if not ptb_app:
|
| 993 |
-
|
| 994 |
-
return Response('Bot service is not configured or failed during startup.', status=503)
|
| 995 |
|
| 996 |
-
is_running = getattr(ptb_app, '_running', False)
|
| 997 |
-
if not is_running:
|
| 998 |
-
logger.error("Webhook triggered, but PTB Application is not currently running.")
|
| 999 |
-
return Response('Bot service is initialized but not actively running.', status=503)
|
| 1000 |
-
|
| 1001 |
-
logger.debug("Webhook endpoint received POST request from Telegram.")
|
| 1002 |
try:
|
| 1003 |
update_data = request.get_json()
|
| 1004 |
if not update_data:
|
| 1005 |
-
|
| 1006 |
-
return Response('Bad Request: Expected JSON payload.', status=400)
|
| 1007 |
|
| 1008 |
update = Update.de_json(update_data, ptb_app.bot)
|
| 1009 |
-
logger.debug(f"Processing update_id: {update.update_id} via webhook route.")
|
| 1010 |
-
|
| 1011 |
-
# Let PTB's dispatcher handle the update asynchronously
|
| 1012 |
-
# This will now call the appropriate handler (e.g., handle_summary_type_callback)
|
| 1013 |
-
# which will *quickly* schedule the background task and return.
|
| 1014 |
await ptb_app.process_update(update)
|
| 1015 |
-
|
| 1016 |
-
logger.debug(f"Finished processing update_id: {update.update_id} in webhook handler (task scheduled).")
|
| 1017 |
-
# Return 200 OK immediately to Telegram
|
| 1018 |
return Response('ok', status=200)
|
| 1019 |
|
| 1020 |
-
except json.JSONDecodeError:
|
| 1021 |
-
logger.error("Failed to decode JSON from Telegram webhook request.", exc_info=True)
|
| 1022 |
-
return Response('Bad Request: Invalid JSON format.', status=400)
|
| 1023 |
except Exception as e:
|
| 1024 |
-
logger.error(f"
|
| 1025 |
-
return Response('Internal Server Error
|
| 1026 |
-
|
| 1027 |
|
| 1028 |
# --- Create Starlette ASGI Application ---
|
| 1029 |
app = Starlette(
|
|
@@ -1033,25 +502,10 @@ app = Starlette(
|
|
| 1033 |
Mount("/", app=WSGIMiddleware(flask_core_app))
|
| 1034 |
]
|
| 1035 |
)
|
| 1036 |
-
logger.info("Starlette ASGI application created
|
| 1037 |
-
|
| 1038 |
|
| 1039 |
# --- Development Server Execution Block ---
|
| 1040 |
if __name__ == '__main__':
|
| 1041 |
-
logger.warning("
|
| 1042 |
-
|
| 1043 |
-
|
| 1044 |
-
logger.warning("This mode starts the Flask development server.")
|
| 1045 |
-
logger.warning("!!! IT DOES **NOT** RUN THE ASGI LIFESPAN !!!")
|
| 1046 |
-
logger.warning("!!! The Telegram Bot (PTB Application) WILL NOT INITIALIZE OR RUN !!!")
|
| 1047 |
-
logger.warning("This is suitable ONLY for verifying Flask routes locally.")
|
| 1048 |
-
logger.warning("For proper testing/deployment, use: uvicorn main:app --reload --port 8080")
|
| 1049 |
-
logger.warning("or via Gunicorn: gunicorn -c gunicorn.conf.py main:app")
|
| 1050 |
-
logger.warning("=" * 50)
|
| 1051 |
-
|
| 1052 |
-
if not TELEGRAM_TOKEN:
|
| 1053 |
-
logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable missing. Aborting direct Flask start.")
|
| 1054 |
-
else:
|
| 1055 |
-
local_port = int(os.environ.get('PORT', 8080))
|
| 1056 |
-
logger.info(f"Starting Flask development server on http://0.0.0.0:{local_port}")
|
| 1057 |
-
flask_core_app.run(host='0.0.0.0', port=local_port, debug=True, use_reloader=False)
|
|
|
|
| 1 |
+
# main.py (Revised with background task connection fixes)
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
import logging
|
| 5 |
import asyncio
|
| 6 |
import json
|
| 7 |
+
import html
|
| 8 |
+
import contextlib
|
| 9 |
+
import traceback
|
| 10 |
+
from typing import Optional
|
| 11 |
|
| 12 |
# --- Frameworks ---
|
| 13 |
+
from flask import Flask, request, Response
|
| 14 |
+
from starlette.applications import Starlette
|
| 15 |
+
from starlette.routing import Mount
|
| 16 |
+
from starlette.middleware.wsgi import WSGIMiddleware
|
| 17 |
|
| 18 |
# --- Telegram Bot ---
|
| 19 |
+
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot
|
| 20 |
from telegram.ext import (
|
| 21 |
Application,
|
| 22 |
CommandHandler,
|
|
|
|
| 26 |
CallbackQueryHandler,
|
| 27 |
)
|
| 28 |
from telegram.constants import ParseMode
|
| 29 |
+
from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest
|
| 30 |
from telegram.request import HTTPXRequest
|
| 31 |
|
| 32 |
# --- Other Libraries ---
|
|
|
|
| 34 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 35 |
import requests
|
| 36 |
from bs4 import BeautifulSoup
|
| 37 |
+
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
|
| 38 |
+
|
| 39 |
_apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
|
| 40 |
if _apify_token_exists:
|
| 41 |
from apify_client import ApifyClient
|
|
|
|
| 58 |
logger = logging.getLogger(__name__)
|
| 59 |
logger.info("Logging configured.")
|
| 60 |
|
| 61 |
+
# --- Global variable for PTB app ---
|
| 62 |
+
ptb_app: Optional[Application] = None
|
| 63 |
|
| 64 |
# --- Environment Variable Loading ---
|
| 65 |
logger.info("Attempting to load secrets...")
|
|
|
|
| 76 |
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
|
| 77 |
logger.info("Secret loading attempt finished.")
|
| 78 |
|
| 79 |
+
# --- Retry Decorator for Bot Operations ---
|
| 80 |
+
def retry_bot_operation(func):
    """Wrap an async bot call so transient failures are retried with backoff.

    The wrapped coroutine function is attempted up to three times, with an
    exponentially growing wait (clamped to 1-10 seconds) between attempts.
    ``NetworkError`` and ``RuntimeError`` trigger a retry, except for
    ``BadRequest``: it is a ``NetworkError`` subclass in python-telegram-bot but
    means Telegram rejected the request itself, so repeating it cannot succeed.

    Args:
        func: Coroutine function performing the bot operation.

    Returns:
        A coroutine function with the same call signature and metadata as
        ``func``.

    Raises:
        Whatever ends the retry loop: tenacity's ``RetryError`` once all
        attempts are used up, or the original exception when it is not
        retryable. The failure is logged once before being re-raised.
    """
    # Local imports keep this block self-contained; both packages are already
    # dependencies of this module.
    import functools
    from tenacity import retry_if_exception

    max_attempts = 3

    def _is_transient(exc):
        # Retry connectivity-style failures only; a BadRequest is permanent.
        return isinstance(exc, (NetworkError, RuntimeError)) and not isinstance(exc, BadRequest)

    def _log_retry(retry_state):
        logger.warning(
            f"Retrying bot operation due to {retry_state.outcome.exception()}. "
            f"Attempt {retry_state.attempt_number}/{max_attempts}"
        )

    retrying_call = retry(
        stop=stop_after_attempt(max_attempts),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        retry=retry_if_exception(_is_transient),
        before_sleep=_log_retry,
    )(func)

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        # The try/except sits OUTSIDE the retry loop so the failure is logged
        # once, after tenacity has given up, instead of once per attempt.
        try:
            return await retrying_call(*args, **kwargs)
        except Exception as e:
            logger.error(f"Operation failed after retries: {e}")
            raise

    return wrapper
|
| 97 |
|
| 98 |
+
# --- Helper Functions (unchanged from your original) ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
def is_youtube_url(url):
|
| 100 |
"""Checks if the URL is a valid YouTube video or shorts URL."""
|
| 101 |
youtube_regex = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
|
|
|
|
| 115 |
logger.warning(f"Could not extract YouTube ID from URL: {url}")
|
| 116 |
return None
|
| 117 |
|
| 118 |
+
# --- Content Fetching Functions (unchanged from your original) ---
|
| 119 |
+
# [Keep all your existing content fetching functions exactly as they were]
|
| 120 |
+
# get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript,
|
| 121 |
+
# get_website_content_via_requests, get_website_content_via_urltotext_api, generate_summary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
+
# --- Revised Background Task Processing ---
|
| 124 |
async def process_summary_task(
    user_id: int,
    chat_id: int,
    message_id_to_edit: int,
    url: str,
    summary_type: str,
    bot_token: str  # Token is passed instead of a bot instance; the task builds its own Bot
) -> None:
    """Fetch the content behind ``url``, summarize it and report back to the chat.

    Runs as a background task. It first turns the button message into a
    "working" status (or sends a separate status message if editing fails),
    then fetches a YouTube transcript or website text, asks the AI for a summary
    and sends it. On failure a feedback message is sent instead. The status
    message is always cleaned up and the task's own Bot is shut down.

    Args:
        user_id: Telegram user id; used only to build the task id for logging.
        chat_id: Chat that receives status, summary and error messages.
        message_id_to_edit: Id of the message to turn into the status message.
        url: Link to summarize.
        summary_type: Summary style, forwarded to ``generate_summary``.
        bot_token: Token used to create a dedicated ``Bot`` for this task.
    """
    task_id = f"{user_id}-{message_id_to_edit}"
    logger.info(f"[Task {task_id}] Starting processing for URL: {url}")

    # Create a new bot instance for this task
    bot = Bot(token=bot_token)

    # Initialized before the try block so the except/finally sections and the
    # final log line can always read them, even if the very first call fails.
    status_message_sent_id = None
    content = None
    user_feedback_message = None
    success = False

    try:
        # Pair initialize() with shutdown() in the finally block; shutdown()
        # only releases the HTTP connections of an initialized Bot.
        await bot.initialize()

        # --- Inform User Processing Has Started ---
        processing_message_text = f"⏳ Working on your '{summary_type}' summary for the link...\n_(This might take up to a minute depending on the content)_"

        @retry_bot_operation
        async def edit_or_send_status():
            nonlocal status_message_sent_id, message_id_to_edit
            # On a retry the original message has already been given up on
            # (set to None below), so do not attempt to edit it again.
            if message_id_to_edit is not None:
                try:
                    await bot.edit_message_text(
                        chat_id=chat_id,
                        message_id=message_id_to_edit,
                        text=processing_message_text
                    )
                    logger.debug(f"[Task {task_id}] Successfully edited message {message_id_to_edit}")
                    return
                except (TimedOut, NetworkError, BadRequest) as e:
                    logger.warning(f"[Task {task_id}] Could not edit original message: {e}. Sending new status message.")
                    message_id_to_edit = None

            status_message = await bot.send_message(
                chat_id=chat_id,
                text=processing_message_text
            )
            status_message_sent_id = status_message.message_id
            logger.debug(f"[Task {task_id}] Sent new status message {status_message_sent_id}")

        await edit_or_send_status()

        # --- Main Content Fetching and Summarization ---
        try:
            # Send 'typing' action
            @retry_bot_operation
            async def send_typing():
                await bot.send_chat_action(chat_id=chat_id, action='typing')

            await send_typing()

            # --- Determine Content Type and Fetch ---
            is_yt = is_youtube_url(url)
            logger.debug(f"[Task {task_id}] URL is YouTube: {is_yt}")

            if is_yt:
                video_id = extract_youtube_id(url)
                if video_id:
                    logger.info(f"[Task {task_id}] Fetching YouTube transcript for {video_id}")
                    content = await get_youtube_transcript(
                        video_id,
                        url,
                        SUPADATA_API_KEY,
                        APIFY_API_TOKEN
                    )
                if not content:
                    user_feedback_message = "⚠️ Sorry, I couldn't retrieve the transcript for that YouTube video."
            else:
                logger.info(f"[Task {task_id}] Attempting website scrape for: {url}")
                content = await get_website_content_via_requests(url)
                if not content and URLTOTEXT_API_KEY:
                    # Direct scrape failed; fall back to the URLToText API.
                    await send_typing()
                    content = await get_website_content_via_urltotext_api(url, URLTOTEXT_API_KEY)
                if not content:
                    user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website."

            # --- Generate Summary if Content Was Fetched ---
            if content:
                logger.info(f"[Task {task_id}] Generating '{summary_type}' summary")
                await send_typing()

                summary = await generate_summary(content, summary_type, OPENROUTER_API_KEY)

                # generate_summary reports failures as text with these prefixes.
                if summary.startswith(("Error:", "Sorry,")):
                    user_feedback_message = f"⚠️ {summary}"
                else:
                    @retry_bot_operation
                    async def send_summary():
                        await bot.send_message(
                            chat_id=chat_id,
                            text=summary,
                            parse_mode=ParseMode.MARKDOWN,
                            link_preview_options={'is_disabled': True}
                        )

                    await send_summary()
                    success = True

        except Exception as e:
            logger.error(f"[Task {task_id}] Error during processing: {e}", exc_info=True)
            user_feedback_message = "❌ An unexpected error occurred while processing your request."

        # --- Send Final Feedback Message if Processing Failed ---
        if user_feedback_message and not success:
            @retry_bot_operation
            async def send_feedback():
                await bot.send_message(chat_id=chat_id, text=user_feedback_message)

            await send_feedback()

    except Exception as e:
        logger.error(f"[Task {task_id}] Critical error in task: {e}", exc_info=True)
        try:
            await bot.send_message(
                chat_id=chat_id,
                text="❌ A critical error occurred. Please try again later."
            )
        except Exception:
            # Best effort only: the connection may be the thing that is broken.
            pass
    finally:
        # --- Clean up Status Message(s) ---
        try:
            if status_message_sent_id:
                # A separate status message was sent; remove it.
                await bot.delete_message(chat_id=chat_id, message_id=status_message_sent_id)
            elif message_id_to_edit and success:
                # The summary was delivered; the edited status message is obsolete.
                await bot.delete_message(chat_id=chat_id, message_id=message_id_to_edit)
            elif message_id_to_edit and not success:
                final_error_text = user_feedback_message or "❌ An error occurred."
                await bot.edit_message_text(
                    chat_id=chat_id,
                    message_id=message_id_to_edit,
                    text=final_error_text[:4090]
                )
        except Exception as e:
            logger.warning(f"[Task {task_id}] Cleanup error: {e}")

        # Release the HTTP connections owned by this task's Bot. telegram.Bot
        # has no `session` attribute; shutdown() is its documented counterpart
        # to initialize().
        try:
            await bot.shutdown()
        except Exception as e:
            logger.warning(f"[Task {task_id}] Failed to shut down bot: {e}")

    logger.info(f"[Task {task_id}] Task completed. Success: {success}")
|
| 271 |
|
| 272 |
# --- Telegram Bot Handlers ---
|
|
|
|
| 273 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
| 274 |
"""Handles the /start command."""
|
| 275 |
user = update.effective_user
|
| 276 |
if not user: return
|
| 277 |
+
logger.info(f"User {user.id} initiated /start.")
|
| 278 |
mention = user.mention_html() if user.username else user.first_name
|
| 279 |
start_message = (
|
| 280 |
f"👋 Hello {mention}!\n\n"
|
|
|
|
| 290 |
logger.info(f"User {user.id if user else '?'} requested /help.")
|
| 291 |
help_text = (
|
| 292 |
"**How to Use Me:**\n"
|
| 293 |
+
"1. Send me a direct link (URL) to a YouTube video or a web article.\n"
|
| 294 |
+
"2. I will ask you to choose the summary format: `Paragraph` or `Points`.\n"
|
| 295 |
+
"3. Click the button for your preferred format.\n"
|
| 296 |
+
"4. I'll fetch the content, summarise it using AI, and send it back to you!\n\n"
|
| 297 |
"**Important Notes:**\n"
|
| 298 |
+
"- **YouTube:** Getting transcripts can sometimes fail if they are disabled or unavailable.\n"
|
| 299 |
+
"- **Websites:** Complex websites might not work perfectly.\n"
|
| 300 |
+
"- **AI Summaries:** The AI tries its best to be accurate.\n\n"
|
|
|
|
| 301 |
"Just send a link to get started!"
|
| 302 |
)
|
| 303 |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
|
|
|
|
| 314 |
|
| 315 |
if match:
|
| 316 |
url = match.group(0)
|
| 317 |
+
logger.info(f"User {user.id} sent URL: {url}")
|
| 318 |
context.user_data['url_to_summarize'] = url
|
|
|
|
| 319 |
|
| 320 |
keyboard = [
|
| 321 |
[
|
|
|
|
| 330 |
parse_mode=ParseMode.MARKDOWN,
|
| 331 |
link_preview_options={'is_disabled': True}
|
| 332 |
)
|
| 333 |
+
elif not message_text.startswith('/'):
|
| 334 |
+
await update.message.reply_text("Please send me a valid URL (starting with http:// or https://) to summarize.")
|
|
|
|
|
|
|
| 335 |
|
| 336 |
# Strong references to in-flight summary tasks. The event loop itself only
# keeps weak references to tasks, so a task nobody else references could be
# garbage collected before it finishes.
_background_summary_tasks: set = set()


async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle the inline-button press that selects the summary type.

    Acknowledges the callback query, reads the URL stored under
    ``context.user_data['url_to_summarize']`` and schedules
    ``process_summary_task`` as a background task so this handler returns
    without waiting for the summary.

    Args:
        update: Incoming update; only ``update.callback_query`` is used.
        context: Handler context; ``context.user_data`` supplies the URL,
            which is removed once the task has been scheduled.
    """
    query = update.callback_query
    if not query or not query.from_user or not query.message:
        # Incomplete callback: nothing can be processed. Still acknowledge it
        # when a query object exists so the client stops showing a spinner.
        if query:
            try:
                await query.answer()
            except Exception as e:
                logger.warning(f"Failed to answer incomplete callback query: {e}")
        return

    await query.answer()  # Acknowledge the button press immediately

    user = query.from_user
    summary_type = query.data
    url = context.user_data.get('url_to_summarize')

    logger.info(f"User {user.id} chose summary type '{summary_type}'")

    if not url:
        logger.warning(f"No URL found for user {user.id}")
        try:
            await query.edit_message_text(text="⚠️ Oops! I lost the context for that link. Please send the link again.")
        except Exception as e:
            logger.error(f"Failed to edit message: {e}")
        return

    # Clear the URL from context so a second press on the same keyboard
    # takes the "lost the context" branch above instead of re-running.
    context.user_data.pop('url_to_summarize', None)

    # Schedule background task with token instead of bot instance
    task = asyncio.create_task(
        process_summary_task(
            user_id=user.id,
            chat_id=query.message.chat_id,
            message_id_to_edit=query.message.message_id,
            url=url,
            summary_type=summary_type,
            bot_token=TELEGRAM_TOKEN
        ),
        name=f"SummaryTask-{user.id}-{query.message.message_id}"
    )
    # Hold the task until it completes, then drop the reference.
    _background_summary_tasks.add(task)
    task.add_done_callback(_background_summary_tasks.discard)
|
| 378 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Log the exception attached to ``context``, if there is one.

    Args:
        update: The object being handled when the error occurred (unused).
        context: Handler context whose ``error`` attribute holds the exception.
    """
    raised = context.error
    if not raised:
        return
    # Pass the exception as exc_info so its traceback is included in the log.
    logger.error(f"Exception while handling an update: {raised}", exc_info=raised)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
|
| 384 |
# --- Bot Setup Function ---
|
| 385 |
async def setup_bot_config() -> Application:
|
| 386 |
+
"""Configures the PTB Application."""
|
| 387 |
logger.info("Configuring Telegram Application...")
|
| 388 |
if not TELEGRAM_TOKEN:
|
|
|
|
| 389 |
raise ValueError("TELEGRAM_TOKEN environment variable not set.")
|
| 390 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
custom_request = HTTPXRequest(
|
| 392 |
+
connect_timeout=10.0,
|
| 393 |
+
read_timeout=30.0,
|
| 394 |
+
write_timeout=30.0,
|
| 395 |
+
pool_timeout=30.0,
|
| 396 |
http_version="1.1"
|
| 397 |
)
|
| 398 |
|
| 399 |
+
application = (
|
| 400 |
+
Application.builder()
|
| 401 |
+
.token(TELEGRAM_TOKEN)
|
| 402 |
+
.request(custom_request)
|
| 403 |
+
.build()
|
| 404 |
+
)
|
| 405 |
|
| 406 |
application.add_handler(CommandHandler("start", start))
|
| 407 |
application.add_handler(CommandHandler("help", help_command))
|
| 408 |
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
|
| 409 |
+
application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
|
| 410 |
application.add_error_handler(error_handler)
|
| 411 |
|
| 412 |
logger.info("Telegram application handlers configured.")
|
|
|
|
| 421 |
|
| 422 |
try:
|
| 423 |
ptb_app = await setup_bot_config()
|
|
|
|
| 424 |
await ptb_app.initialize()
|
| 425 |
+
await ptb_app.start()
|
|
|
|
| 426 |
|
| 427 |
+
bot_info = await ptb_app.bot.get_me()
|
| 428 |
+
logger.info(f"Bot started: @{bot_info.username}")
|
|
|
|
| 429 |
|
| 430 |
WEBHOOK_URL_BASE = os.environ.get("SPACE_HOST")
|
| 431 |
if WEBHOOK_URL_BASE:
|
| 432 |
+
if not WEBHOOK_URL_BASE.startswith("https://"):
|
| 433 |
+
WEBHOOK_URL_BASE = f"https://{WEBHOOK_URL_BASE}"
|
| 434 |
webhook_path = "/webhook"
|
| 435 |
full_webhook_url = f"{WEBHOOK_URL_BASE.rstrip('/')}{webhook_path}"
|
| 436 |
|
| 437 |
+
logger.info(f"Setting webhook to: {full_webhook_url}")
|
| 438 |
+
await asyncio.sleep(2.0)
|
| 439 |
try:
|
| 440 |
+
await ptb_app.bot.set_webhook(
|
| 441 |
url=full_webhook_url,
|
| 442 |
allowed_updates=Update.ALL_TYPES,
|
| 443 |
+
drop_pending_updates=True
|
| 444 |
)
|
| 445 |
+
webhook_info = await ptb_app.bot.get_webhook_info()
|
| 446 |
+
logger.info(f"Webhook set: {webhook_info}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
except Exception as e:
|
| 448 |
+
logger.error(f"Failed to set webhook: {e}")
|
|
|
|
|
|
|
| 449 |
|
| 450 |
+
logger.info("ASGI Lifespan: Startup complete.")
|
| 451 |
+
yield
|
| 452 |
|
| 453 |
except Exception as startup_err:
|
| 454 |
+
logger.critical(f"Startup error: {startup_err}", exc_info=True)
|
|
|
|
|
|
|
| 455 |
raise
|
| 456 |
finally:
|
|
|
|
| 457 |
logger.info("ASGI Lifespan: Shutdown sequence initiated...")
|
| 458 |
if ptb_app:
|
| 459 |
+
try:
|
| 460 |
+
await ptb_app.stop()
|
| 461 |
+
await ptb_app.shutdown()
|
| 462 |
+
logger.info("PTB Application shut down gracefully.")
|
| 463 |
+
except Exception as shutdown_err:
|
| 464 |
+
logger.error(f"Shutdown error: {shutdown_err}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
logger.info("ASGI Lifespan: Shutdown complete.")
|
| 466 |
|
| 467 |
+
# --- Flask App Setup ---
|
|
|
|
| 468 |
flask_core_app = Flask(__name__)
|
|
|
|
| 469 |
|
|
|
|
| 470 |
@flask_core_app.route('/')
def index():
    """Health-check endpoint: return a one-line status string.

    Reports "Running (@<username>)" when the module-level ``ptb_app`` and its
    bot are set, and "Unknown" otherwise.
    """
    bot_available = ptb_app and ptb_app.bot
    status_label = f"Running (@{ptb_app.bot.username})" if bot_available else "Unknown"
    return f"Telegram Bot Summarizer - Status: {status_label}"
|
|
|
|
| 477 |
|
| 478 |
@flask_core_app.route('/webhook', methods=['POST'])
async def webhook() -> Response:
    """Webhook endpoint called by Telegram.

    Parses the JSON request body into an ``Update`` and passes it to
    ``ptb_app.process_update``.

    Returns:
        503 if ``ptb_app`` is not set, 400 if the body is missing, empty or
        not valid JSON, 200 once the update has been processed, and 500 if
        processing raises.
    """
    # NOTE(review): this is an async Flask view; confirm which event loop it
    # runs on relative to the loop ptb_app was initialized on.
    if not ptb_app:
        return Response('Bot not initialized', status=503)

    try:
        # silent=True yields None for a wrong content type or malformed JSON,
        # so bad input gets the 400 below instead of falling into the 500 path.
        update_data = request.get_json(silent=True)
        if not update_data:
            return Response('Bad Request', status=400)

        update = Update.de_json(update_data, ptb_app.bot)
        await ptb_app.process_update(update)
        return Response('ok', status=200)

    except Exception as e:
        # Include the traceback, matching the other error logs in this file.
        logger.error(f"Webhook error: {e}", exc_info=True)
        return Response('Internal Server Error', status=500)
|
|
|
|
| 496 |
|
| 497 |
# --- Create Starlette ASGI Application ---
|
| 498 |
app = Starlette(
|
|
|
|
| 502 |
Mount("/", app=WSGIMiddleware(flask_core_app))
|
| 503 |
]
|
| 504 |
)
|
| 505 |
+
logger.info("Starlette ASGI application created.")
|
|
|
|
| 506 |
|
| 507 |
# --- Development Server Execution Block ---
|
| 508 |
if __name__ == '__main__':
    # Direct execution starts only the Flask development server.
    logger.warning("Running in development mode (Flask server only)")
    # NOTE(review): debug=True while listening on all interfaces — confirm
    # this entry point is never used outside local development.
    dev_server_options = {
        'host': '0.0.0.0',
        'port': int(os.environ.get('PORT', 8080)),  # PORT env var, default 8080
        'debug': True,
        'use_reloader': False,
    }
    flask_core_app.run(**dev_server_options)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|