Update main.py

main.py CHANGED

Original file (left-hand pane of the diff; removed lines marked "-"):
@@ -1,4 +1,4 @@
1 | - # main.py (Corrected SyntaxError at line 424 - Now with Gemini 2.0 as primary)
2 |   import os
3 |   import re
4 |   import logging
@@ -7,6 +7,7 @@ import json
7 |   import html
8 |   import contextlib
9 |   import traceback
10 |   from typing import Optional, Dict, Any, Tuple
11 |
12 |   # --- Frameworks ---
@@ -81,11 +82,12 @@ def get_secret(secret_name):
81 |
82 |   TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
83 |   OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY') # Now Fallback
84 | - URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')
85 | - SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
86 | - APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
87 |   WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
88 | - GEMINI_API_KEY = get_secret('GEMINI_API_KEY')
89 |
90 |   # Models (User can still configure via env vars)
91 |   OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Model
@@ -95,6 +97,7 @@ GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001") # Primary
95 |   if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
96 |   if not GEMINI_API_KEY: logger.error("❌ ERROR: GEMINI_API_KEY not found. Primary summarization (Gemini) will fail.")
97 |   if not OPENROUTER_API_KEY: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback summarization will fail.")
98 |
99 |   _gemini_primary_enabled = _gemini_available and bool(GEMINI_API_KEY)
100 |   if not _gemini_available: logger.warning("⚠️ WARNING: google-generativeai library missing. Gemini disabled.")
@@ -104,9 +107,9 @@ _openrouter_fallback_enabled = bool(OPENROUTER_API_KEY)
104 |   if not _openrouter_fallback_enabled: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback disabled.")
105 |
106 |
107 | - if not URLTOTEXT_API_KEY:
108 | - if not SUPADATA_API_KEY:
109 | - if not APIFY_API_TOKEN:
110 |   if not WEBHOOK_SECRET: logger.info("Optional secret 'WEBHOOK_SECRET' not found. Webhook security disabled.")
111 |
112 |   logger.info("Secret loading and configuration check finished.")
@@ -114,6 +117,8 @@ logger.info(f"Using Gemini Model (Primary): {GEMINI_MODEL if _gemini_primary_ena
114 |   logger.info(f"Using OpenRouter Model (Fallback): {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'}")
115 |   logger.info(f"Using Apify Actor (via REST): {APIFY_ACTOR_ID}")
116 |   _apify_token_exists = bool(APIFY_API_TOKEN)
117 |
118 |
119 |   if _gemini_primary_enabled:
@@ -143,26 +148,8 @@ def extract_youtube_id(url):
143 |
144 |
145 |   # --- Content Fetching Functions ---
146 | - # (fetch_url_content_for_scrape, get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript, get_website_content, get_website_content_via_api remain the same as previous version)
147 | - async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
148 | -     headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
149 | -     try:
150 | -         async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
151 | -             logger.debug(f"[Web Scrape] Sending request to {url}")
152 | -             response = await client.get(url)
153 | -             logger.debug(f"[Web Scrape] Received response {response.status_code} from {url}")
154 | -             response.raise_for_status()
155 | -             content_type = response.headers.get('content-type', '').lower()
156 | -             if 'html' not in content_type: logger.warning(f"[Web Scrape] Non-HTML content type from {url}: {content_type}"); return None
157 | -             try: return response.text
158 | -             except Exception as e: logger.error(f"[Web Scrape] Error decoding response for {url}: {e}"); return None
159 | -     except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape] HTTP error {e.response.status_code} fetching {url}: {e}")
160 | -     except httpx.TimeoutException: logger.error(f"[Web Scrape] Timeout error fetching {url}")
161 | -     except httpx.TooManyRedirects: logger.error(f"[Web Scrape] Too many redirects fetching {url}")
162 | -     except httpx.RequestError as e: logger.error(f"[Web Scrape] Request error fetching {url}: {e}")
163 | -     except Exception as e: logger.error(f"[Web Scrape] Unexpected error fetching {url}: {e}", exc_info=True)
164 | -     return None
165 |
166 |   async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
167 |       if not video_id: logger.error("[Supadata] No video_id provided"); return None
168 |       if not api_key: logger.error("[Supadata] API key missing."); return None
@@ -250,6 +237,7 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
250 |       if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
251 |       logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
252 |       transcript_text = None
253 |       logger.info("[Primary YT] Attempting youtube-transcript-api...")
254 |       try:
255 |           transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
@@ -261,6 +249,7 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
261 |           if "No transcript found" in str(e): logger.warning(f"[Primary YT] No transcript found.")
262 |           elif "disabled" in str(e): logger.warning(f"[Primary YT] Transcripts disabled.")
263 |           transcript_text = None
264 |       if transcript_text is None:
265 |           logger.info("[Fallback YT 1] Trying Supadata API...")
266 |           if SUPADATA_API_KEY:
@@ -268,6 +257,7 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
268 |               if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
269 |               else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
270 |           else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
271 |       if transcript_text is None:
272 |           logger.info("[Fallback YT 2] Trying Apify REST API (SyncItems)...")
273 |           if APIFY_API_TOKEN:
@@ -275,58 +265,196 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
275 |               if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify SyncItems REST for {video_url}"); return transcript_text
276 |               else: logger.warning(f"[Fallback YT 2] Apify SyncItems REST failed or no content for {video_url}.")
277 |           else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
278 |       if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
279 |       return transcript_text
280 |
281 |   async def get_website_content(url: str) -> Optional[str]:
282 |       if not url: logger.error("get_website_content: No URL"); return None
283 | -     logger.info(f"[Primary
284 |       html_content = await fetch_url_content_for_scrape(url)
285 |       if not html_content: return None
286 |       try:
287 |           def parse_html(content):
288 |               soup = BeautifulSoup(content, DEFAULT_PARSER)
289 | -
290 |               main_content = soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(class_='content') or soup.find(id='main-content') or soup.find(class_='main-content') or soup.find(role='main')
291 |               target_element = main_content if main_content else soup.body
292 | -             if not target_element: logger.warning(f"[Primary
293 |               lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
294 |               text = " ".join(lines)
295 | -             if not text: logger.warning(f"[Primary
296 |               return text
297 |           text_content = await asyncio.to_thread(parse_html, html_content)
298 | -         if text_content: logger.info(f"[Primary
299 |           else: return None
300 | -     except Exception as e: logger.error(f"[Primary
301 |
302 |   async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
303 | -
304 | -     if not
305 | -     logger.
306 |       api_endpoint = "https://urltotext.com/api/v1/urltotext/"
307 |       payload = { "url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False }
308 |       headers = { "Authorization": f"Token {api_key}", "Content-Type": "application/json" }
309 |       try:
310 |           async with httpx.AsyncClient(timeout=45.0) as client:
311 | -             logger.debug(f"[Fallback
312 |               response = await client.post(api_endpoint, headers=headers, json=payload)
313 | -             logger.debug(f"[Fallback
314 |               if response.status_code == 200:
315 |                   try:
316 |                       data = response.json()
317 |                       content = data.get("data", {}).get("content"); credits = data.get("credits_used", "N/A"); warning = data.get("data", {}).get("warning")
318 | -                     if warning: logger.warning(f"[Fallback
319 | -                     if content: logger.info(f"[Fallback
320 | -                     else: logger.warning(f"[Fallback
321 | -                 except json.JSONDecodeError: logger.error(f"[Fallback
322 | -                 except Exception as e: logger.error(f"[Fallback
323 | -             elif response.status_code in [400, 401, 402, 403, 422, 500]: logger.error(f"[Fallback
324 | -             else: logger.error(f"[Fallback
325 | -     except httpx.TimeoutException: logger.error(f"[Fallback
326 | -     except httpx.RequestError as e: logger.error(f"[Fallback
327 | -     except Exception as e: logger.error(f"[Fallback
328 | -
329 | - # ---
330 |
331 |   async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
332 |       """Internal function to call Gemini API. Returns (summary, error_message)."""
@@ -412,9 +540,6 @@ async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Opt
412 |
413 |       if summary:
414 |           logger.info(f"[Gemini Primary] Success generating summary. Output len: {len(summary)}");
415 | -         # Escape Markdown for Telegram only if necessary (Removed escaping as it might conflict with plain heading)
416 | -         # escaped_summary = summary.strip().replace('_', r'\_').replace('*', r'\*').replace('[', r'\[').replace('`', r'\`')
417 | -         # Let's see if the AI respects the instruction without further escaping. If not, we might need selective escaping.
418 |           return summary.strip(), None
419 |       else:
420 |           finish_reason = response.candidates[0].finish_reason if response.candidates else 'N/A'
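
Aside, not part of the commit: the removed comment lines above refer to escaping for Telegram's legacy Markdown, where _, *, [ and ` are markup characters, so a raw model summary can fail to render. A minimal sketch of the idea those comments describe, assuming the same four characters (escape_legacy_markdown is an illustrative name, not in main.py):

# Sketch only - reconstructs the escaping idea from the removed comments.
def escape_legacy_markdown(text: str) -> str:
    for ch in ("_", "*", "[", "`"):
        text = text.replace(ch, "\\" + ch)  # backslash-escape Telegram Markdown metacharacters
    return text
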
@@ -493,8 +618,6 @@ async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str],
493 |       summary = message.get("content")
494 |       if summary:
495 |           logger.info(f"[OpenRouter Fallback] Success via OpenRouter. Output len: {len(summary)}")
496 | -         # Escape Markdown for Telegram only if necessary (Removed escaping)
497 | -         # escaped_summary = summary.strip().replace('_', r'\_').replace('*', r'\*').replace('[', r'\[').replace('`', r'\`')
498 |           return summary.strip(), None
499 |       else:
500 |           logger.warning(f"[OpenRouter Fallback] OpenRouter success but content empty. Resp: {data}")
@@ -585,14 +708,23 @@ async def generate_summary(text: str, summary_type: str) -> str:
585 |       return "Sorry, an unknown error occurred during summary generation after trying all available models."
586 |
587 |
588 | - #
589 |   async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
590 |       task_id = f"{user_id}-{message_id_to_edit or 'new'}"; logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
591 |       background_request: Optional[BaseRequest] = None; bot: Optional[Bot] = None
592 |       try: background_request = HTTPXRequest( connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0 ); bot = Bot(token=bot_token, request=background_request)
593 |       except Exception as e: logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True); return
594 | -
595 |       try:
596 |           processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nThis might take a moment..."
597 |           if status_message_id:
598 |               try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
@@ -602,62 +734,118 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
602 |               status_message = await retry_bot_operation( bot.send_message, chat_id=chat_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN )
603 |               if status_message: message_to_delete_later_id = status_message.message_id; logger.debug(f"[Task {task_id}] Sent new status message {message_to_delete_later_id}")
604 |               else: raise RuntimeError("Failed to send status message after retries.")
605 | -         except Exception as e: logger.error(f"[Task {task_id}] CRITICAL: Failed to send new status message: {e}"); raise
606 |           try:
607 |               await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
608 |               is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}")
609 |               if is_youtube:
610 |                   video_id = extract_youtube_id(url)
611 | -                 if video_id: content = await get_youtube_transcript(video_id, url)
612 |                   else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
613 |                   if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
614 |               else:
615 |                   content = await get_website_content(url)
616 |                   if not content:
617 | -                     logger.warning(f"[Task {task_id}] Primary web scrape failed for {url}. Trying
618 | -
619 | -                     if URLTOTEXT_API_KEY:
620 |                           await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
621 |                           content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
622 | -                         if not content:
623 | -                     else:
624 |               if content:
625 |                   logger.info(f"[Task {task_id}] Content fetched (len:{len(content)}). Generating summary.")
626 |                   await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
627 | -                 final_summary = await generate_summary(content, summary_type) #
628 | -
629 |               else:
630 | -
631 | -
632 | -
633 |                   await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} )
634 | -                 for part in summary_parts[1:]:
635 | -
636 | -
637 |           except Exception as e:
638 | -
639 |               try: await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message )
640 |               except Exception: logger.error(f"[Task {task_id}] Failed to send unexpected error feedback.")
641 |       except Exception as outer_e:
642 |           logger.critical(f"[Task {task_id}] Critical outer error: {outer_e}", exc_info=True)
643 |           try:
644 | -             if bot: await retry_bot_operation( bot.send_message, chat_id=chat_id, text="❌ Critical internal error occurred." )
645 |           except Exception: logger.exception(f"[Task {task_id}] Failed even to send critical error message.")
646 |       finally:
647 |           delete_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id
648 |           if delete_target_id and bot:
649 |               try: await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=delete_target_id); logger.debug(f"[Task {task_id}] Deleted status/button message {delete_target_id}")
650 |               except Exception as del_e: logger.warning(f"[Task {task_id}] Failed to delete status/button message {delete_target_id}: {del_e}")
651 |           if background_request and hasattr(background_request, '_client') and background_request._client:
652 |               try: await background_request._client.aclose(); logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.")
653 |               except Exception as e: logger.warning(f"[Task {task_id}] Error closing background bot's client: {e}")
654 |       logger.info(f"[Task {task_id}] Task completed. Success: {success}")
655 |
656 |   async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
657 |       user = update.effective_user; mention = user.mention_html()
658 |       if not user or not update.message: return
659 |       logger.info(f"User {user.id} used /start.")
660 | -     await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" )
661 |
662 |   async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
663 |       user = update.effective_user
@@ -671,41 +859,53 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
671 |       if not update.message or not update.message.text: return
672 |       url = update.message.text.strip(); user = update.effective_user
673 |       if not user: return
674 | -
675 |       logger.info(f"User {user.id} sent potential URL: {url}")
676 |       context.user_data['url_to_summarize'] = url; context.user_data['original_message_id'] = update.message.message_id
677 |       keyboard = [[ InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"), InlineKeyboardButton("Points Summary", callback_data="points") ]]
678 |       reply_markup = InlineKeyboardMarkup(keyboard)
679 | -     await update.message.reply_text( f"Okay, I see this link:\n{url}
680 |
681 |   async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
682 |       query = update.callback_query
683 |       if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
684 |       user = query.from_user; summary_type = query.data; query_id = query.id
685 |       try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id}")
686 | -     except Exception as e: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True)
687 |       url = context.user_data.get('url_to_summarize'); message_id_to_edit = query.message.message_id
688 |       logger.info(f"User {user.id} chose '{summary_type}' for msg {message_id_to_edit}. URL in context: {'Yes' if url else 'No'}")
689 |       if not url:
690 | -         logger.warning(f"No URL in context for user {user.id} (cb {query_id}).")
691 | -         try:
692 |           except Exception as e:
693 |               logger.error(f"Failed edit 'URL not found' msg: {e}")
694 | -
695 | -         except Exception: pass
696 |           return
697 |
698 |       context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}")
699 |
700 | -     global TELEGRAM_TOKEN,
701 |       if not TELEGRAM_TOKEN:
702 | -         logger.critical("TG TOKEN missing!")
703 | -         try: await query.edit_message_text(text="❌ Bot
704 |           except Exception: pass
705 |           return
706 |       if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
707 | -         logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid!")
708 | -         try: await query.edit_message_text(text="❌ AI
709 |           except Exception: pass
710 |           return
711 |       elif not _gemini_primary_enabled:
@@ -715,22 +915,62 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
715 |       logger.warning("Fallback AI (OpenRouter) is unavailable.")
716 |       # No need to inform user unless primary fails later
717 |
718 | -     logger.info(f"Scheduling task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit}")
719 | -
720 |
721 |   async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
722 | -
723 | -
724 |       logger.error("Exception while handling an update:", exc_info=context.error)
725 |
726 |   async def setup_bot_config() -> Application:
727 |       logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
728 |       if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
729 |       custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 )
730 |       application = Application.builder().token(TELEGRAM_TOKEN).request(custom_request).build()
731 | -
732 | -     application.add_handler(
733 | -     application.
734 |
735 |   @contextlib.asynccontextmanager
736 |   async def lifespan(app: Starlette):
@@ -745,69 +985,149 @@ async def lifespan(app: Starlette):
745 |       try:
746 |           if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Webhook deleted.")
747 |           else: logger.warning("Failed delete webhook (API returned False).")
748 | -     except Exception as e: logger.warning(f"Could not delete webhook: {e}"); await asyncio.sleep(1)
749 | -
750 |       if space_host:
751 | -         protocol = "https"
752 |           if full_webhook_url:
753 | -             logger.info(f"Setting webhook: {full_webhook_url}")
754 | -
755 | -
756 |               try:
757 | -                 await ptb_app.bot.set_webhook(**set_webhook_args)
758 | -
759 | -
760 | -
761 | -
762 | -
763 | -
764 | -
765 |       except Exception as startup_err:
766 |           logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
767 |           if ptb_app:
768 |               if ptb_app.running: await ptb_app.stop()
769 |               await ptb_app.shutdown()
770 | -         raise
771 |       finally:
772 |           logger.info("ASGI Lifespan: Shutdown initiated...")
773 |           if ptb_app:
774 | -             if ptb_app.running: logger.info("Stopping PTB..."); await ptb_app.stop()
775 | -             logger.info("Shutting down PTB..."); await ptb_app.shutdown(); logger.info("PTB Application shut down.")
776 | -
777 |           logger.info("ASGI Lifespan: Shutdown complete.")
778 |
779 |   async def health_check(request: Request) -> PlainTextResponse:
780 |       global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled
781 |       bot_status = "Not Initialized"
782 |       if ptb_app and ptb_app.bot:
783 |           try:
784 | -             if ptb_app.running:
785 | -
786 | -
787 | -
788 | -
789 | -
790 | -
791 |
792 |   async def telegram_webhook(request: Request) -> Response:
793 |       global WEBHOOK_SECRET
794 | -     if not ptb_app: logger.error("Webhook
795 | -     if not ptb_app.running: logger.warning("Webhook
796 |       try:
797 | -
798 | -
799 | -
800 | -
801 | -
802 | -     except json.JSONDecodeError:
803 | -
804 | -
805 | -
806 |   logger.info("Starlette ASGI application created with native routes.")
807 |
808 |   if __name__ == '__main__':
809 |       import uvicorn
810 | -     logger.warning("Running in development mode using Uvicorn directly")
811 |       log_level = os.environ.get("LOGGING_LEVEL", "info").lower()
812 | -     local_port = int(os.environ.get('PORT', 8080))
813 | -     uvicorn.run(
Updated file (right-hand pane of the same diff; added lines marked "+"):

1 | + # main.py (Corrected SyntaxError at line 424 - Now with Gemini 2.0 as primary AND new scraping fallbacks)
2 |   import os
3 |   import re
4 |   import logging

7 |   import html
8 |   import contextlib
9 |   import traceback
10 | + import urllib.parse # Added for URL encoding
11 |   from typing import Optional, Dict, Any, Tuple
12 |
13 |   # --- Frameworks ---
82 |
83 |   TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
84 |   OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY') # Now Fallback
85 | + URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY') # Scrape Fallback 1
86 | + SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY') # YT Fallback 1
87 | + APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN') # YT Fallback 2
88 | + RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY') # Scrape Fallbacks 2 & 3 (NEW)
89 |   WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
90 | + GEMINI_API_KEY = get_secret('GEMINI_API_KEY') # Now Primary Summarizer
91 |
92 |   # Models (User can still configure via env vars)
93 |   OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Model

97 |   if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
98 |   if not GEMINI_API_KEY: logger.error("❌ ERROR: GEMINI_API_KEY not found. Primary summarization (Gemini) will fail.")
99 |   if not OPENROUTER_API_KEY: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback summarization will fail.")
100 | + if not RAPIDAPI_KEY: logger.warning("⚠️ WARNING: RAPIDAPI_KEY not found. RapidAPI scraping fallbacks will be unavailable.") # New check
101 |
102 |   _gemini_primary_enabled = _gemini_available and bool(GEMINI_API_KEY)
103 |   if not _gemini_available: logger.warning("⚠️ WARNING: google-generativeai library missing. Gemini disabled.")

107 |   if not _openrouter_fallback_enabled: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback disabled.")
108 |
109 |
110 | + if not URLTOTEXT_API_KEY: logger.warning("Optional secret 'URLTOTEXT_API_KEY' not found. First web scraping fallback unavailable.") # Adjusted warning
111 | + if not SUPADATA_API_KEY: logger.warning("Optional secret 'SUPADATA_API_KEY' not found. First YT transcript fallback unavailable.") # Adjusted warning
112 | + if not APIFY_API_TOKEN: logger.warning("Optional secret 'APIFY_API_TOKEN' not found. Second YT transcript fallback unavailable.") # Adjusted warning
113 |   if not WEBHOOK_SECRET: logger.info("Optional secret 'WEBHOOK_SECRET' not found. Webhook security disabled.")
114 |
115 |   logger.info("Secret loading and configuration check finished.")

117 |   logger.info(f"Using OpenRouter Model (Fallback): {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'}")
118 |   logger.info(f"Using Apify Actor (via REST): {APIFY_ACTOR_ID}")
119 |   _apify_token_exists = bool(APIFY_API_TOKEN)
120 | + _urltotext_key_exists = bool(URLTOTEXT_API_KEY)
121 | + _rapidapi_key_exists = bool(RAPIDAPI_KEY)
122 |
123 |
124 |   if _gemini_primary_enabled:
148 |
149 |
150 |   # --- Content Fetching Functions ---
151 |
152 | + # --- YouTube Transcript Fetching (Unchanged) ---
153 |   async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
154 |       if not video_id: logger.error("[Supadata] No video_id provided"); return None
155 |       if not api_key: logger.error("[Supadata] API key missing."); return None
237 |       if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
238 |       logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
239 |       transcript_text = None
240 | +     # Method 1: youtube-transcript-api (Primary)
241 |       logger.info("[Primary YT] Attempting youtube-transcript-api...")
242 |       try:
243 |           transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )

249 |           if "No transcript found" in str(e): logger.warning(f"[Primary YT] No transcript found.")
250 |           elif "disabled" in str(e): logger.warning(f"[Primary YT] Transcripts disabled.")
251 |           transcript_text = None
252 | +     # Method 2: Supadata (Fallback 1)
253 |       if transcript_text is None:
254 |           logger.info("[Fallback YT 1] Trying Supadata API...")
255 |           if SUPADATA_API_KEY:

257 |               if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
258 |               else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
259 |           else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
260 | +     # Method 3: Apify (Fallback 2)
261 |       if transcript_text is None:
262 |           logger.info("[Fallback YT 2] Trying Apify REST API (SyncItems)...")
263 |           if APIFY_API_TOKEN:

265 |               if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify SyncItems REST for {video_url}"); return transcript_text
266 |               else: logger.warning(f"[Fallback YT 2] Apify SyncItems REST failed or no content for {video_url}.")
267 |           else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
268 | +     # Final Result
269 |       if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
270 |       return transcript_text
271 |
|
272 |
+
# --- Website Content Fetching (MODIFIED SECTION) ---
|
273 |
+
|
274 |
+
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
|
275 |
+
"""Directly fetches URL content using httpx. (Primary Web Method - Fetching part)"""
|
276 |
+
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
|
277 |
+
try:
|
278 |
+
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
|
279 |
+
logger.debug(f"[Web Scrape Direct] Sending request to {url}")
|
280 |
+
response = await client.get(url)
|
281 |
+
logger.debug(f"[Web Scrape Direct] Received response {response.status_code} from {url}")
|
282 |
+
response.raise_for_status()
|
283 |
+
content_type = response.headers.get('content-type', '').lower()
|
284 |
+
if 'html' not in content_type: logger.warning(f"[Web Scrape Direct] Non-HTML content type from {url}: {content_type}"); return None
|
285 |
+
try: return response.text
|
286 |
+
except Exception as e: logger.error(f"[Web Scrape Direct] Error decoding response for {url}: {e}"); return None
|
287 |
+
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Direct] HTTP error {e.response.status_code} fetching {url}: {e}")
|
288 |
+
except httpx.TimeoutException: logger.error(f"[Web Scrape Direct] Timeout error fetching {url}")
|
289 |
+
except httpx.TooManyRedirects: logger.error(f"[Web Scrape Direct] Too many redirects fetching {url}")
|
290 |
+
except httpx.RequestError as e: logger.error(f"[Web Scrape Direct] Request error fetching {url}: {e}")
|
291 |
+
except Exception as e: logger.error(f"[Web Scrape Direct] Unexpected error fetching {url}: {e}", exc_info=True)
|
292 |
+
return None
|
293 |
+
|
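
A minimal way to exercise the direct fetcher above in isolation, assuming it is run from this module (_demo is an illustrative name, not in main.py):

# Sketch only - standalone test harness for fetch_url_content_for_scrape.
import asyncio

async def _demo():
    html_text = await fetch_url_content_for_scrape("https://example.com", timeout=10)
    print("ok" if html_text else "failed", len(html_text or ""))

asyncio.run(_demo())
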
294 |   async def get_website_content(url: str) -> Optional[str]:
295 | +     """Primary method: Fetches HTML directly and parses with BeautifulSoup."""
296 |       if not url: logger.error("get_website_content: No URL"); return None
297 | +     logger.info(f"[Web Scrape Primary] Fetching website content for: {url}")
298 |       html_content = await fetch_url_content_for_scrape(url)
299 |       if not html_content: return None
300 |       try:
301 |           def parse_html(content):
302 |               soup = BeautifulSoup(content, DEFAULT_PARSER)
303 | +             # Remove common non-content tags
304 | +             for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "iframe", "img", "svg", "link", "meta", "noscript", "figure"]):
305 | +                 element.extract()
306 | +             # Try to find main content areas
307 |               main_content = soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(class_='content') or soup.find(id='main-content') or soup.find(class_='main-content') or soup.find(role='main')
308 |               target_element = main_content if main_content else soup.body
309 | +             if not target_element: logger.warning(f"[Web Scrape Primary] Could not find body/main for parsing {url}"); return None
310 | +             # Get text, clean up whitespace, join lines
311 |               lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
312 |               text = " ".join(lines)
313 | +             if not text: logger.warning(f"[Web Scrape Primary] Extracted text empty after clean for {url}"); return None
314 |               return text
315 | +         # Run parsing in a separate thread to avoid blocking
316 |           text_content = await asyncio.to_thread(parse_html, html_content)
317 | +         if text_content: logger.info(f"[Web Scrape Primary] Success scrape for {url} (final len: {len(text_content)})"); return text_content
318 |           else: return None
319 | +     except Exception as e: logger.error(f"[Web Scrape Primary] Error scraping/parsing {url}: {e}", exc_info=True); return None
320 |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
322 |
+
"""Fallback 1: Fetches website content using urltotext.com API."""
|
323 |
+
if not url: logger.error("[Web Scrape Fallback 1] No URL"); return None
|
324 |
+
if not api_key: logger.error("[Web Scrape Fallback 1] urltotext.com API key missing."); return None
|
325 |
+
logger.info(f"[Web Scrape Fallback 1] Attempting fetch for: {url} using urltotext.com API")
|
326 |
api_endpoint = "https://urltotext.com/api/v1/urltotext/"
|
327 |
payload = { "url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False }
|
328 |
headers = { "Authorization": f"Token {api_key}", "Content-Type": "application/json" }
|
329 |
try:
|
330 |
async with httpx.AsyncClient(timeout=45.0) as client:
|
331 |
+
logger.debug(f"[Web Scrape Fallback 1] Sending request to urltotext.com API for {url}")
|
332 |
response = await client.post(api_endpoint, headers=headers, json=payload)
|
333 |
+
logger.debug(f"[Web Scrape Fallback 1] Received status {response.status_code} from urltotext.com API for {url}")
|
334 |
if response.status_code == 200:
|
335 |
try:
|
336 |
data = response.json()
|
337 |
content = data.get("data", {}).get("content"); credits = data.get("credits_used", "N/A"); warning = data.get("data", {}).get("warning")
|
338 |
+
if warning: logger.warning(f"[Web Scrape Fallback 1] urltotext.com API Warning for {url}: {warning}")
|
339 |
+
if content and isinstance(content, str): logger.info(f"[Web Scrape Fallback 1] Success via urltotext.com API for {url}. Len: {len(content)}. Credits: {credits}"); return content.strip()
|
340 |
+
else: logger.warning(f"[Web Scrape Fallback 1] urltotext.com API success but content empty for {url}. Resp: {data}"); return None
|
341 |
+
except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 1] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None
|
342 |
+
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None
|
343 |
+
elif response.status_code in [400, 401, 402, 403, 422, 500]: logger.error(f"[Web Scrape Fallback 1] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
344 |
+
else: logger.error(f"[Web Scrape Fallback 1] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
345 |
+
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 1] Timeout connecting to urltotext.com API for {url}"); return None
|
346 |
+
except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 1] Request error connecting to urltotext.com API for {url}: {e}"); return None
|
347 |
+
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
|
348 |
+
|
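
Each fetcher above returns None on any failure, which is what lets the caller chain them without try/except at the call site. If a single flaky source needed retries before falling through, a small wrapper in the same spirit could look like this (call_with_retries is an illustrative name, not in main.py):

# Sketch only - async retry with exponential backoff, under the same None-on-failure contract.
import asyncio
from typing import Awaitable, Callable, Optional, TypeVar

T = TypeVar("T")

async def call_with_retries(fn: "Callable[[], Awaitable[Optional[T]]]",
                            attempts: int = 3, base_delay: float = 1.0) -> "Optional[T]":
    for attempt in range(attempts):
        result = await fn()
        if result is not None:
            return result
        await asyncio.sleep(base_delay * (2 ** attempt))  # exponential backoff between tries
    return None
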
349 | + # --- NEW Fallback Functions ---
350 | +
351 | + async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
352 | +     """Fallback 2 (NEW): Fetches website content using Scraper's Proxy Parser via RapidAPI."""
353 | +     if not url: logger.error("[Web Scrape Fallback 2] No URL provided"); return None
354 | +     if not api_key: logger.error("[Web Scrape Fallback 2] RapidAPI key missing."); return None
355 | +     logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy Parser API")
356 | +
357 | +     api_host = "scrapers-proxy2.p.rapidapi.com"
358 | +     encoded_url = urllib.parse.quote(url, safe='') # URL Encode the target URL
359 | +     api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true"
360 | +     headers = {
361 | +         "x-rapidapi-host": api_host,
362 | +         "x-rapidapi-key": api_key
363 | +     }
364 | +     try:
365 | +         async with httpx.AsyncClient(timeout=40.0) as client: # Increased timeout slightly
366 | +             logger.debug(f"[Web Scrape Fallback 2] Sending GET request to {api_host} for {url}")
367 | +             response = await client.get(api_endpoint, headers=headers)
368 | +             logger.debug(f"[Web Scrape Fallback 2] Received status {response.status_code} from {api_host} for {url}")
369 | +
370 | +             if response.status_code == 200:
371 | +                 try:
372 | +                     data = response.json()
373 | +                     # Try to extract content, potentially combining title and content
374 | +                     content = data.get("content")
375 | +                     title = data.get("title")
376 | +                     extracted_text = ""
377 | +                     if title and isinstance(title, str): extracted_text += title.strip() + ". "
378 | +                     if content and isinstance(content, str): extracted_text += content.strip()
379 | +
380 | +                     if extracted_text:
381 | +                         logger.info(f"[Web Scrape Fallback 2] Success via Scraper's Proxy Parser API for {url}. Len: {len(extracted_text)}")
382 | +                         return extracted_text
383 | +                     else:
384 | +                         logger.warning(f"[Web Scrape Fallback 2] Scraper's Proxy API success but content/title empty/invalid for {url}. Response keys: {list(data.keys())}")
385 | +                         return None
386 | +                 except json.JSONDecodeError:
387 | +                     logger.error(f"[Web Scrape Fallback 2] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}")
388 | +                     return None
389 | +                 except Exception as e:
390 | +                     logger.error(f"[Web Scrape Fallback 2] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True)
391 | +                     return None
392 | +             # Handle RapidAPI specific errors if known, otherwise general errors
393 | +             elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 2] Auth error (401) with {api_host}. Check RapidAPI key."); return None
394 | +             elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 2] Forbidden (403) from {api_host}. Check API subscription/limits."); return None
395 | +             elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 2] Rate Limit (429) from {api_host}."); return None
396 | +             elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 2] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None # API itself failed
397 | +             else: logger.error(f"[Web Scrape Fallback 2] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
398 | +     except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 2] Timeout connecting to {api_host} API for {url}"); return None
399 | +     except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 2] Request error connecting to {api_host} API for {url}: {e}"); return None
400 | +     except Exception as e: logger.error(f"[Web Scrape Fallback 2] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
401 | +
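
The safe='' argument on new line 358 matters: urllib.parse.quote keeps '/' unescaped by default, so without it the target URL would not survive intact as a single query-string value. A quick illustration, not from the commit (expected output shown in comments):

# Sketch only - why quote(..., safe='') is used when embedding a URL in a query string.
import urllib.parse

target = "https://example.com/a b?x=1"
print(urllib.parse.quote(target))           # https%3A//example.com/a%20b%3Fx%3D1  ('/' left alone)
print(urllib.parse.quote(target, safe=''))  # https%3A%2F%2Fexample.com%2Fa%20b%3Fx%3D1
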
402 | + async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]:
403 | +     """Fallback 3 (NEW): Fetches website content using AI Web Scraper via RapidAPI."""
404 | +     if not url: logger.error("[Web Scrape Fallback 3] No URL provided"); return None
405 | +     if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
406 | +     logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using AI Web Scraper API")
407 | +
408 | +     api_host = "ai-web-scraper.p.rapidapi.com"
409 | +     api_endpoint = f"https://{api_host}/extract_content/v1"
410 | +     headers = {
411 | +         'Content-Type': 'application/x-www-form-urlencoded',
412 | +         'x-rapidapi-host': api_host,
413 | +         'x-rapidapi-key': api_key
414 | +     }
415 | +     # Data needs to be form-encoded, httpx handles this with `data=` param
416 | +     payload = {'url': url}
417 | +
418 | +     try:
419 | +         async with httpx.AsyncClient(timeout=45.0) as client: # Slightly longer timeout for potential AI processing
420 | +             logger.debug(f"[Web Scrape Fallback 3] Sending POST request to {api_host} for {url}")
421 | +             response = await client.post(api_endpoint, headers=headers, data=payload)
422 | +             logger.debug(f"[Web Scrape Fallback 3] Received status {response.status_code} from {api_host} for {url}")
423 | +
424 | +             if response.status_code == 200:
425 | +                 try:
426 | +                     data = response.json()
427 | +                     # Infer response structure - Try common keys for content
428 | +                     content = None
429 | +                     if isinstance(data, dict):
430 | +                         content = data.get("content") or data.get("text") or data.get("extracted_text") or data.get("result")
431 | +                     # If it's a simple string response directly
432 | +                     elif isinstance(data, str):
433 | +                         content = data
434 | +
435 | +                     if content and isinstance(content, str):
436 | +                         logger.info(f"[Web Scrape Fallback 3] Success via AI Web Scraper API for {url}. Len: {len(content)}")
437 | +                         return content.strip()
438 | +                     else:
439 | +                         logger.warning(f"[Web Scrape Fallback 3] AI Web Scraper API success but content empty/invalid format for {url}. Response type: {type(data)}, Keys: {list(data.keys()) if isinstance(data, dict) else 'N/A'}")
440 | +                         return None
441 | +                 except json.JSONDecodeError:
442 | +                     logger.error(f"[Web Scrape Fallback 3] Failed JSON decode AI Web Scraper API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}")
443 | +                     return None
444 | +                 except Exception as e:
445 | +                     logger.error(f"[Web Scrape Fallback 3] Error processing AI Web Scraper API success response for {url}: {e}", exc_info=True)
446 | +                     return None
447 | +             # Handle RapidAPI specific errors if known, otherwise general errors
448 | +             elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 3] Auth error (401) with {api_host}. Check RapidAPI key."); return None
449 | +             elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check API subscription/limits."); return None
450 | +             elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 3] Rate Limit (429) from {api_host}."); return None
451 | +             elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 3] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None # API itself failed
452 | +             else: logger.error(f"[Web Scrape Fallback 3] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
453 | +     except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 3] Timeout connecting to {api_host} API for {url}"); return None
454 | +     except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 3] Request error connecting to {api_host} API for {url}: {e}"); return None
455 | +     except Exception as e: logger.error(f"[Web Scrape Fallback 3] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
456 | +
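
The `data.get("content") or data.get("text") or ...` chain on new line 430 guards against an undocumented response shape by probing several plausible keys and accepting a bare string body. The same pattern as a standalone helper, for clarity (extract_text_field is an illustrative name, not in main.py):

# Sketch only - defensive extraction from a JSON payload of unknown shape.
from typing import Any, Optional

def extract_text_field(data: Any) -> Optional[str]:
    if isinstance(data, str):          # some APIs return a bare string body
        return data
    if isinstance(data, dict):
        for key in ("content", "text", "extracted_text", "result"):
            value = data.get(key)
            if isinstance(value, str) and value.strip():
                return value
    return None

print(extract_text_field({"result": "scraped text"}))  # scraped text
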
457 | + # --- Summarization Functions (Unchanged) ---
458 |
459 |   async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
460 |       """Internal function to call Gemini API. Returns (summary, error_message)."""

540 |
541 |       if summary:
542 |           logger.info(f"[Gemini Primary] Success generating summary. Output len: {len(summary)}");
543 |           return summary.strip(), None
544 |       else:
545 |           finish_reason = response.candidates[0].finish_reason if response.candidates else 'N/A'

618 |       summary = message.get("content")
619 |       if summary:
620 |           logger.info(f"[OpenRouter Fallback] Success via OpenRouter. Output len: {len(summary)}")
621 |           return summary.strip(), None
622 |       else:
623 |           logger.warning(f"[OpenRouter Fallback] OpenRouter success but content empty. Resp: {data}")

708 |       return "Sorry, an unknown error occurred during summary generation after trying all available models."
709 |
710 |
711 | + # --- Main Processing Logic (MODIFIED) ---
712 | +
713 |   async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
714 | +     """Handles the entire process: fetching content (with fallbacks) and summarizing."""
715 |       task_id = f"{user_id}-{message_id_to_edit or 'new'}"; logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
716 |       background_request: Optional[BaseRequest] = None; bot: Optional[Bot] = None
717 |       try: background_request = HTTPXRequest( connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0 ); bot = Bot(token=bot_token, request=background_request)
718 |       except Exception as e: logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True); return
719 | +
720 | +     content: Optional[str] = None
721 | +     user_feedback_message: Optional[str] = None
722 | +     success: bool = False
723 | +     status_message_id: Optional[int] = message_id_to_edit
724 | +     message_to_delete_later_id: Optional[int] = None
725 | +
726 |       try:
727 | +         # --- 1. Initial User Feedback ---
728 |           processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nThis might take a moment..."
729 |           if status_message_id:
730 |               try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")

734 |                   status_message = await retry_bot_operation( bot.send_message, chat_id=chat_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN )
735 |                   if status_message: message_to_delete_later_id = status_message.message_id; logger.debug(f"[Task {task_id}] Sent new status message {message_to_delete_later_id}")
736 |                   else: raise RuntimeError("Failed to send status message after retries.")
737 | +             except Exception as e: logger.error(f"[Task {task_id}] CRITICAL: Failed to send new status message: {e}"); raise # Don't proceed if we can't communicate
738 | +
739 |           try:
740 | +             # --- 2. Content Fetching ---
741 |               await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
742 |               is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}")
743 | +
744 |               if is_youtube:
745 | +                 # --- YouTube Transcript Logic (Unchanged) ---
746 |                   video_id = extract_youtube_id(url)
747 | +                 if video_id: content = await get_youtube_transcript(video_id, url) # Tries lib -> Supadata -> Apify
748 |                   else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
749 |                   if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
750 |               else:
751 | +                 # --- Website Scraping Logic (with New Fallbacks) ---
752 | +                 global URLTOTEXT_API_KEY, RAPIDAPI_KEY, _urltotext_key_exists, _rapidapi_key_exists
753 | +
754 | +                 # Method 1: Primary Scrape (Direct Fetch + BS4)
755 |                   content = await get_website_content(url)
756 | +
757 | +                 # Method 2: Fallback 1 (urltotext.com)
758 |                   if not content:
759 | +                     logger.warning(f"[Task {task_id}] Primary web scrape failed for {url}. Trying Fallback 1 (urltotext.com).")
760 | +                     if _urltotext_key_exists:
761 |                           await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
762 |                           content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
763 | +                         if not content: logger.warning(f"[Task {task_id}] Fallback 1 (urltotext.com) also failed.")
764 | +                     else: logger.warning(f"[Task {task_id}] Fallback 1 (urltotext.com) API key unavailable. Skipping.")
765 | +
766 | +                 # Method 3: Fallback 2 (Scraper's Proxy via RapidAPI - NEW)
767 | +                 if not content:
768 | +                     logger.warning(f"[Task {task_id}] Fallback 1 failed. Trying Fallback 2 (Scraper's Proxy).")
769 | +                     if _rapidapi_key_exists:
770 | +                         await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
771 | +                         content = await get_website_content_via_scrapers_proxy(url, RAPIDAPI_KEY)
772 | +                         if not content: logger.warning(f"[Task {task_id}] Fallback 2 (Scraper's Proxy) also failed.")
773 | +                     else: logger.warning(f"[Task {task_id}] Fallback 2 (Scraper's Proxy) RapidAPI key unavailable. Skipping.")
774 | +
775 | +                 # Method 4: Fallback 3 (AI Web Scraper via RapidAPI - NEW)
776 | +                 if not content:
777 | +                     logger.warning(f"[Task {task_id}] Fallbacks 1 and 2 failed. Trying Fallback 3 (AI Web Scraper).")
778 | +                     if _rapidapi_key_exists:
779 | +                         await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
780 | +                         content = await get_website_content_via_ai_web_scraper(url, RAPIDAPI_KEY)
781 | +                         if not content: logger.warning(f"[Task {task_id}] Fallback 3 (AI Web Scraper) also failed.")
782 | +                     else: logger.warning(f"[Task {task_id}] Fallback 3 (AI Web Scraper) RapidAPI key unavailable. Skipping.")
783 | +
784 | +                 # Final check for website content
785 | +                 if not content and not user_feedback_message:
786 | +                     user_feedback_message = "Sorry, I couldn't fetch content from that website using any available method (blocked/inaccessible/empty?)."
787 | +
788 | +             # --- 3. Summarization ---
789 |               if content:
790 |                   logger.info(f"[Task {task_id}] Content fetched (len:{len(content)}). Generating summary.")
791 |                   await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
792 | +                 final_summary = await generate_summary(content, summary_type) # Calls Gemini -> OpenRouter
793 | +
794 | +                 if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
795 | +                     user_feedback_message = final_summary # Use the error message from summarizer
796 | +                     logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
797 |                   else:
798 | +                     # Split long messages if needed
799 | +                     max_length = 4096
800 | +                     summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
801 | +                     # Send summary parts (using ParseMode=None as requested by AI prompt instructions)
802 |                       await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=None, link_preview_options={'is_disabled': True} )
803 | +                     for part in summary_parts[1:]:
804 | +                         await asyncio.sleep(0.5) # Small delay between parts
805 | +                         await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} )
806 | +                     success = True
807 | +                     logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
808 | +                     user_feedback_message = None # Clear any previous error message if summarization succeeded
809 | +
810 | +             # --- 4. Handle Final Failure Feedback ---
811 | +             if user_feedback_message: # If any step failed and set a message
812 | +                 logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}")
813 | +                 await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} )
814 | +
815 |           except Exception as e:
816 | +             # Catch unexpected errors during the inner try block (fetching/summarizing)
817 | +             logger.error(f"[Task {task_id}] Unexpected error during processing: {e}", exc_info=True)
818 | +             user_feedback_message = "Oops! Something went really wrong during processing. Please try again later."
819 |               try: await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message )
820 |               except Exception: logger.error(f"[Task {task_id}] Failed to send unexpected error feedback.")
821 | +
822 |       except Exception as outer_e:
823 | +         # Catch errors in the outer setup (bot creation, status message sending)
824 |           logger.critical(f"[Task {task_id}] Critical outer error: {outer_e}", exc_info=True)
825 |           try:
826 | +             if bot: await retry_bot_operation( bot.send_message, chat_id=chat_id, text="❌ Critical internal error occurred. Could not start processing." )
827 |           except Exception: logger.exception(f"[Task {task_id}] Failed even to send critical error message.")
828 |       finally:
829 | +         # --- 5. Cleanup ---
830 | +         # Delete the "Processing..." or original button message
831 |           delete_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id
832 |           if delete_target_id and bot:
833 |               try: await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=delete_target_id); logger.debug(f"[Task {task_id}] Deleted status/button message {delete_target_id}")
834 |               except Exception as del_e: logger.warning(f"[Task {task_id}] Failed to delete status/button message {delete_target_id}: {del_e}")
835 | +         # Close the background bot's HTTP client
836 |           if background_request and hasattr(background_request, '_client') and background_request._client:
837 |               try: await background_request._client.aclose(); logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.")
838 |               except Exception as e: logger.warning(f"[Task {task_id}] Error closing background bot's client: {e}")
839 |       logger.info(f"[Task {task_id}] Task completed. Success: {success}")
840 |
841 | +
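
The splitting rule at new lines 799-800 exists because Telegram caps a message at 4096 characters, so the summary is sliced into fixed-size windows and sent in order. The rule in isolation, as a sketch (split_for_telegram is an illustrative name, not in main.py):

# Sketch only - fixed-window chunking for Telegram's 4096-character message limit.
def split_for_telegram(text: str, max_length: int = 4096) -> list:
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]

parts = split_for_telegram("a" * 5000)
assert len(parts) == 2 and len(parts[0]) == 4096
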
842 |
+
# --- Telegram Handlers (Unchanged) ---
|
843 |
+
|
844 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
845 |
user = update.effective_user; mention = user.mention_html()
|
846 |
if not user or not update.message: return
|
847 |
logger.info(f"User {user.id} used /start.")
|
848 |
+
await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" )
|
849 |
|
850 |
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
851 |
user = update.effective_user
|
|
|
859 |
if not update.message or not update.message.text: return
|
860 |
url = update.message.text.strip(); user = update.effective_user
|
861 |
if not user: return
|
862 |
+
# Basic URL check
|
863 |
+
if not (url.startswith('http://') or url.startswith('https://')) or '.' not in url[8:]:
|
864 |
+
logger.debug(f"Ignoring non-URL message from {user.id}")
|
865 |
+
# Optionally, send a message if you want to guide the user
|
866 |
+
# await update.message.reply_text("Please send a valid URL starting with http:// or https://")
|
867 |
+
return
|
868 |
+
|
    logger.info(f"User {user.id} sent potential URL: {url}")
    context.user_data['url_to_summarize'] = url; context.user_data['original_message_id'] = update.message.message_id
    keyboard = [[ InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"), InlineKeyboardButton("Points Summary", callback_data="points") ]]
    reply_markup = InlineKeyboardMarkup(keyboard)
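    # Note: the URL is echoed back inside backticks with ParseMode.MARKDOWN; a
    # URL containing Markdown metacharacters (e.g. a stray backtick) can make
    # this send fail with BadRequest. Escaping the URL, or switching to HTML
    # parse mode, would be more robust.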
+    await update.message.reply_text( f"Okay, I see this link:\n`{url}`\n\nHow would you like it summarised?", reply_markup=reply_markup, disable_web_page_preview=True, parse_mode=ParseMode.MARKDOWN ) # Changed 'summarized', added Markdown

async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    query = update.callback_query
    if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
    user = query.from_user; summary_type = query.data; query_id = query.id
    try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id}")
+    except Exception as e: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True) # Log ack errors
+
    url = context.user_data.get('url_to_summarize'); message_id_to_edit = query.message.message_id
    logger.info(f"User {user.id} chose '{summary_type}' for msg {message_id_to_edit}. URL in context: {'Yes' if url else 'No'}")
+
    if not url:
+        logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Might be an old button.")
+        try:
+            await query.edit_message_text(text="Sorry, I couldn't find the URL associated with this request (it might be too old). Please send the link again.")
+        except BadRequest as e:
+            if "message is not modified" in str(e).lower(): pass # Ignore if already edited
+            else: logger.error(f"Failed edit 'URL not found' msg: {e}")
        except Exception as e:
            logger.error(f"Failed edit 'URL not found' msg: {e}")
+        # Don't send a new message here, as it might confuse the user if they didn't interact recently
        return

+    # Clear context *after* checking it's valid for this interaction
    context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}")

+    global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
    if not TELEGRAM_TOKEN:
+        logger.critical("TG TOKEN missing in callback!")
+        try: await query.edit_message_text(text="❌ Bot configuration error. Cannot proceed.")
        except Exception: pass
        return
    if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
+        logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid in callback!")
+        try: await query.edit_message_text(text="❌ AI configuration error: No summarization models available.")
        except Exception: pass
        return
    elif not _gemini_primary_enabled:
@@ unchanged lines 912-914 collapsed in the diff view @@
        logger.warning("Fallback AI (OpenRouter) is unavailable.")
        # No need to inform user unless primary fails later

+    logger.info(f"Scheduling background task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit}")
+    # Schedule the background task
+    asyncio.create_task(
+        process_summary_task(
+            user_id=user.id,
+            chat_id=query.message.chat_id,
+            message_id_to_edit=message_id_to_edit,
+            url=url,
+            summary_type=summary_type,
+            bot_token=TELEGRAM_TOKEN
+        ),
+        name=f"SummaryTask-{user.id}-{message_id_to_edit}"
+    )
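    # Caveat: the event loop holds only a weak reference to tasks created this
    # way, so a fire-and-forget task can in principle be garbage-collected
    # before it finishes. One common pattern (a sketch, not used here) is:
    # _background_tasks = set()
    # task = asyncio.create_task(...); _background_tasks.add(task)
    # task.add_done_callback(_background_tasks.discard)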

async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
+    """Log errors caused by updates."""
+    # Ignore specific, known errors that are often harmless or handled elsewhere
+    ignore_errors = (AttributeError, BadRequest, TimedOut, NetworkError, RetryAfter)
+    if isinstance(context.error, ignore_errors):
+        # More specific logging for potentially ignorable errors
+        if isinstance(context.error, BadRequest) and any(err in str(context.error).lower() for err in ["message is not modified", "query is too old", "message to edit not found"]):
+            logger.debug(f"Ignoring known BadRequest in error_handler: {context.error}")
+            return
+        elif isinstance(context.error, AttributeError) and "object has no attribute" in str(context.error):
+            logger.debug(f"Ignoring handled AttributeError in error_handler: {context.error}")
+            return
+        else:
+            # Log other potentially recoverable network/timeout errors as warnings
+            logger.warning(f"Handled networking/API error in error_handler: {context.error}")
+            return
+
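    # A more elaborate handler could special-case RetryAfter, which carries a
    # retry_after attribute, and back off for that long before retrying
    # (kept simple here on purpose).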
+    # Log all other exceptions as errors
    logger.error("Exception while handling an update:", exc_info=context.error)
+    # Optionally, notify the user about unexpected errors, but be careful not to spam
+    # if update and isinstance(update, Update) and update.effective_chat:
+    #     try: await context.bot.send_message(chat_id=update.effective_chat.id, text="An unexpected error occurred.")
+    #     except Exception: logger.error("Failed to send error message to user.")
+
+
+# --- Application Setup & Web Framework (Unchanged) ---

async def setup_bot_config() -> Application:
    logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
    if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
    custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 )
    application = Application.builder().token(TELEGRAM_TOKEN).request(custom_request).build()
+    # Command Handlers
+    application.add_handler(CommandHandler("start", start))
+    application.add_handler(CommandHandler("help", help_command))
+    # Message Handler (URLs) - parentheses matter here: & binds tighter than |,
+    # so the two entity filters must be grouped to combine with TEXT & ~COMMAND
+    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link")), handle_potential_url))
+    # Callback Query Handler (Buttons)
+    application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
+    # Error Handler
+    application.add_error_handler(error_handler)
+    logger.info("Telegram application handlers configured."); return application

@contextlib.asynccontextmanager
async def lifespan(app: Starlette):
@@ unchanged lines 977-984 collapsed in the diff view @@
        try:
            if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Webhook deleted.")
            else: logger.warning("Failed to delete webhook (API returned False).")
+        except Exception as e: logger.warning(f"Could not delete webhook: {e}"); await asyncio.sleep(1) # Short delay before setting new one
+
+        # Set up the webhook using the SPACE_HOST env var from Hugging Face
+        space_host = os.environ.get("SPACE_HOST")
+        webhook_path = "/webhook" # Must match the route below
+        full_webhook_url = None
        if space_host:
+            protocol = "https" # HF Spaces use HTTPS
+            host = space_host.split('://')[-1] # Get the host part
+            full_webhook_url = f"{protocol}://{host.rstrip('/')}{webhook_path}"
+
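            # e.g. SPACE_HOST="user-my-space.hf.space" yields
            # "https://user-my-space.hf.space/webhook" (hostname is illustrative)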
            if full_webhook_url:
+                logger.info(f"Setting webhook to: {full_webhook_url}")
+                set_webhook_args = {
+                    "url": full_webhook_url,
+                    "allowed_updates": Update.ALL_TYPES, # Or specify types like [Update.MESSAGE, Update.CALLBACK_QUERY]
+                    "drop_pending_updates": True
+                }
+                if WEBHOOK_SECRET:
+                    set_webhook_args["secret_token"] = WEBHOOK_SECRET
+                    logger.info("Webhook secret token is configured.")
+
+                await asyncio.sleep(1.0) # Give Telegram servers a moment after potential delete
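                # When secret_token is set, Telegram echoes it back in the
                # X-Telegram-Bot-Api-Secret-Token header of every webhook
                # request; telegram_webhook() below verifies exactly that.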
            try:
+                    await ptb_app.bot.set_webhook(**set_webhook_args)
+                    webhook_info = await ptb_app.bot.get_webhook_info() # Verify
+                    if webhook_info.url == full_webhook_url:
+                        logger.info(f"Webhook successfully set: URL='{webhook_info.url}', Secret={bool(WEBHOOK_SECRET)}")
+                    else:
+                        logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got '{webhook_info.url}'. Check SPACE_HOST and path.")
+                        raise RuntimeError("Webhook URL mismatch after setting.")
+
+                    await ptb_app.start() # Start the PTB application's update processing (webhook mode, no polling)
+                    logger.info("PTB Application started in webhook mode.")
+
+                except Exception as e:
+                    logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True)
+                    raise RuntimeError(f"Failed to set webhook: {e}") from e
+            else:
+                logger.critical("Could not construct webhook URL from SPACE_HOST.")
+                raise RuntimeError("Webhook URL could not be determined.")
+        else:
+            logger.critical("SPACE_HOST environment variable not found. Cannot set webhook automatically.")
+            raise RuntimeError("SPACE_HOST environment variable is missing.")
+
+        logger.info("ASGI Lifespan: Startup complete."); yield # Application runs here
+
    except Exception as startup_err:
        logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
        if ptb_app:
            if ptb_app.running: await ptb_app.stop()
            await ptb_app.shutdown()
+        raise # Propagate error to stop Starlette
    finally:
+        # --- Shutdown ---
        logger.info("ASGI Lifespan: Shutdown initiated...")
        if ptb_app:
+            if ptb_app.running: logger.info("Stopping PTB Application..."); await ptb_app.stop()
+            logger.info("Shutting down PTB Application..."); await ptb_app.shutdown(); logger.info("PTB Application shut down.")
+            # Attempt to clean up webhook on shutdown (optional, might fail if app is stopping forcefully)
+            try:
+                logger.info("Attempting to delete webhook on shutdown...")
+                await ptb_app.bot.delete_webhook(drop_pending_updates=True)
+                logger.info("Webhook deleted on shutdown.")
+            except Exception as e:
+                logger.warning(f"Could not delete webhook during shutdown: {e}")
+        else: logger.info("PTB application was not fully initialized or failed during startup.")
        logger.info("ASGI Lifespan: Shutdown complete.")

async def health_check(request: Request) -> PlainTextResponse:
+    """Simple health check endpoint."""
    global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled
    bot_status = "Not Initialized"
    if ptb_app and ptb_app.bot:
        try:
+            if ptb_app.running:
+                bot_info = await ptb_app.bot.get_me()
+                bot_status = f"Running (@{bot_info.username})"
+            else:
+                bot_status = "Initialized but not running"
+        except Exception as e:
+            bot_status = f"Error checking status: {e}"
+
+    return PlainTextResponse(
+        f"TG Bot Summariser - Status: {bot_status}\n" # Changed 'Summarizer'
+        f"Primary Model (Gemini): {GEMINI_MODEL if _gemini_primary_enabled else 'N/A (Disabled)'}\n"
+        f"Fallback Model (OpenRouter): {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'N/A (Disabled)'}\n"
+        f"YT Fallback (Apify Actor): {APIFY_ACTOR_ID if _apify_token_exists else 'N/A (No Token)'}\n"
+        f"Web Fallback 1 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled'}\n"
+        f"Web Fallbacks 2/3 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled'}"
+    )

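# A plain GET to the Space root returns the status text above, e.g.
# (hostname illustrative): curl https://user-my-space.hf.space/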
async def telegram_webhook(request: Request) -> Response:
+    """Handles incoming updates from Telegram."""
    global WEBHOOK_SECRET
+    if not ptb_app: logger.error("Webhook received but PTB application not initialized."); return PlainTextResponse('Bot not initialized', status_code=503)
+    if not ptb_app.running: logger.warning("Webhook received but PTB application not running."); return PlainTextResponse('Bot not running, cannot process update', status_code=503)
+
+    # --- Security Check ---
+    if WEBHOOK_SECRET:
+        token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
+        if token_header != WEBHOOK_SECRET:
+            logger.warning(f"Webhook received with invalid secret token. Header: '{token_header}'")
+            return Response(content="Invalid secret token", status_code=403) # Forbidden
+
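    # Hardening sketch (optional, requires `import hmac`):
    # hmac.compare_digest(token_header or "", WEBHOOK_SECRET) would make the
    # comparison constant-time; the plain != check above is the common pattern
    # for bot webhooks.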
+    # --- Process Update ---
    try:
+        update_data = await request.json()
+        update = Update.de_json(data=update_data, bot=ptb_app.bot)
+        logger.debug(f"Processing update_id: {update.update_id} via webhook")
+        await ptb_app.process_update(update)
+        return Response(status_code=200) # OK - Tell Telegram we received it
+    except json.JSONDecodeError:
+        logger.error("Webhook received invalid JSON.")
+        return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
+    except Exception as e:
+        logger.error(f"Error processing webhook update: {e}", exc_info=True)
+        # Still return 200 OK to Telegram, otherwise it will keep retrying the same failed update.
+        # The error is logged for debugging.
+        return Response(status_code=200)
+
+# --- Starlette App Definition ---
+app = Starlette(
+    debug=False, # Set to True for more verbose errors during development ONLY
+    lifespan=lifespan,
+    routes=[
+        Route("/", endpoint=health_check, methods=["GET"]),
+        Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), # Matches webhook_path
+    ]
+)
logger.info("Starlette ASGI application created with native routes.")

+# --- Development Server (if run directly) ---
if __name__ == '__main__':
    import uvicorn
+    logger.warning("Running in development mode using Uvicorn directly - NOT for production!")
+    # Use environment variables for config if available, otherwise defaults
    log_level = os.environ.get("LOGGING_LEVEL", "info").lower()
+    local_port = int(os.environ.get('PORT', 8080)) # Use PORT env var common in PaaS
+    uvicorn.run(
+        "main:app", # import string (this file is main.py) so the reloader can re-import the app
+        host='0.0.0.0', # Listen on all available network interfaces
+        port=local_port,
+        log_level=log_level,
+        reload=True # Enable auto-reload for development
+    )
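# Example local run (values illustrative; the block above reads PORT and
# LOGGING_LEVEL from the environment):
#   PORT=8000 LOGGING_LEVEL=debug python main.py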