Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -1,21 +1,22 @@
|
|
1 |
-
# main.py (Revised
|
2 |
import os
|
3 |
import re
|
4 |
import logging
|
5 |
import asyncio
|
6 |
import json
|
7 |
-
import html
|
8 |
-
import contextlib
|
9 |
-
import traceback
|
|
|
10 |
|
11 |
# --- Frameworks ---
|
12 |
-
from flask import Flask, request, Response
|
13 |
-
from starlette.applications import Starlette
|
14 |
-
from starlette.routing import Mount
|
15 |
-
from starlette.middleware.wsgi import WSGIMiddleware
|
16 |
|
17 |
# --- Telegram Bot ---
|
18 |
-
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot
|
19 |
from telegram.ext import (
|
20 |
Application,
|
21 |
CommandHandler,
|
@@ -25,7 +26,7 @@ from telegram.ext import (
|
|
25 |
CallbackQueryHandler,
|
26 |
)
|
27 |
from telegram.constants import ParseMode
|
28 |
-
from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest
|
29 |
from telegram.request import HTTPXRequest
|
30 |
|
31 |
# --- Other Libraries ---
|
@@ -33,6 +34,8 @@ import httpx
|
|
33 |
from youtube_transcript_api import YouTubeTranscriptApi
|
34 |
import requests
|
35 |
from bs4 import BeautifulSoup
|
|
|
|
|
36 |
_apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
|
37 |
if _apify_token_exists:
|
38 |
from apify_client import ApifyClient
|
@@ -55,8 +58,8 @@ logging.getLogger('starlette').setLevel(logging.INFO)
|
|
55 |
logger = logging.getLogger(__name__)
|
56 |
logger.info("Logging configured.")
|
57 |
|
58 |
-
# --- Global variable for PTB app
|
59 |
-
ptb_app: Application
|
60 |
|
61 |
# --- Environment Variable Loading ---
|
62 |
logger.info("Attempting to load secrets...")
|
@@ -73,14 +76,26 @@ SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
|
|
73 |
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
|
74 |
logger.info("Secret loading attempt finished.")
|
75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
-
# ---
|
78 |
-
# (Keep ALL your functions: is_youtube_url, extract_youtube_id,
|
79 |
-
# get_transcript_via_supadata, get_transcript_via_apify,
|
80 |
-
# get_youtube_transcript, get_website_content_via_requests,
|
81 |
-
# get_website_content_via_urltotext_api, generate_summary - unchanged)
|
82 |
-
|
83 |
-
# Helper Functions
|
84 |
def is_youtube_url(url):
|
85 |
"""Checks if the URL is a valid YouTube video or shorts URL."""
|
86 |
youtube_regex = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
|
@@ -100,608 +115,166 @@ def extract_youtube_id(url):
|
|
100 |
logger.warning(f"Could not extract YouTube ID from URL: {url}")
|
101 |
return None
|
102 |
|
103 |
-
#
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
if not api_key: logger.error("[Supadata] API key is missing."); return None
|
108 |
-
logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}")
|
109 |
-
api_endpoint = f"https://api.supadata.net/v1/youtube/transcript"
|
110 |
-
params = {"videoId": video_id, "format": "text"}
|
111 |
-
headers = {"X-API-Key": api_key}
|
112 |
-
try:
|
113 |
-
logger.warning("[Supadata] Making request with verify=False (Attempting to bypass SSL verification - Potential Security Risk)")
|
114 |
-
response = await asyncio.to_thread(requests.get, api_endpoint, headers=headers, params=params, timeout=30, verify=False)
|
115 |
-
logger.debug(f"[Supadata] Received status code {response.status_code} for {video_id}")
|
116 |
-
if response.status_code == 200:
|
117 |
-
try:
|
118 |
-
data = response.json()
|
119 |
-
content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
|
120 |
-
if content and isinstance(content, str):
|
121 |
-
logger.info(f"[Supadata] Successfully fetched transcript for {video_id}. Length: {len(content)}")
|
122 |
-
return content.strip()
|
123 |
-
else:
|
124 |
-
logger.warning(f"[Supadata] API success but content empty/invalid for {video_id}. Response: {data}")
|
125 |
-
return None
|
126 |
-
except json.JSONDecodeError:
|
127 |
-
if response.text:
|
128 |
-
logger.info(f"[Supadata] Successfully fetched transcript (plain text) for {video_id}. Length: {len(response.text)}")
|
129 |
-
return response.text.strip()
|
130 |
-
else:
|
131 |
-
logger.error(f"[Supadata] Failed to decode JSON response (and no text body) for {video_id}. Response: {response.text[:200]}...")
|
132 |
-
return None
|
133 |
-
except Exception as e:
|
134 |
-
logger.error(f"[Supadata] Error processing successful response for {video_id}: {e}", exc_info=True)
|
135 |
-
return None
|
136 |
-
elif response.status_code in [401, 403]:
|
137 |
-
logger.error(f"[Supadata] Authentication error ({response.status_code}). Check API key.")
|
138 |
-
return None
|
139 |
-
elif response.status_code == 404:
|
140 |
-
logger.warning(f"[Supadata] Transcript not found ({response.status_code}) for {video_id}.")
|
141 |
-
return None
|
142 |
-
else:
|
143 |
-
logger.error(f"[Supadata] Unexpected status code {response.status_code} for {video_id}. Response: {response.text[:200]}...")
|
144 |
-
return None
|
145 |
-
except requests.exceptions.Timeout:
|
146 |
-
logger.error(f"[Supadata] Timeout error connecting to API for {video_id}")
|
147 |
-
return None
|
148 |
-
except requests.exceptions.RequestException as e:
|
149 |
-
logger.error(f"[Supadata] Request error connecting to API for {video_id}: {e}")
|
150 |
-
if isinstance(e, requests.exceptions.SSLError):
|
151 |
-
logger.error(f"[Supadata] SSL Error occurred despite using verify=False. Details: {e}")
|
152 |
-
return None
|
153 |
-
except Exception as e:
|
154 |
-
logger.error(f"[Supadata] Unexpected error during API call for {video_id}: {e}", exc_info=True)
|
155 |
-
return None
|
156 |
-
|
157 |
-
# Apify Transcript Fetching (with fixed fallback parsing)
|
158 |
-
def _apify_captions_to_text(captions_data):
    """Convert the actor's fallback 'captions' field into plain text.

    Args:
        captions_data: Value of the item's 'captions' key; handled when it is
            a string or a list of dicts carrying a 'text' key.

    Returns:
        The caption text, or None when the value is an unsupported type or
        looks like a short error message.
    """
    if isinstance(captions_data, str):
        logger.info("[Apify] Processing 'captions' string format as fallback.")
        text = captions_data.strip()
        # A very short string that mentions "error" is treated as an error
        # message from the actor, not as a transcript.
        if len(text) < 50 and "error" in text.lower():
            logger.warning(f"[Apify] 'captions' string looks like an error: {text}")
            return None
        return text
    if isinstance(captions_data, list):
        logger.info("[Apify] Processing 'captions' list format as fallback.")
        return " ".join(
            cap["text"] for cap in captions_data if isinstance(cap, dict) and cap.get("text")
        ).strip()
    logger.warning(f"[Apify] 'captions' field found but is neither string nor list: {type(captions_data)}")
    return None


async def get_transcript_via_apify(video_url: str, api_token: str):
    """Fetches YouTube transcript via Apify API.

    Runs the 'karamelo~youtube-transcripts' actor synchronously through the
    'run-sync-get-dataset-items' endpoint and extracts the transcript text
    from the first dataset item.

    Args:
        video_url: URL of the YouTube video to transcribe.
        api_token: Apify API token.

    Returns:
        The transcript as a non-empty string, or None on any failure
        (missing arguments, HTTP error, timeout, unusable response). All
        failures are logged; no exception is propagated to the caller.
    """
    if not video_url:
        logger.error("[Apify] get_transcript_via_apify called with no video_url")
        return None
    if not api_token:
        logger.error("[Apify] API token is missing.")
        return None
    # NOTE(review): this function only issues a plain HTTP request; the check
    # below mirrors the file's availability flag. Confirm ApifyClient is always
    # bound (e.g. to None) when the conditional import is skipped.
    if not ApifyClient:
        logger.error("[Apify] ApifyClient not available/imported.")
        return None

    logger.info(f"[Apify] Attempting fetch for URL: {video_url}")
    actor_id = "karamelo~youtube-transcripts"
    api_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
    payload = json.dumps({
        "urls": [video_url],
        "outputFormat": "singleStringText",
        "maxRetries": 3,
        "channelHandleBoolean": False,
        "channelNameBoolean": False,
        "datePublishedBoolean": False,
        "relativeDateTextBoolean": False,
    })
    # The token travels in the Authorization header rather than as a '?token='
    # query parameter: the handlers below log the exception text, and a token
    # embedded in the request URL could end up in the logs that way.
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_token}",
    }
    try:
        logger.debug(f"[Apify] Sending request to run actor {actor_id} synchronously for {video_url}")
        response = await asyncio.to_thread(
            requests.post, api_endpoint, headers=headers, data=payload, timeout=90
        )
        status = response.status_code
        logger.debug(f"[Apify] Received status code {status} for {video_url}")

        if status == 400:
            logger.error(f"[Apify] Bad Request (400) for {video_url}. Resp: {response.text[:200]}...")
            return None
        if status == 401:
            logger.error("[Apify] Auth error (401). Check token.")
            return None
        if status not in (200, 201):
            logger.error(f"[Apify] Unexpected status {status} for {video_url}. Resp: {response.text[:200]}...")
            return None

        try:
            results = response.json()
            if not (isinstance(results, list) and results):
                logger.warning(f"[Apify] Actor success ({status}) but dataset result list empty for {video_url}. Response: {results}")
                return None

            item = results[0]
            if not isinstance(item, dict):
                logger.warning(f"[Apify] Actor success ({status}) but first dataset item is not an object for {video_url}: {type(item)}")
                return None

            primary = item.get("text") or item.get("transcript") or item.get("captions_concatenated")
            content = primary
            if not content and item.get("captions"):
                content = _apify_captions_to_text(item["captions"])

            if content:
                try:
                    content = html.unescape(content)
                except Exception as unescape_err:
                    logger.warning(f"[Apify] Error during html unescaping: {unescape_err}")

            if content and isinstance(content, str):
                logger.info(f"[Apify] Successfully fetched transcript via run-sync for {video_url} (Status: {status}). Length: {len(content)}")
                return content

            if primary:
                # Reached only when a primary field is present but not a string.
                logger.warning(f"[Apify] Actor success ({status}) but primary transcript field is not usable text for {video_url}.")
            elif not item.get("captions"):
                logger.warning(f"[Apify] Actor success ({status}) but no relevant fields found for {video_url}. Item: {item}")
            else:
                logger.warning(f"[Apify] Actor success ({status}), 'captions' found but fallback parsing failed for {video_url}.")
            return None
        except json.JSONDecodeError:
            logger.error(f"[Apify] Failed JSON decode for {video_url}. Status: {status}. Resp: {response.text[:200]}...")
            return None
        except Exception as e:
            logger.error(f"[Apify] Error processing successful response ({status}) for {video_url}: {e}", exc_info=True)
            return None

    except requests.exceptions.Timeout:
        logger.error(f"[Apify] Timeout error running actor for {video_url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Apify] Request error running actor for {video_url}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Apify] Unexpected error during Apify call for {video_url}: {e}", exc_info=True)
        return None
|
236 |
-
|
237 |
-
# Combined YouTube Transcript Function
|
238 |
-
async def get_youtube_transcript(video_id: str, video_url: str, supadata_key: str | None, apify_token: str | None):
    """Fetches YouTube transcript using library, then Supadata, then Apify.

    The sources are tried in order and the first non-empty transcript wins.
    A source that yields an empty result (None or an empty string) counts as
    a failure, so the next source is still attempted.

    Args:
        video_id: YouTube video ID, used by the library and by Supadata.
        video_url: Full video URL, used by the Apify fallback.
        supadata_key: Supadata API key; the Supadata fallback is skipped when falsy.
        apify_token: Apify API token; the Apify fallback is skipped when falsy.

    Returns:
        The transcript text, or None when every source failed or was skipped.
    """
    if not video_id:
        logger.error("get_youtube_transcript called with no video_id")
        return None
    logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")

    # --- Primary: youtube-transcript-api library ---
    try:
        logger.info("[Primary YT] Attempting youtube-transcript-api...")
        segments = await asyncio.to_thread(
            YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US']
        )
        if segments:
            joined = " ".join(seg['text'] for seg in segments if 'text' in seg)
            # Collapse all whitespace runs (including newlines) into single spaces.
            cleaned = re.sub(r'\s+', ' ', joined).strip()
            if cleaned:
                logger.info(f"[Primary YT] Success via library. Length: {len(cleaned)}")
                return cleaned
            logger.warning("[Primary YT] Joined text empty after cleaning.")
        else:
            logger.warning("[Primary YT] Transcript list empty.")
    except Exception as e:
        # Any library failure is non-fatal: classify it for the log, then fall through.
        reason = str(e)
        logger.warning(f"[Primary YT] Error via library: {type(e).__name__} - {e}")
        if "YouTube is blocking requests" in reason or "HTTP Error 429" in reason:
            logger.warning("[Primary YT] IP likely blocked.")
        elif "No transcript found" in reason:
            logger.warning("[Primary YT] No transcript in specified languages.")
        elif "TranscriptsDisabled" in reason or "disabled" in reason:
            logger.warning("[Primary YT] Transcripts disabled for this video.")

    # --- Fallback 1: Supadata ---
    logger.info("[Fallback YT 1] Trying Supadata API...")
    if supadata_key:
        transcript_text = await get_transcript_via_supadata(video_id, supadata_key)
        if transcript_text:
            logger.info(f"[Fallback YT 1] Success via Supadata. Length: {len(transcript_text)}")
            return transcript_text
        # An empty string is treated like None here, so it cannot short-circuit
        # the Apify fallback below.
        logger.warning("[Fallback YT 1] Supadata failed or no content found.")
    else:
        logger.warning("[Fallback YT 1] Supadata key not available, skipping.")

    # --- Fallback 2: Apify ---
    logger.info("[Fallback YT 2] Trying Apify API...")
    if apify_token:
        transcript_text = await get_transcript_via_apify(video_url, apify_token)
        if transcript_text:
            logger.info(f"[Fallback YT 2] Success via Apify. Length: {len(transcript_text)}")
            return transcript_text
        logger.warning("[Fallback YT 2] Apify failed or no content found.")
    else:
        logger.warning("[Fallback YT 2] Apify token not available, skipping.")

    logger.error(f"All methods failed to fetch transcript for video ID: {video_id}")
    return None
|
277 |
-
|
278 |
-
# Website Content via Requests/BS4
|
279 |
-
async def get_website_content_via_requests(url):
    """Scrape readable text from a web page with requests and BeautifulSoup.

    The page is fetched in a worker thread, non-content tags and common
    boilerplate selectors are removed, and the text of the main content
    container (falling back to <body>) is returned with one blank line
    between text lines.

    Args:
        url: Address of the page to fetch.

    Returns:
        The cleaned text, the stripped body for 'text/plain' responses, or
        None when the fetch fails, the content type is unsuitable, no target
        element is found, or fewer than 100 characters remain.
    """
    if not url:
        logger.error("[Web Scraper - Requests/BS4] get_website_content_via_requests called with no URL")
        return None
    logger.info(f"[Web Scraper - Requests/BS4] Attempting fetch: {url}")

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
    }
    # Tags that do not carry article text.
    unwanted_tags = [
        "script", "style", "header", "footer", "nav", "aside", "form", "button",
        "input", "textarea", "select", "option", "label", "iframe", "img", "svg",
        "link", "meta", "noscript", "figure", "figcaption", "video", "audio",
        "picture", "source",
    ]
    # Class/id selectors commonly used for ads, navigation and social widgets.
    unwanted_selectors = [
        '.ad', '#ad', '.ads', '#ads', '.advertisement', '#advertisement',
        '.banner', '#banner', '.menu', '#menu', '.navigation', '#navigation',
        '.sidebar', '#sidebar', '.social', '#social', '.share', '#share',
        '.related', '#related', '.comments', '#comments',
        '.cookie-consent', '#cookie-consent',
    ]
    # Candidate containers for the main content, tried in this order.
    container_lookups = (
        {"name": "main"},
        {"name": "article"},
        {"id": "content"},
        {"class_": "content"},
        {"id": "main-content"},
        {"class_": "main-content"},
        {"role": "main"},
    )
    min_text_length = 100

    try:
        response = await asyncio.to_thread(
            requests.get, url, headers=request_headers, timeout=25, allow_redirects=True
        )
        response.raise_for_status()
        logger.debug(f"[Web Scraper - Requests/BS4] Status {response.status_code} for {url}")

        content_type = response.headers.get('content-type', '').lower()
        if 'html' not in content_type:
            logger.warning(f"[Web Scraper - Requests/BS4] Non-HTML content type received: {content_type}. Attempting plain text extraction.")
            if 'text/plain' in content_type and response.text:
                logger.info(f"[Web Scraper - Requests/BS4] Extracted plain text content. Length: {len(response.text.strip())}")
                return response.text.strip()
            logger.warning(f"[Web Scraper - Requests/BS4] Content type '{content_type}' not suitable for parsing. Aborting.")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        for node in soup(unwanted_tags):
            node.decompose()
        for css in unwanted_selectors:
            for node in soup.select(css):
                node.decompose()

        target_element = None
        for criteria in container_lookups:
            target_element = soup.find(**criteria)
            if target_element:
                break
        if not target_element:
            target_element = soup.body
        if not target_element:
            logger.warning(f"[Web Scraper - Requests/BS4] Could not find a suitable target element (main, article, body) for {url}")
            return None

        raw_lines = target_element.get_text(separator='\n', strip=True).splitlines()
        stripped_lines = (raw.strip() for raw in raw_lines)
        text = "\n\n".join(piece for piece in stripped_lines if piece)

        if len(text) < min_text_length:
            logger.warning(f"[Web Scraper - Requests/BS4] Extracted text is too short (<{min_text_length} chars) after cleaning for {url}. Length: {len(text)}. Content might be JS-rendered or blocked.")
            return None

        logger.info(f"[Web Scraper - Requests/BS4] Successfully scraped and cleaned content from {url}. Final Length: {len(text)}")
        return text

    except requests.exceptions.Timeout:
        logger.error(f"[Web Scraper - Requests/BS4] Timeout error fetching {url}")
        return None
    except requests.exceptions.TooManyRedirects:
        logger.error(f"[Web Scraper - Requests/BS4] Too many redirects error for {url}")
        return None
    except requests.exceptions.HTTPError as e:
        logger.error(f"[Web Scraper - Requests/BS4] HTTP error {e.response.status_code} for {url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Web Scraper - Requests/BS4] General request error for {url}: {e}")
        return None
    except Exception as e:
        logger.error(f"[Web Scraper - Requests/BS4] Error during parsing or processing {url}: {e}", exc_info=True)
        return None
|
334 |
-
|
335 |
-
# Website Content via URLToText API
|
336 |
-
async def get_website_content_via_urltotext_api(url: str, api_key: str):
    """Retrieve a page's main text through the URLToText API.

    Posts the URL to the API in a worker thread, with main-content extraction
    and JavaScript rendering switched on in the request payload.

    Args:
        url: Address of the page to extract.
        api_key: URLToText API key, sent as an 'Authorization: Token ...' header.

    Returns:
        The stripped content string, or None on any failure (missing
        arguments, non-200 status, API-reported error, empty content, network
        error). Failures are logged rather than raised.
    """
    if not url:
        logger.error("[Web Scraper - URLToText API] get_website_content_via_urltotext_api called with no URL")
        return None
    if not api_key:
        logger.error("[Web Scraper - URLToText API] API key is missing.")
        return None

    logger.info(f"[Web Scraper - URLToText API] Attempting fetch via API: {url}")
    api_endpoint = "https://urltotext.com/api/v1/urltotext/"
    request_body = json.dumps({
        "url": url,
        "output_format": "text",
        "extract_main_content": True,
        "render_javascript": True,
        "residential_proxy": False,
        "timeout_render": 20000,
    })
    request_headers = {"Authorization": f"Token {api_key}", "Content-Type": "application/json"}

    try:
        response = await asyncio.to_thread(
            requests.post, api_endpoint, headers=request_headers, data=request_body, timeout=60
        )
        status = response.status_code
        logger.debug(f"[Web Scraper - URLToText API] Received status code {status} for {url}")

        if status != 200:
            # Every failure status logs the first 200 characters of the body.
            snippet = response.text[:200]
            if status == 400:
                logger.error(f"[Web Scraper - URLToText API] Bad Request (400) to API. Check payload/URL. Response: {snippet}...")
            elif status == 401:
                logger.error(f"[Web Scraper - URLToText API] Unauthorized (401). Check API Key. Response: {snippet}...")
            elif status == 402:
                logger.error(f"[Web Scraper - URLToText API] Payment Required (402). Check API credits/plan. Response: {snippet}...")
            elif status == 422:
                logger.warning(f"[Web Scraper - URLToText API] Unprocessable URL / Fetch Error (422) reported by API for {url}. Response: {snippet}...")
            elif status == 429:
                logger.warning(f"[Web Scraper - URLToText API] Rate Limit Hit (429). Response: {snippet}...")
            elif status >= 500:
                logger.error(f"[Web Scraper - URLToText API] API Server Error ({status}). Response: {snippet}...")
            else:
                logger.error(f"[Web Scraper - URLToText API] Unexpected status code {status} from API. Response: {snippet}...")
            return None

        try:
            data = response.json()
            result = data.get("data", {})
            page_text = result.get("content")
            api_warning = result.get("warning")
            api_error = result.get("error")
            credits_used = data.get("credits_used", "N/A")

            if api_warning:
                logger.warning(f"[Web Scraper - URLToText API] API Warning for {url}: {api_warning}")
            if api_error:
                logger.error(f"[Web Scraper - URLToText API] API Error reported for {url}: {api_error}")
                return None

            if not (page_text and isinstance(page_text, str)):
                logger.warning(f"[Web Scraper - URLToText API] API returned status 200 but content is empty or invalid for {url}. Response: {data}")
                return None

            logger.info(f"[Web Scraper - URLToText API] Successfully fetched content via API. Length: {len(page_text.strip())}. Credits Used: {credits_used}")
            return page_text.strip()
        except json.JSONDecodeError:
            logger.error(f"[Web Scraper - URLToText API] Failed to decode JSON response from API. Status: {status}. Response Text: {response.text[:500]}...")
            return None
        except Exception as e:
            logger.error(f"[Web Scraper - URLToText API] Error processing successful API response: {e}", exc_info=True)
            return None

    except requests.exceptions.Timeout:
        logger.error(f"[Web Scraper - URLToText API] Timeout connecting to API for {url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"[Web Scraper - URLToText API] Request error connecting to API: {e}")
        return None
    except Exception as e:
        logger.error(f"[Web Scraper - URLToText API] Unexpected error during API call: {e}", exc_info=True)
        return None
|
385 |
-
|
386 |
-
# DeepSeek Summary Function (with updated prompts)
|
387 |
-
async def generate_summary(text: str, summary_type: str, api_key: str) -> str:
    """Generates summary using DeepSeek via OpenRouter API.

    Args:
        text: Content to summarise. Inputs longer than the estimated limit
            (28000 tokens at 4 characters per token) are truncated and a
            truncation marker is appended.
        summary_type: Either "paragraph" or "points".
        api_key: OpenRouter API key, sent as a Bearer token.

    Returns:
        The summary text on success. On any failure a user-facing message
        starting with "Error:" or "Sorry," is returned instead; this function
        does not raise for missing input, HTTP errors or network errors.
    """
    # The length is guarded because this line runs before the empty-input
    # check below; len(None) would raise TypeError instead of letting that
    # check return its friendly error string.
    logger.info(f"Generating '{summary_type}' summary. Input length: {len(text) if text else 0}")
    if not api_key:
        logger.error("OpenRouter API key missing.")
        return "Error: AI service configuration key is missing."
    if not text or not text.strip():
        logger.warning("generate_summary called with empty or whitespace-only text.")
        return "Error: No content was provided to summarize."

    openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
    model_name = "deepseek/deepseek-chat:free"

    if summary_type == "paragraph":
        system_message = (
            "You are an expert summarization AI. Your goal is to provide a concise, easy-to-understand summary of the provided text. "
            "Follow these instructions precisely:\n"
            "1. **Language and Spelling:** Use simple British English. Ensure all spellings conform to British English (e.g., 'summarise', 'centre', 'realise').\n"
            "2. **Clarity:** Write clearly so someone unfamiliar with the topic can understand.\n"
            "3. **Format:** Output a single paragraph only.\n"
            "4. **Conciseness:** The summary must be **no more than 85 words** long.\n"
            "5. **Completeness:** Cover the main points from the entire text, not just the start.\n"
            "6. **Punctuation:** Do NOT use em dashes (– or —). Use semicolons (;) if needed for complex sentence structure, but prefer simpler sentences.\n"
            "7. **Tone:** Maintain a neutral and informative tone.\n"
            "8. **Focus:** Extract factual information and key topics. Do not add opinions or information not present in the text."
        )
        user_prompt_instruction = "Summarize the following text into a single paragraph adhering strictly to the rules outlined in the system message:"
    elif summary_type == "points":
        system_message = (
            "You are an expert summarization AI. Your goal is to extract the key points from the provided text and present them as a bulleted list. "
            "Follow these instructions precisely:\n"
            "1. **Language and Spelling:** Use simple British English. Ensure all spellings conform to British English (e.g., 'summarise', 'centre', 'realise').\n"
            "2. **Clarity:** Write clearly so someone unfamiliar with the topic can understand.\n"
            "3. **Format:** Output as a bulleted list. Start each point with a standard bullet character ('*' or '-'). Each point should be distinct and on a new line.\n"
            "4. **Content:** Each bullet point should represent a single key finding, main topic, or significant piece of information from the text.\n"
            "5. **Conciseness:** Keep each bullet point brief and to the point.\n"
            "6. **Completeness:** Cover the main points from the entire text, not just the start.\n"
            "7. **Punctuation:** Do NOT use em dashes (– or —) within bullet points.\n"
            "8. **Tone:** Maintain a neutral and informative tone.\n"
            "9. **Focus:** Extract factual information and key topics. Do not add opinions or information not present in the text."
        )
        user_prompt_instruction = "Summarize the following text into a bulleted list adhering strictly to the rules outlined in the system message:"
    else:
        logger.error(f"Invalid summary_type '{summary_type}' requested.")
        return f"Error: Invalid summary type ('{summary_type}') requested. Please choose 'paragraph' or 'points'."

    # Rough character budget derived from an estimated token limit.
    MAX_INPUT_TOKENS_ESTIMATE = 28000
    AVG_CHARS_PER_TOKEN = 4
    MAX_INPUT_LENGTH = MAX_INPUT_TOKENS_ESTIMATE * AVG_CHARS_PER_TOKEN

    if len(text) > MAX_INPUT_LENGTH:
        logger.warning(f"Input text length ({len(text)} chars) exceeds estimated limit ({MAX_INPUT_LENGTH}). Truncating.")
        truncation_marker = "\n\n[... Text truncated due to length ...]"
        # Reserve room for the marker so the result is exactly MAX_INPUT_LENGTH long.
        text = text[:MAX_INPUT_LENGTH - len(truncation_marker)] + truncation_marker

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": f"{user_prompt_instruction}\n\n--- TEXT TO SUMMARIZE ---\n\n{text}\n\n--- END OF TEXT ---"},
    ]

    # Build the HTTP-Referer from SPACE_HOST, adding a scheme when it has none.
    space_host = os.environ.get("SPACE_HOST", "huggingface.co/spaces/YOUR_SPACE_NAME")
    if space_host and not space_host.startswith("http"):
        referer_url = f"https://{space_host}"
    else:
        referer_url = space_host or "https://huggingface.co"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": referer_url,
        "X-Title": "Telegram URL Summarizer Bot",
    }
    payload = json.dumps({"model": model_name, "messages": messages})

    try:
        logger.debug(f"Sending request to OpenRouter (Model: {model_name}). Prompt length approx: {len(text)} chars.")
        response = await asyncio.to_thread(
            requests.post, openrouter_api_endpoint, headers=headers, data=payload, timeout=120
        )
        status = response.status_code
        logger.debug(f"Received status {status} from OpenRouter.")

        if status == 200:
            try:
                data = response.json()
                choice = data.get("choices", [{}])[0]
                message = choice.get("message", {})
                summary = message.get("content")
                finish_reason = choice.get("finish_reason")

                if summary and isinstance(summary, str) and summary.strip():
                    summary = summary.strip()
                    logger.info(f"Successfully generated summary. Finish Reason: {finish_reason}. Length: {len(summary)}")
                    word_count = len(summary.split())
                    # The prompt asks for 85 words; only log when clearly over that.
                    if summary_type == "paragraph" and word_count > 95:
                        logger.warning(f"Generated paragraph summary slightly longer than target word count ({word_count} words).")
                    return summary

                logger.warning(f"OpenRouter returned status 200 but summary content is missing or empty. Response data: {data}")
                return "Sorry, the AI model returned an empty summary. The content might have been unsuitable."
            except (json.JSONDecodeError, IndexError, KeyError, AttributeError) as e:
                logger.error(f"Failed to parse successful (200) response from OpenRouter. Error: {e}. Response Text: {response.text[:500]}...", exc_info=True)
                return "Sorry, there was an issue parsing the response from the AI service."
            except Exception as e:
                logger.error(f"Unexpected error processing OpenRouter success response: {e}", exc_info=True)
                return "Sorry, an unexpected error occurred while processing the AI response."

        if status == 401:
            logger.error("OpenRouter API key is invalid (Unauthorized - 401).")
            return "Error: AI service authentication failed. Please check the configuration."
        if status == 402:
            logger.error("OpenRouter Payment Required (402). Check credits/limits.")
            return "Sorry, there's an issue with the AI service account limits or payment."
        if status == 429:
            logger.warning("OpenRouter Rate Limit Hit (429).")
            return "Sorry, the AI model is currently busy due to high demand. Please try again in a moment."
        if status == 400:
            logger.error(f"OpenRouter Bad Request (400). Likely prompt issue. Response: {response.text[:500]}...")
            return "Sorry, the request to the AI service was invalid (possibly due to the content or prompt)."
        if status >= 500:
            logger.error(f"OpenRouter Server Error ({status}). Response: {response.text[:500]}...")
            return "Sorry, the AI service is experiencing internal issues. Please try again later."

        logger.error(f"Unexpected HTTP status {status} from OpenRouter. Response: {response.text[:500]}...")
        try:
            error_data = response.json()
        except json.JSONDecodeError:
            return f"Sorry, the AI service returned an unexpected error (Status: {status})."
        # The error payload is not assumed to be {"error": {"message": ...}}:
        # a string error or a non-object body still yields a status-specific
        # message instead of falling through to the generic handler below.
        error_info = error_data.get("error") if isinstance(error_data, dict) else None
        if isinstance(error_info, dict):
            error_msg = error_info.get("message", response.text[:100])
        elif isinstance(error_info, str) and error_info:
            error_msg = error_info
        else:
            error_msg = response.text[:100]
        return f"Sorry, the AI service returned an error ({status}): {error_msg}"

    except requests.exceptions.Timeout:
        logger.error("Timeout connecting to OpenRouter API.")
        return "Sorry, the request to the AI model timed out. Please try again."
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error connecting to OpenRouter API: {e}")
        return "Sorry, there was a network error connecting to the AI model service."
    except Exception as e:
        logger.error(f"Unexpected error occurred within generate_summary function: {e}", exc_info=True)
        return "Sorry, an unexpected internal error occurred while generating the summary."
|
501 |
-
|
502 |
-
|
503 |
-
# --- Background Task Processing ---
|
504 |
|
|
|
505 |
async def process_summary_task(
|
506 |
user_id: int,
|
507 |
chat_id: int,
|
508 |
message_id_to_edit: int,
|
509 |
url: str,
|
510 |
summary_type: str,
|
511 |
-
|
512 |
) -> None:
|
513 |
"""Handles the actual fetching and summarization in a background task."""
|
514 |
-
task_id =
|
515 |
-
logger.info(f"[Task {task_id}] Starting processing for
|
516 |
-
|
517 |
-
#
|
518 |
-
|
519 |
-
|
520 |
-
current_supadata_key = os.environ.get('SUPADATA_API_KEY')
|
521 |
-
current_apify_token = os.environ.get('APIFY_API_TOKEN')
|
522 |
-
# Keys check (already done in handler, but good for task log context)
|
523 |
-
keys_present = f"OR={'Y' if current_openrouter_key else 'N'}, UTT={'Y' if current_urltotext_key else 'N'}, SD={'Y' if current_supadata_key else 'N'}, AP={'Y' if current_apify_token else 'N'}"
|
524 |
-
logger.debug(f"[Task {task_id}] API Key check: {keys_present}")
|
525 |
-
|
526 |
-
if not current_openrouter_key:
|
527 |
-
logger.error(f"[Task {task_id}] CRITICAL: OpenRouter API key is missing. Cannot generate summary.")
|
528 |
-
try:
|
529 |
-
# Edit the original message to show the config error
|
530 |
-
await bot.edit_message_text(
|
531 |
-
chat_id=chat_id,
|
532 |
-
message_id=message_id_to_edit,
|
533 |
-
text="❌ Configuration Error: The AI summarization service is not configured correctly. Please contact the administrator."
|
534 |
-
)
|
535 |
-
except Exception as edit_err:
|
536 |
-
logger.error(f"[Task {task_id}] Failed to edit message for missing AI key: {edit_err}")
|
537 |
-
return # Stop task
|
538 |
-
|
539 |
-
# --- Inform User Processing Has Started ---
|
540 |
-
processing_message_text = f"⏳ Working on your '{summary_type}' summary for the link...\n_(This might take up to a minute depending on the content)_"
|
541 |
-
status_message_sent_id = None # Track if we sent a separate message
|
542 |
-
|
543 |
try:
|
544 |
-
|
545 |
-
|
546 |
-
|
547 |
-
|
548 |
-
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
content = None
|
568 |
-
user_feedback_message = None
|
569 |
-
success = False
|
570 |
|
571 |
-
|
572 |
-
# Send 'typing' action to indicate activity
|
573 |
-
try:
|
574 |
-
logger.debug(f"[Task {task_id}] Sending 'typing' chat action to chat {chat_id}")
|
575 |
-
await bot.send_chat_action(chat_id=chat_id, action='typing')
|
576 |
-
except Exception as ca_err:
|
577 |
-
logger.warning(f"[Task {task_id}] Failed sending 'typing' action: {ca_err}")
|
578 |
|
579 |
-
# ---
|
580 |
-
|
581 |
-
|
|
|
582 |
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
|
588 |
-
|
589 |
-
|
590 |
-
|
591 |
-
|
592 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
593 |
else:
|
594 |
-
logger.
|
595 |
-
|
596 |
-
|
597 |
-
|
598 |
-
|
599 |
-
|
|
|
|
|
|
|
600 |
if content:
|
601 |
-
logger.info(f"[Task {task_id}]
|
602 |
-
|
603 |
-
|
604 |
-
|
605 |
-
try: await bot.send_chat_action(chat_id=chat_id, action='typing'); logger.debug("[Task {task_id}] Sent typing before fallback scrape.")
|
606 |
-
except: pass
|
607 |
|
608 |
-
|
609 |
-
|
610 |
-
if content:
|
611 |
-
logger.info(f"[Task {task_id}] Website scrape successful via URLToText API. Length: {len(content)}")
|
612 |
-
else:
|
613 |
-
logger.warning(f"[Task {task_id}] Fallback website scrape (URLToText API) also failed for {url}.")
|
614 |
-
user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website using available methods. It might be protected or structured in a way I can't parse."
|
615 |
else:
|
616 |
-
|
617 |
-
|
618 |
-
|
619 |
-
|
620 |
-
|
621 |
-
|
622 |
-
|
623 |
-
|
624 |
-
|
625 |
-
|
626 |
-
|
627 |
-
if summary.startswith("Error:") or summary.startswith("Sorry,"):
|
628 |
-
logger.warning(f"[Task {task_id}] AI summary generation failed. Reason: {summary}")
|
629 |
-
user_feedback_message = f"⚠️ {summary}"
|
630 |
-
else:
|
631 |
-
# --- Summary Success - Send to User ---
|
632 |
-
logger.info(f"[Task {task_id}] Summary generated successfully. Length: {len(summary)}. Sending result.")
|
633 |
-
try:
|
634 |
-
await bot.send_message(
|
635 |
-
chat_id=chat_id,
|
636 |
-
text=summary,
|
637 |
-
parse_mode=ParseMode.MARKDOWN,
|
638 |
-
link_preview_options={'is_disabled': True}
|
639 |
-
)
|
640 |
success = True
|
641 |
-
user_feedback_message = None
|
642 |
-
logger.info(f"[Task {task_id}] Successfully sent summary to chat {chat_id}.")
|
643 |
-
except Exception as send_final_err:
|
644 |
-
logger.error(f"[Task {task_id}] Failed sending final summary to chat {chat_id}: {send_final_err}", exc_info=True)
|
645 |
-
user_feedback_message = "⚠️ Sorry, an unexpected error occurred while sending the final summary."
|
646 |
-
success = False
|
647 |
|
648 |
-
|
649 |
-
|
650 |
-
|
651 |
|
652 |
# --- Send Final Feedback Message if Processing Failed ---
|
653 |
if user_feedback_message and not success:
|
654 |
-
|
655 |
-
|
656 |
await bot.send_message(chat_id=chat_id, text=user_feedback_message)
|
657 |
-
|
658 |
-
|
659 |
|
660 |
except Exception as e:
|
661 |
-
|
662 |
-
logger.error(f"[Task {task_id}] Unexpected critical error during task processing for user {user_id}, URL {url}: {e}", exc_info=True)
|
663 |
try:
|
664 |
-
await bot.send_message(
|
665 |
-
|
666 |
-
|
667 |
-
|
668 |
-
|
|
|
669 |
finally:
|
670 |
# --- Clean up Status Message(s) ---
|
671 |
-
logger.debug(f"[Task {task_id}] Cleaning up status message(s). Success={success}")
|
672 |
try:
|
673 |
if status_message_sent_id:
|
674 |
-
# If we sent a separate "Working..." message, delete it.
|
675 |
await bot.delete_message(chat_id=chat_id, message_id=status_message_sent_id)
|
676 |
-
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
-
|
681 |
-
|
682 |
-
|
683 |
-
|
684 |
-
|
685 |
-
|
686 |
-
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
except Exception as del_e:
|
694 |
-
logger.warning(f"[Task {task_id}] Could not delete status/button message during cleanup: {del_e!r}")
|
695 |
|
696 |
-
logger.info(f"[Task {task_id}]
|
697 |
|
698 |
# --- Telegram Bot Handlers ---
|
699 |
-
|
700 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
701 |
"""Handles the /start command."""
|
702 |
user = update.effective_user
|
703 |
if not user: return
|
704 |
-
logger.info(f"User {user.id}
|
705 |
mention = user.mention_html() if user.username else user.first_name
|
706 |
start_message = (
|
707 |
f"👋 Hello {mention}!\n\n"
|
@@ -717,15 +290,14 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
|
|
717 |
logger.info(f"User {user.id if user else '?'} requested /help.")
|
718 |
help_text = (
|
719 |
"**How to Use Me:**\n"
|
720 |
-
"1.
|
721 |
-
"2.
|
722 |
-
"3.
|
723 |
-
"4.
|
724 |
"**Important Notes:**\n"
|
725 |
-
"- **YouTube:** Getting transcripts can sometimes fail if they are disabled
|
726 |
-
"- **Websites:**
|
727 |
-
"- **AI Summaries:** The AI tries its best to be accurate
|
728 |
-
"- **Length:** Very long articles or videos might be truncated before summarization to fit within processing limits.\n\n"
|
729 |
"Just send a link to get started!"
|
730 |
)
|
731 |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
|
@@ -742,9 +314,8 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
|
|
742 |
|
743 |
if match:
|
744 |
url = match.group(0)
|
745 |
-
logger.info(f"User {user.id} sent
|
746 |
context.user_data['url_to_summarize'] = url
|
747 |
-
logger.debug(f"Stored URL '{url}' in user_data for user {user.id}")
|
748 |
|
749 |
keyboard = [
|
750 |
[
|
@@ -759,123 +330,83 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
|
|
759 |
parse_mode=ParseMode.MARKDOWN,
|
760 |
link_preview_options={'is_disabled': True}
|
761 |
)
|
762 |
-
|
763 |
-
|
764 |
-
await update.message.reply_text("Please send me a valid URL (starting with http:// or https://) to summarize.")
|
765 |
-
|
766 |
|
767 |
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
768 |
-
"""Handles button presses
|
769 |
query = update.callback_query
|
770 |
if not query or not query.from_user or not query.message:
|
771 |
-
|
772 |
-
|
773 |
-
|
774 |
-
|
775 |
-
except Exception: pass
|
776 |
return
|
777 |
-
user = query.from_user
|
778 |
-
|
779 |
-
# We skip query.answer() here to avoid potential connection issues.
|
780 |
-
# The button might stay loading visually for the user.
|
781 |
|
|
|
|
|
|
|
782 |
summary_type = query.data
|
783 |
url = context.user_data.get('url_to_summarize')
|
784 |
-
query_id = query.id
|
785 |
|
786 |
-
logger.info(f"User {user.id} chose summary type '{summary_type}'
|
787 |
|
788 |
if not url:
|
789 |
-
logger.warning(f"
|
790 |
try:
|
791 |
await query.edit_message_text(text="⚠️ Oops! I lost the context for that link. Please send the link again.")
|
792 |
-
except Exception as
|
793 |
-
logger.error(f"Failed to edit message
|
794 |
-
# Still try to answer the query if editing failed
|
795 |
-
try: await query.answer("Error processing request.")
|
796 |
-
except Exception: pass
|
797 |
return
|
798 |
|
799 |
-
|
800 |
-
|
801 |
-
# Extract necessary IDs before clearing data
|
802 |
-
user_id = user.id
|
803 |
-
chat_id = query.message.chat_id
|
804 |
-
message_id_to_edit = query.message.message_id
|
805 |
-
bot_instance = context.bot # Get the bot instance from context
|
806 |
-
|
807 |
-
# Clear the URL from context *before* scheduling the task
|
808 |
context.user_data.pop('url_to_summarize', None)
|
809 |
-
logger.debug(f"Cleared URL from user_data for user {user_id} (Query {query_id})")
|
810 |
|
811 |
-
# Schedule
|
812 |
-
# Pass all required data explicitly
|
813 |
asyncio.create_task(
|
814 |
process_summary_task(
|
815 |
-
user_id=
|
816 |
-
chat_id=chat_id,
|
817 |
-
message_id_to_edit=
|
818 |
url=url,
|
819 |
summary_type=summary_type,
|
820 |
-
|
821 |
),
|
822 |
-
name=f"SummaryTask-{
|
823 |
)
|
824 |
|
825 |
-
# Log that the task was scheduled and the handler is returning.
|
826 |
-
logger.debug(f"Callback handler for Query {query_id} finished after scheduling task.")
|
827 |
-
# DO NOT await the task here. Let the handler return immediately.
|
828 |
-
|
829 |
-
|
830 |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
|
831 |
"""Log Errors caused by Updates or background tasks."""
|
832 |
-
# Check if the error is from an Exception raised in a handler
|
833 |
if context.error:
|
834 |
logger.error(f"Exception while handling an update: {context.error}", exc_info=context.error)
|
835 |
-
if isinstance(context.error, TimedOut):
|
836 |
-
logger.warning("A timeout error occurred in PTB communication.")
|
837 |
-
elif isinstance(context.error, NetworkError):
|
838 |
-
logger.warning(f"A network error occurred: {context.error}")
|
839 |
-
# Add more specific error handling if needed
|
840 |
-
else:
|
841 |
-
# Log errors from background tasks if PTB captures them this way (might need custom handling)
|
842 |
-
logger.error(f"Unknown error occurred. Update: {update} | Context: {context}")
|
843 |
-
|
844 |
|
845 |
# --- Bot Setup Function ---
|
846 |
async def setup_bot_config() -> Application:
|
847 |
-
"""Configures the PTB Application
|
848 |
logger.info("Configuring Telegram Application...")
|
849 |
if not TELEGRAM_TOKEN:
|
850 |
-
logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable not found.")
|
851 |
raise ValueError("TELEGRAM_TOKEN environment variable not set.")
|
852 |
|
853 |
-
connect_timeout = 10.0
|
854 |
-
read_timeout = 30.0
|
855 |
-
write_timeout = 30.0
|
856 |
-
pool_timeout = 30.0
|
857 |
-
|
858 |
-
logger.info(f"Creating PTB HTTPXRequest (v20 compatible) with settings: "
|
859 |
-
f"connect_timeout={connect_timeout}, read_timeout={read_timeout}, "
|
860 |
-
f"write_timeout={write_timeout}, pool_timeout={pool_timeout}. "
|
861 |
-
f"(Pool size uses httpx default)")
|
862 |
-
|
863 |
custom_request = HTTPXRequest(
|
864 |
-
connect_timeout=
|
865 |
-
read_timeout=
|
866 |
-
write_timeout=
|
867 |
-
pool_timeout=
|
868 |
http_version="1.1"
|
869 |
)
|
870 |
|
871 |
-
|
872 |
-
|
873 |
-
|
|
|
|
|
|
|
874 |
|
875 |
application.add_handler(CommandHandler("start", start))
|
876 |
application.add_handler(CommandHandler("help", help_command))
|
877 |
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
|
878 |
-
application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
|
879 |
application.add_error_handler(error_handler)
|
880 |
|
881 |
logger.info("Telegram application handlers configured.")
|
@@ -890,140 +421,78 @@ async def lifespan(app: Starlette):
|
|
890 |
|
891 |
try:
|
892 |
ptb_app = await setup_bot_config()
|
893 |
-
logger.info("PTB Application object configured. Initializing...")
|
894 |
await ptb_app.initialize()
|
895 |
-
|
896 |
-
await ptb_app.start() # Starts dispatcher, job queue, etc. but NOT polling
|
897 |
|
898 |
-
|
899 |
-
|
900 |
-
logger.info(f"PTB Application started successfully. Bot ID: {bot_info.id}, Username: @{bot_info.username}")
|
901 |
|
902 |
WEBHOOK_URL_BASE = os.environ.get("SPACE_HOST")
|
903 |
if WEBHOOK_URL_BASE:
|
904 |
-
if not WEBHOOK_URL_BASE.startswith("https://"):
|
|
|
905 |
webhook_path = "/webhook"
|
906 |
full_webhook_url = f"{WEBHOOK_URL_BASE.rstrip('/')}{webhook_path}"
|
907 |
|
908 |
-
logger.info(f"
|
909 |
-
await asyncio.sleep(2.0)
|
910 |
try:
|
911 |
-
await
|
912 |
url=full_webhook_url,
|
913 |
allowed_updates=Update.ALL_TYPES,
|
914 |
-
|
915 |
)
|
916 |
-
webhook_info = await
|
917 |
-
|
918 |
-
logger.info(f"Telegram webhook set successfully! Current info: {webhook_info}")
|
919 |
-
elif webhook_info:
|
920 |
-
logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', Got: {webhook_info.url}. Info: {webhook_info}")
|
921 |
-
else:
|
922 |
-
logger.error("Failed to get webhook info after setting webhook.")
|
923 |
-
except RetryAfter as e:
|
924 |
-
logger.warning(f"Webhook setting throttled by Telegram (RetryAfter: {e.retry_after}s). Another instance likely succeeded or try again later.")
|
925 |
-
await asyncio.sleep(e.retry_after or 2)
|
926 |
-
try:
|
927 |
-
webhook_info = await bot_instance.get_webhook_info()
|
928 |
-
logger.info(f"Webhook info after RetryAfter delay: {webhook_info}")
|
929 |
-
except Exception as get_info_err:
|
930 |
-
logger.error(f"Failed to get webhook info after RetryAfter delay: {get_info_err}")
|
931 |
except Exception as e:
|
932 |
-
|
933 |
-
else:
|
934 |
-
logger.warning("SPACE_HOST environment variable not found. Cannot set webhook automatically. Bot will not receive updates via webhook.")
|
935 |
|
936 |
-
logger.info("ASGI Lifespan: Startup complete.
|
937 |
-
yield
|
938 |
|
939 |
except Exception as startup_err:
|
940 |
-
logger.critical(f"
|
941 |
-
# Log traceback explicitly before raising might help in some environments
|
942 |
-
traceback.print_exc()
|
943 |
raise
|
944 |
finally:
|
945 |
-
# --- Shutdown Sequence ---
|
946 |
logger.info("ASGI Lifespan: Shutdown sequence initiated...")
|
947 |
if ptb_app:
|
948 |
-
|
949 |
-
|
950 |
-
|
951 |
-
|
952 |
-
|
953 |
-
|
954 |
-
await ptb_app.stop()
|
955 |
-
logger.info("Shutting down PTB Application connections and resources...")
|
956 |
-
await ptb_app.shutdown()
|
957 |
-
logger.info("PTB Application shut down gracefully.")
|
958 |
-
except Exception as shutdown_err:
|
959 |
-
logger.error(f"Error during PTB Application shutdown: {shutdown_err}", exc_info=True)
|
960 |
-
else:
|
961 |
-
logger.warning("PTB Application instance exists but was not marked as running at shutdown.")
|
962 |
-
try:
|
963 |
-
await ptb_app.shutdown()
|
964 |
-
logger.info("Attempted shutdown of non-running PTB app completed.")
|
965 |
-
except Exception as shutdown_err:
|
966 |
-
logger.error(f"Error during shutdown of non-running PTB app: {shutdown_err}", exc_info=True)
|
967 |
-
else:
|
968 |
-
logger.warning("No PTB Application instance (ptb_app) found during ASGI shutdown.")
|
969 |
logger.info("ASGI Lifespan: Shutdown complete.")
|
970 |
|
971 |
-
|
972 |
-
# --- Flask App Setup (for Webhook Route) ---
|
973 |
flask_core_app = Flask(__name__)
|
974 |
-
logger.info("Core Flask app instance created (used by Starlette for routing).")
|
975 |
|
976 |
-
# --- Define Flask Routes ---
|
977 |
@flask_core_app.route('/')
|
978 |
def index():
|
979 |
"""Basic health check endpoint."""
|
980 |
-
|
981 |
-
bot_status = "Unknown / Not Initialized"
|
982 |
if ptb_app and ptb_app.bot:
|
983 |
-
|
984 |
-
|
985 |
-
return f"Telegram Bot Summarizer - Status: {bot_status} - Listening via Starlette/Uvicorn."
|
986 |
|
987 |
@flask_core_app.route('/webhook', methods=['POST'])
|
988 |
async def webhook() -> Response:
|
989 |
"""Webhook endpoint called by Telegram."""
|
990 |
-
global ptb_app
|
991 |
-
|
992 |
if not ptb_app:
|
993 |
-
|
994 |
-
return Response('Bot service is not configured or failed during startup.', status=503)
|
995 |
|
996 |
-
is_running = getattr(ptb_app, '_running', False)
|
997 |
-
if not is_running:
|
998 |
-
logger.error("Webhook triggered, but PTB Application is not currently running.")
|
999 |
-
return Response('Bot service is initialized but not actively running.', status=503)
|
1000 |
-
|
1001 |
-
logger.debug("Webhook endpoint received POST request from Telegram.")
|
1002 |
try:
|
1003 |
update_data = request.get_json()
|
1004 |
if not update_data:
|
1005 |
-
|
1006 |
-
return Response('Bad Request: Expected JSON payload.', status=400)
|
1007 |
|
1008 |
update = Update.de_json(update_data, ptb_app.bot)
|
1009 |
-
logger.debug(f"Processing update_id: {update.update_id} via webhook route.")
|
1010 |
-
|
1011 |
-
# Let PTB's dispatcher handle the update asynchronously
|
1012 |
-
# This will now call the appropriate handler (e.g., handle_summary_type_callback)
|
1013 |
-
# which will *quickly* schedule the background task and return.
|
1014 |
await ptb_app.process_update(update)
|
1015 |
-
|
1016 |
-
logger.debug(f"Finished processing update_id: {update.update_id} in webhook handler (task scheduled).")
|
1017 |
-
# Return 200 OK immediately to Telegram
|
1018 |
return Response('ok', status=200)
|
1019 |
|
1020 |
-
except json.JSONDecodeError:
|
1021 |
-
logger.error("Failed to decode JSON from Telegram webhook request.", exc_info=True)
|
1022 |
-
return Response('Bad Request: Invalid JSON format.', status=400)
|
1023 |
except Exception as e:
|
1024 |
-
logger.error(f"
|
1025 |
-
return Response('Internal Server Error
|
1026 |
-
|
1027 |
|
1028 |
# --- Create Starlette ASGI Application ---
|
1029 |
app = Starlette(
|
@@ -1033,25 +502,10 @@ app = Starlette(
|
|
1033 |
Mount("/", app=WSGIMiddleware(flask_core_app))
|
1034 |
]
|
1035 |
)
|
1036 |
-
logger.info("Starlette ASGI application created
|
1037 |
-
|
1038 |
|
1039 |
# --- Development Server Execution Block ---
|
1040 |
if __name__ == '__main__':
|
1041 |
-
logger.warning("
|
1042 |
-
|
1043 |
-
|
1044 |
-
logger.warning("This mode starts the Flask development server.")
|
1045 |
-
logger.warning("!!! IT DOES **NOT** RUN THE ASGI LIFESPAN !!!")
|
1046 |
-
logger.warning("!!! The Telegram Bot (PTB Application) WILL NOT INITIALIZE OR RUN !!!")
|
1047 |
-
logger.warning("This is suitable ONLY for verifying Flask routes locally.")
|
1048 |
-
logger.warning("For proper testing/deployment, use: uvicorn main:app --reload --port 8080")
|
1049 |
-
logger.warning("or via Gunicorn: gunicorn -c gunicorn.conf.py main:app")
|
1050 |
-
logger.warning("=" * 50)
|
1051 |
-
|
1052 |
-
if not TELEGRAM_TOKEN:
|
1053 |
-
logger.critical("CRITICAL: TELEGRAM_TOKEN environment variable missing. Aborting direct Flask start.")
|
1054 |
-
else:
|
1055 |
-
local_port = int(os.environ.get('PORT', 8080))
|
1056 |
-
logger.info(f"Starting Flask development server on http://0.0.0.0:{local_port}")
|
1057 |
-
flask_core_app.run(host='0.0.0.0', port=local_port, debug=True, use_reloader=False)
|
|
|
1 |
+
# main.py (Revised with background task connection fixes)
|
2 |
import os
|
3 |
import re
|
4 |
import logging
|
5 |
import asyncio
|
6 |
import json
|
7 |
+
import html
|
8 |
+
import contextlib
|
9 |
+
import traceback
|
10 |
+
from typing import Optional
|
11 |
|
12 |
# --- Frameworks ---
|
13 |
+
from flask import Flask, request, Response
|
14 |
+
from starlette.applications import Starlette
|
15 |
+
from starlette.routing import Mount
|
16 |
+
from starlette.middleware.wsgi import WSGIMiddleware
|
17 |
|
18 |
# --- Telegram Bot ---
|
19 |
+
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot
|
20 |
from telegram.ext import (
|
21 |
Application,
|
22 |
CommandHandler,
|
|
|
26 |
CallbackQueryHandler,
|
27 |
)
|
28 |
from telegram.constants import ParseMode
|
29 |
+
from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest
|
30 |
from telegram.request import HTTPXRequest
|
31 |
|
32 |
# --- Other Libraries ---
|
|
|
34 |
from youtube_transcript_api import YouTubeTranscriptApi
|
35 |
import requests
|
36 |
from bs4 import BeautifulSoup
|
37 |
+
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
|
38 |
+
|
39 |
_apify_token_exists = bool(os.environ.get('APIFY_API_TOKEN'))
|
40 |
if _apify_token_exists:
|
41 |
from apify_client import ApifyClient
|
|
|
58 |
logger = logging.getLogger(__name__)
|
59 |
logger.info("Logging configured.")
|
60 |
|
61 |
+
# --- Global variable for PTB app ---
|
62 |
+
ptb_app: Optional[Application] = None
|
63 |
|
64 |
# --- Environment Variable Loading ---
|
65 |
logger.info("Attempting to load secrets...")
|
|
|
76 |
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
|
77 |
logger.info("Secret loading attempt finished.")
|
78 |
|
79 |
+
# --- Retry Decorator for Bot Operations ---
|
80 |
+
def retry_bot_operation(func):
    """Wrap an async Telegram call so that transient failures are retried.

    The returned coroutine function forwards all arguments to ``func`` and
    makes up to three attempts, sleeping with exponential backoff (1-10 s)
    between them.

    Only ``NetworkError`` and ``RuntimeError`` are retried, and ``BadRequest``
    is explicitly excluded: it derives from ``NetworkError`` but reports a
    request Telegram rejected, which would fail the same way on every attempt.

    Args:
        func: Async callable performing the bot operation.

    Returns:
        An async callable with the same call signature as ``func``.

    Raises:
        Whatever ``func`` raised on its last attempt. The original exception
        is re-raised (not ``tenacity.RetryError``) so callers can still match
        on Telegram error types.
    """
    # Function-scope imports keep this block self-contained; the module's
    # import section does not bring these two names into scope.
    import functools
    from tenacity import retry_if_exception

    max_attempts = 3

    def _is_transient(exc: BaseException) -> bool:
        # Retry connectivity-type problems only, never a rejected request.
        return (
            isinstance(exc, (NetworkError, RuntimeError))
            and not isinstance(exc, BadRequest)
        )

    def _log_retry(retry_state) -> None:
        logger.warning(
            f"Retrying bot operation due to {retry_state.outcome.exception()}. "
            f"Attempt {retry_state.attempt_number}/{max_attempts}"
        )

    @retry(
        stop=stop_after_attempt(max_attempts),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        retry=retry_if_exception(_is_transient),
        before_sleep=_log_retry,
        reraise=True,
    )
    async def _attempt(*args, **kwargs):
        return await func(*args, **kwargs)

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        # The error is logged here, outside the retry loop, so it is emitted
        # exactly once when the operation is finally abandoned rather than
        # once per attempt.
        try:
            return await _attempt(*args, **kwargs)
        except Exception as e:
            logger.error(f"Bot operation failed, giving up: {e}")
            raise

    return wrapper
|
97 |
|
98 |
+
# --- Helper Functions (unchanged from your original) ---
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
def is_youtube_url(url):
|
100 |
"""Checks if the URL is a valid YouTube video or shorts URL."""
|
101 |
youtube_regex = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|shorts/)|youtu\.be/)([\w-]{11})'
|
|
|
115 |
logger.warning(f"Could not extract YouTube ID from URL: {url}")
|
116 |
return None
|
117 |
|
118 |
+
# --- Content Fetching Functions (unchanged from your original) ---
|
119 |
+
# [Keep all your existing content fetching functions exactly as they were]
|
120 |
+
# get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript,
|
121 |
+
# get_website_content_via_requests, get_website_content_via_urltotext_api, generate_summary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
|
123 |
+
# --- Revised Background Task Processing ---
|
124 |
async def process_summary_task(
    user_id: int,
    chat_id: int,
    message_id_to_edit: int,
    url: str,
    summary_type: str,
    bot_token: str  # Token is passed so the task can own its own Bot instance
) -> None:
    """Fetch the content behind ``url``, summarise it and report to the chat.

    Runs as a background task with its own ``Bot`` built from ``bot_token``.
    Steps:

    1. Turn the message ``message_id_to_edit`` into a "working" notice, or
       send a separate notice if that message cannot be edited.
    2. Fetch a YouTube transcript or the website text for ``url``.
    3. Generate the summary and send it to ``chat_id``.
    4. On failure, send an explanatory message instead.
    5. Always clean up the status message and release the bot's resources.

    Args:
        user_id: Telegram user id; used only to build the log task id.
        chat_id: Chat that receives status updates and the result.
        message_id_to_edit: Message to reuse as the status message.
        url: Link to summarise.
        summary_type: Summary format, forwarded to ``generate_summary``.
        bot_token: Token used to create the task-local ``Bot``.

    Returns:
        None. Errors are logged and reported to the chat, never raised.
    """
    task_id = f"{user_id}-{message_id_to_edit}"
    logger.info(f"[Task {task_id}] Starting processing for URL: {url}")

    # Create a new bot instance for this task
    bot = Bot(token=bot_token)

    # Assigned before the try block because the finally clause and the closing
    # log line read them even when the very first Telegram call fails.
    status_message_sent_id = None
    content = None
    user_feedback_message = None
    success = False

    try:
        # Initialise the bot so that shutdown() in the finally clause actually
        # releases its HTTP resources.
        await retry_bot_operation(bot.initialize)()

        # --- Inform User Processing Has Started ---
        processing_message_text = f"⏳ Working on your '{summary_type}' summary for the link...\n_(This might take up to a minute depending on the content)_"

        @retry_bot_operation
        async def edit_or_send_status():
            nonlocal status_message_sent_id, message_id_to_edit
            try:
                await bot.edit_message_text(
                    chat_id=chat_id,
                    message_id=message_id_to_edit,
                    text=processing_message_text
                )
                logger.debug(f"[Task {task_id}] Successfully edited message {message_id_to_edit}")
            except (TimedOut, NetworkError, BadRequest) as e:
                logger.warning(f"[Task {task_id}] Could not edit original message: {e}. Sending new status message.")
                # The original message is no longer usable as a status line;
                # cleanup must target the replacement message instead.
                message_id_to_edit = None
                status_message = await bot.send_message(
                    chat_id=chat_id,
                    text=processing_message_text
                )
                status_message_sent_id = status_message.message_id
                logger.debug(f"[Task {task_id}] Sent new status message {status_message_sent_id}")

        await edit_or_send_status()

        # --- Main Content Fetching and Summarization ---
        try:
            # Send 'typing' action
            @retry_bot_operation
            async def send_typing():
                await bot.send_chat_action(chat_id=chat_id, action='typing')

            await send_typing()

            # --- Determine Content Type and Fetch ---
            is_yt = is_youtube_url(url)
            logger.debug(f"[Task {task_id}] URL is YouTube: {is_yt}")

            if is_yt:
                video_id = extract_youtube_id(url)
                if video_id:
                    logger.info(f"[Task {task_id}] Fetching YouTube transcript for {video_id}")
                    content = await get_youtube_transcript(
                        video_id,
                        url,
                        SUPADATA_API_KEY,
                        APIFY_API_TOKEN
                    )
                # Also covers the case where no video id could be extracted.
                if not content:
                    user_feedback_message = "⚠️ Sorry, I couldn't retrieve the transcript for that YouTube video."
            else:
                logger.info(f"[Task {task_id}] Attempting website scrape for: {url}")
                content = await get_website_content_via_requests(url)
                if not content and URLTOTEXT_API_KEY:
                    # Fallback scraper; refresh the typing indicator first.
                    await send_typing()
                    content = await get_website_content_via_urltotext_api(url, URLTOTEXT_API_KEY)
                if not content:
                    user_feedback_message = "⚠️ Sorry, I couldn't fetch the content from that website."

            # --- Generate Summary if Content Was Fetched ---
            if content:
                logger.info(f"[Task {task_id}] Generating '{summary_type}' summary")
                await send_typing()

                summary = await generate_summary(content, summary_type, OPENROUTER_API_KEY)

                # A summary starting with one of these prefixes is treated as
                # an error text and shown to the user as a warning.
                if summary.startswith(("Error:", "Sorry,")):
                    user_feedback_message = f"⚠️ {summary}"
                else:
                    @retry_bot_operation
                    async def send_summary():
                        await bot.send_message(
                            chat_id=chat_id,
                            text=summary,
                            parse_mode=ParseMode.MARKDOWN,
                            link_preview_options={'is_disabled': True}
                        )

                    await send_summary()
                    success = True

        except Exception as e:
            logger.error(f"[Task {task_id}] Error during processing: {e}", exc_info=True)
            user_feedback_message = "❌ An unexpected error occurred while processing your request."

        # --- Send Final Feedback Message if Processing Failed ---
        if user_feedback_message and not success:
            @retry_bot_operation
            async def send_feedback():
                await bot.send_message(chat_id=chat_id, text=user_feedback_message)

            await send_feedback()

    except Exception as e:
        logger.error(f"[Task {task_id}] Critical error in task: {e}", exc_info=True)
        try:
            await bot.send_message(
                chat_id=chat_id,
                text="❌ A critical error occurred. Please try again later."
            )
        except Exception as notify_err:
            # Best effort only: the connection may be the thing that failed.
            logger.warning(f"[Task {task_id}] Could not notify user about critical error: {notify_err}")
    finally:
        # --- Clean up Status Message(s) ---
        try:
            if status_message_sent_id:
                await bot.delete_message(chat_id=chat_id, message_id=status_message_sent_id)
            elif message_id_to_edit and success:
                await bot.delete_message(chat_id=chat_id, message_id=message_id_to_edit)
            elif message_id_to_edit and not success:
                final_error_text = user_feedback_message or "❌ An error occurred."
                await bot.edit_message_text(
                    chat_id=chat_id,
                    message_id=message_id_to_edit,
                    text=final_error_text[:4090]
                )
        except Exception as e:
            logger.warning(f"[Task {task_id}] Cleanup error: {e}")

        # Release the task-local bot's resources.
        try:
            await bot.shutdown()
        except Exception as e:
            logger.warning(f"[Task {task_id}] Error shutting down task bot: {e}")

    logger.info(f"[Task {task_id}] Task completed. Success: {success}")
|
271 |
|
272 |
# --- Telegram Bot Handlers ---
|
|
|
273 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
274 |
"""Handles the /start command."""
|
275 |
user = update.effective_user
|
276 |
if not user: return
|
277 |
+
logger.info(f"User {user.id} initiated /start.")
|
278 |
mention = user.mention_html() if user.username else user.first_name
|
279 |
start_message = (
|
280 |
f"👋 Hello {mention}!\n\n"
|
|
|
290 |
logger.info(f"User {user.id if user else '?'} requested /help.")
|
291 |
help_text = (
|
292 |
"**How to Use Me:**\n"
|
293 |
+
"1. Send me a direct link (URL) to a YouTube video or a web article.\n"
|
294 |
+
"2. I will ask you to choose the summary format: `Paragraph` or `Points`.\n"
|
295 |
+
"3. Click the button for your preferred format.\n"
|
296 |
+
"4. I'll fetch the content, summarise it using AI, and send it back to you!\n\n"
|
297 |
"**Important Notes:**\n"
|
298 |
+
"- **YouTube:** Getting transcripts can sometimes fail if they are disabled or unavailable.\n"
|
299 |
+
"- **Websites:** Complex websites might not work perfectly.\n"
|
300 |
+
"- **AI Summaries:** The AI tries its best to be accurate.\n\n"
|
|
|
301 |
"Just send a link to get started!"
|
302 |
)
|
303 |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
|
|
|
314 |
|
315 |
if match:
|
316 |
url = match.group(0)
|
317 |
+
logger.info(f"User {user.id} sent URL: {url}")
|
318 |
context.user_data['url_to_summarize'] = url
|
|
|
319 |
|
320 |
keyboard = [
|
321 |
[
|
|
|
330 |
parse_mode=ParseMode.MARKDOWN,
|
331 |
link_preview_options={'is_disabled': True}
|
332 |
)
|
333 |
+
elif not message_text.startswith('/'):
|
334 |
+
await update.message.reply_text("Please send me a valid URL (starting with http:// or https://) to summarize.")
|
|
|
|
|
335 |
|
336 |
# Strong references to in-flight summary tasks. The event loop itself only
# keeps weak references to tasks, so a task whose result is discarded may be
# garbage-collected before it finishes.
_background_summary_tasks = set()


async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handles button presses for summary type selection.

    Acknowledges the callback query, reads the URL previously stored under
    ``context.user_data['url_to_summarize']`` and schedules
    ``process_summary_task`` as a background asyncio task so this handler
    returns without waiting for the summary.

    Args:
        update: Incoming update; expected to carry a callback query.
        context: Handler context whose ``user_data`` holds the pending URL.
    """
    query = update.callback_query
    if not query or not query.from_user or not query.message:
        # Nothing usable to work with. Still try to acknowledge the press when
        # a query object exists so the client stops showing a spinner.
        if query:
            try:
                await query.answer()
            except Exception as e:
                logger.debug(f"Could not answer incomplete callback query: {e}")
        return

    # Acknowledge the button press immediately. This is best-effort: a failed
    # acknowledgement should not prevent the summary from being produced.
    try:
        await query.answer()
    except Exception as e:
        logger.warning(f"Failed to answer callback query {query.id}: {e}")

    user = query.from_user
    summary_type = query.data
    url = context.user_data.get('url_to_summarize')

    logger.info(f"User {user.id} chose summary type '{summary_type}'")

    if not url:
        logger.warning(f"No URL found for user {user.id}")
        try:
            await query.edit_message_text(text="⚠️ Oops! I lost the context for that link. Please send the link again.")
        except Exception as e:
            logger.error(f"Failed to edit message: {e}")
        return

    # Clear the URL from context so a repeated button press cannot reuse it.
    context.user_data.pop('url_to_summarize', None)

    # Schedule background task with token instead of bot instance.
    task = asyncio.create_task(
        process_summary_task(
            user_id=user.id,
            chat_id=query.message.chat_id,
            message_id_to_edit=query.message.message_id,
            url=url,
            summary_type=summary_type,
            bot_token=TELEGRAM_TOKEN
        ),
        name=f"SummaryTask-{user.id}-{query.message.message_id}"
    )
    # Hold a reference until the task completes, then drop it.
    _background_summary_tasks.add(task)
    task.add_done_callback(_background_summary_tasks.discard)
|
378 |
|
|
|
|
|
|
|
|
|
|
|
379 |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Log the exception attached to the handler context, if there is one.

    Args:
        update: The object that was being handled; not used here.
        context: Handler context whose ``error`` attribute is logged.
    """
    err = context.error
    if not err:
        return
    logger.error(f"Exception while handling an update: {err}", exc_info=err)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
383 |
|
384 |
# --- Bot Setup Function ---
|
385 |
async def setup_bot_config() -> Application:
|
386 |
+
"""Configures the PTB Application."""
|
387 |
logger.info("Configuring Telegram Application...")
|
388 |
if not TELEGRAM_TOKEN:
|
|
|
389 |
raise ValueError("TELEGRAM_TOKEN environment variable not set.")
|
390 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
391 |
custom_request = HTTPXRequest(
|
392 |
+
connect_timeout=10.0,
|
393 |
+
read_timeout=30.0,
|
394 |
+
write_timeout=30.0,
|
395 |
+
pool_timeout=30.0,
|
396 |
http_version="1.1"
|
397 |
)
|
398 |
|
399 |
+
application = (
|
400 |
+
Application.builder()
|
401 |
+
.token(TELEGRAM_TOKEN)
|
402 |
+
.request(custom_request)
|
403 |
+
.build()
|
404 |
+
)
|
405 |
|
406 |
application.add_handler(CommandHandler("start", start))
|
407 |
application.add_handler(CommandHandler("help", help_command))
|
408 |
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
|
409 |
+
application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
|
410 |
application.add_error_handler(error_handler)
|
411 |
|
412 |
logger.info("Telegram application handlers configured.")
|
|
|
421 |
|
422 |
try:
|
423 |
ptb_app = await setup_bot_config()
|
|
|
424 |
await ptb_app.initialize()
|
425 |
+
await ptb_app.start()
|
|
|
426 |
|
427 |
+
bot_info = await ptb_app.bot.get_me()
|
428 |
+
logger.info(f"Bot started: @{bot_info.username}")
|
|
|
429 |
|
430 |
WEBHOOK_URL_BASE = os.environ.get("SPACE_HOST")
|
431 |
if WEBHOOK_URL_BASE:
|
432 |
+
if not WEBHOOK_URL_BASE.startswith("https://"):
|
433 |
+
WEBHOOK_URL_BASE = f"https://{WEBHOOK_URL_BASE}"
|
434 |
webhook_path = "/webhook"
|
435 |
full_webhook_url = f"{WEBHOOK_URL_BASE.rstrip('/')}{webhook_path}"
|
436 |
|
437 |
+
logger.info(f"Setting webhook to: {full_webhook_url}")
|
438 |
+
await asyncio.sleep(2.0)
|
439 |
try:
|
440 |
+
await ptb_app.bot.set_webhook(
|
441 |
url=full_webhook_url,
|
442 |
allowed_updates=Update.ALL_TYPES,
|
443 |
+
drop_pending_updates=True
|
444 |
)
|
445 |
+
webhook_info = await ptb_app.bot.get_webhook_info()
|
446 |
+
logger.info(f"Webhook set: {webhook_info}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
447 |
except Exception as e:
|
448 |
+
logger.error(f"Failed to set webhook: {e}")
|
|
|
|
|
449 |
|
450 |
+
logger.info("ASGI Lifespan: Startup complete.")
|
451 |
+
yield
|
452 |
|
453 |
except Exception as startup_err:
|
454 |
+
logger.critical(f"Startup error: {startup_err}", exc_info=True)
|
|
|
|
|
455 |
raise
|
456 |
finally:
|
|
|
457 |
logger.info("ASGI Lifespan: Shutdown sequence initiated...")
|
458 |
if ptb_app:
|
459 |
+
try:
|
460 |
+
await ptb_app.stop()
|
461 |
+
await ptb_app.shutdown()
|
462 |
+
logger.info("PTB Application shut down gracefully.")
|
463 |
+
except Exception as shutdown_err:
|
464 |
+
logger.error(f"Shutdown error: {shutdown_err}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
465 |
logger.info("ASGI Lifespan: Shutdown complete.")
|
466 |
|
467 |
+
# --- Flask App Setup ---
|
|
|
468 |
# WSGI application that holds the HTTP routes defined below; it is mounted at
# "/" inside the Starlette ASGI app through WSGIMiddleware later in this file.
flask_core_app = Flask(__name__)
|
|
|
469 |
|
|
|
470 |
@flask_core_app.route('/')
def index():
    """Basic health check endpoint.

    Returns a plain-text status line that includes the bot's username when
    the PTB application and its bot object are available, and "Unknown"
    otherwise.
    """
    bot_available = ptb_app and ptb_app.bot
    status_text = f"Running (@{ptb_app.bot.username})" if bot_available else "Unknown"
    return f"Telegram Bot Summarizer - Status: {status_text}"
|
|
|
477 |
|
478 |
@flask_core_app.route('/webhook', methods=['POST'])
async def webhook() -> Response:
    """Webhook endpoint called by Telegram.

    Returns:
        A ``Response`` with status 503 when the PTB application has not been
        created yet, 400 when the request body is missing or is not valid
        JSON, 500 when building or processing the update raises, and 200
        once the update has been processed.
    """
    if not ptb_app:
        return Response('Bot not initialized', status=503)

    # silent=True makes a malformed body or a non-JSON content type yield None
    # instead of raising, so those requests get the 400 below rather than
    # being reported as a 500 by the broad handler further down.
    update_data = request.get_json(silent=True)
    if not update_data:
        return Response('Bad Request', status=400)

    try:
        update = Update.de_json(update_data, ptb_app.bot)
        await ptb_app.process_update(update)
    except Exception as e:
        # Include the traceback, matching the other error logs in this file.
        logger.error(f"Webhook error: {e}", exc_info=True)
        return Response('Internal Server Error', status=500)

    return Response('ok', status=200)
|
|
|
496 |
|
497 |
# --- Create Starlette ASGI Application ---
|
498 |
app = Starlette(
|
|
|
502 |
Mount("/", app=WSGIMiddleware(flask_core_app))
|
503 |
]
|
504 |
)
|
505 |
+
logger.info("Starlette ASGI application created.")
|
|
|
506 |
|
507 |
# --- Development Server Execution Block ---
|
508 |
if __name__ == '__main__':
    logger.warning("Running in development mode (Flask server only)")
    local_port = int(os.environ.get('PORT', 8080))
    # The server binds to all interfaces, so the Werkzeug interactive debugger
    # (which allows arbitrary code execution) must not be on by default.
    # Opt in explicitly with FLASK_DEBUG=1 when debugging locally.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    flask_core_app.run(host='0.0.0.0', port=local_port, debug=debug_enabled, use_reloader=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|