Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# main.py (
|
2 |
import os
|
3 |
import re
|
4 |
import logging
|
@@ -30,7 +30,7 @@ from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest, Teleg
|
|
30 |
from telegram.request import HTTPXRequest, BaseRequest
|
31 |
|
32 |
# --- Other Libraries ---
|
33 |
-
import httpx
|
34 |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
35 |
from bs4 import BeautifulSoup
|
36 |
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, before_sleep_log
|
@@ -40,15 +40,12 @@ try:
|
|
40 |
except ImportError:
|
41 |
DEFAULT_PARSER = 'html.parser'
|
42 |
|
43 |
-
# NOTE: apify-client is NOT used, as we replicate the REST API call from Colab
|
44 |
-
|
45 |
# --- Logging Setup ---
|
46 |
logging.basicConfig(
|
47 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
48 |
level=logging.INFO
|
49 |
)
|
50 |
logging.getLogger("httpx").setLevel(logging.WARNING)
|
51 |
-
# No apify_client logger needed
|
52 |
logging.getLogger("telegram.ext").setLevel(logging.INFO)
|
53 |
logging.getLogger('telegram.bot').setLevel(logging.INFO)
|
54 |
logging.getLogger("urllib3").setLevel(logging.INFO)
|
@@ -72,36 +69,28 @@ def get_secret(secret_name):
|
|
72 |
|
73 |
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
|
74 |
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY')
|
75 |
-
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')
|
76 |
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
|
77 |
-
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
|
78 |
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
|
79 |
|
80 |
-
# Configuration matching Colab script
|
81 |
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free")
|
82 |
-
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo/youtube-transcripts")
|
83 |
-
|
84 |
-
|
85 |
-
if not
|
86 |
-
|
87 |
-
|
88 |
-
if not
|
89 |
-
|
90 |
-
# Allow running without summary capability? For now, we'll let it run but log error.
|
91 |
-
# raise RuntimeError("Exiting: OpenRouter key missing.")
|
92 |
-
|
93 |
-
# Log warnings for optional keys (used in fallbacks)
|
94 |
-
if not URLTOTEXT_API_KEY: logger.warning("⚠️ WARNING: URLTOTEXT_API_KEY not found. Fallback website scraping unavailable.")
|
95 |
-
if not SUPADATA_API_KEY: logger.warning("⚠️ WARNING: SUPADATA_API_KEY not found. First YT transcript fallback unavailable.")
|
96 |
-
if not APIFY_API_TOKEN: logger.warning("⚠️ WARNING: APIFY_API_TOKEN not found. Second YT transcript fallback unavailable.")
|
97 |
|
98 |
logger.info("Secret loading and configuration check finished.")
|
99 |
logger.info(f"Using OpenRouter Model: {OPENROUTER_MODEL}")
|
100 |
logger.info(f"Using Apify Actor (via REST): {APIFY_ACTOR_ID}")
|
101 |
|
102 |
-
_apify_token_exists = bool(APIFY_API_TOKEN)
|
103 |
|
104 |
-
# --- Retry Decorator
|
105 |
@retry( stop=stop_after_attempt(4), wait=wait_exponential(multiplier=1, min=2, max=15),
|
106 |
retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)),
|
107 |
before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True )
|
@@ -114,7 +103,7 @@ async def retry_bot_operation(func, *args, **kwargs):
|
|
114 |
except TelegramError as e: logger.warning(f"TelegramError (will retry if applicable): {e}"); raise
|
115 |
except Exception as e: logger.error(f"Unexpected error during bot operation: {e}", exc_info=True); raise
|
116 |
|
117 |
-
# --- Helper Functions
|
118 |
def is_youtube_url(url):
|
119 |
youtube_regex = re.compile( r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/' r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE)
|
120 |
match = youtube_regex.search(url); logger.debug(f"is_youtube_url '{url}': {bool(match)}"); return bool(match)
|
@@ -124,16 +113,9 @@ def extract_youtube_id(url):
|
|
124 |
if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id
|
125 |
else: logger.warning(f"Could not extract YT ID from {url}"); return None
|
126 |
|
127 |
-
# --- Content Fetching Functions
|
128 |
-
|
129 |
-
# Generic fetcher used by website scraping (similar to previous version)
|
130 |
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
|
131 |
-
|
132 |
-
headers = { # Headers from Colab script
|
133 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
134 |
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
135 |
-
'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1'
|
136 |
-
}
|
137 |
try:
|
138 |
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
|
139 |
logger.debug(f"[Web Scrape] Sending request to {url}")
|
@@ -141,106 +123,81 @@ async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[
|
|
141 |
logger.debug(f"[Web Scrape] Received response {response.status_code} from {url}")
|
142 |
response.raise_for_status()
|
143 |
content_type = response.headers.get('content-type', '').lower()
|
144 |
-
if 'html' not in content_type:
|
145 |
-
|
146 |
-
return None
|
147 |
-
try: return response.text # Let httpx handle encoding
|
148 |
except Exception as e: logger.error(f"[Web Scrape] Error decoding response for {url}: {e}"); return None
|
149 |
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape] HTTP error {e.response.status_code} fetching {url}: {e}")
|
150 |
except httpx.TimeoutException: logger.error(f"[Web Scrape] Timeout error fetching {url}")
|
151 |
except httpx.TooManyRedirects: logger.error(f"[Web Scrape] Too many redirects fetching {url}")
|
152 |
-
except httpx.RequestError as e: logger.error(f"[Web Scrape] Request error fetching {url}: {e}")
|
153 |
except Exception as e: logger.error(f"[Web Scrape] Unexpected error fetching {url}: {e}", exc_info=True)
|
154 |
return None
|
155 |
|
156 |
-
# --- YT Transcript Fetching ---
|
157 |
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
|
158 |
-
"""Fetches YouTube transcript using Supadata API (matching Colab endpoint)."""
|
159 |
if not video_id: logger.error("[Supadata] No video_id provided"); return None
|
160 |
if not api_key: logger.error("[Supadata] API key missing."); return None
|
161 |
logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}")
|
162 |
-
# Colab script uses /v1/youtube/transcript
|
163 |
api_endpoint = "https://api.supadata.net/v1/youtube/transcript"
|
164 |
-
params = {"videoId": video_id, "format": "text"}
|
165 |
-
headers = {"X-API-Key": api_key}
|
166 |
try:
|
167 |
async with httpx.AsyncClient(timeout=30.0) as client:
|
168 |
response = await client.get(api_endpoint, headers=headers, params=params)
|
169 |
logger.debug(f"[Supadata] Status code {response.status_code} for {video_id}")
|
170 |
if response.status_code == 200:
|
171 |
try:
|
172 |
-
# Try JSON first, then plain text as fallback (Colab logic)
|
173 |
try: data = response.json()
|
174 |
except json.JSONDecodeError: data = None
|
175 |
content = None
|
176 |
if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
|
177 |
-
if not content and response.text: content = response.text
|
178 |
-
if content and isinstance(content, str):
|
179 |
-
logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}")
|
180 |
-
return content.strip()
|
181 |
else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
|
182 |
except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None
|
183 |
-
elif response.status_code in [401, 403]: logger.error(f"[Supadata] Auth error ({response.status_code}). Check API key."); return None
|
184 |
elif response.status_code == 404: logger.warning(f"[Supadata] Not found (404) for {video_id}."); return None
|
185 |
else: logger.error(f"[Supadata] Unexpected status {response.status_code} for {video_id}. Resp: {response.text[:200]}"); return None
|
186 |
except httpx.TimeoutException: logger.error(f"[Supadata] Timeout connecting for {video_id}"); return None
|
187 |
-
except httpx.RequestError as e: logger.error(f"[Supadata] Request error for {video_id}: {e}"); return None
|
188 |
except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
|
189 |
|
190 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
|
191 |
-
|
192 |
-
global APIFY_ACTOR_ID # Use globally configured actor
|
193 |
if not video_url: logger.error("[Apify] No video_url provided"); return None
|
194 |
if not api_token: logger.error("[Apify] API token missing."); return None
|
195 |
logger.info(f"[Apify] Attempting fetch via REST for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
|
196 |
api_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
|
197 |
params = {"token": api_token}
|
198 |
-
|
199 |
-
payload = {
|
200 |
-
"urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5,
|
201 |
-
"channelHandleBoolean": False, "channelNameBoolean": False,
|
202 |
-
"datePublishedBoolean": False, "relativeDateTextBoolean": False,
|
203 |
-
# Add proxy config here if needed and configured via env vars
|
204 |
-
# "proxyOptions": { "useApifyProxy": True, "apifyProxyGroups": ["YOUR_PROXY_GROUP_IF_ANY"] },
|
205 |
-
}
|
206 |
headers = {"Content-Type": "application/json"}
|
207 |
try:
|
208 |
-
async with httpx.AsyncClient(timeout=90.0) as client:
|
209 |
logger.debug(f"[Apify] Sending request to run actor {APIFY_ACTOR_ID} synchronously for {video_url}")
|
210 |
-
response = await client.post(api_endpoint, headers=headers, params=params, json=payload)
|
211 |
logger.debug(f"[Apify] Received status code {response.status_code} for {video_url}")
|
212 |
if response.status_code == 200:
|
213 |
try:
|
214 |
results = response.json()
|
215 |
if isinstance(results, list) and len(results) > 0:
|
216 |
item = results[0]
|
217 |
-
# Parsing logic from Colab script
|
218 |
content = item.get("text") or item.get("transcript") or item.get("captions_concatenated")
|
219 |
-
if not content and item.get("captions") and isinstance(item["captions"], list):
|
220 |
-
|
221 |
-
content = " ".join(cap.get("text", "") for cap in item["captions"] if cap.get("text"))
|
222 |
-
if content and isinstance(content, str):
|
223 |
-
logger.info(f"[Apify] Success via REST for {video_url}. Length: {len(content)}")
|
224 |
-
return content.strip()
|
225 |
else: logger.warning(f"[Apify] Actor success but transcript empty/not found for {video_url}. Item: {item}"); return None
|
226 |
else: logger.warning(f"[Apify] Actor success but dataset empty for {video_url}. Response: {results}"); return None
|
227 |
except json.JSONDecodeError: logger.error(f"[Apify] Failed JSON decode for {video_url}. Status:{response.status_code}. Resp:{response.text[:200]}"); return None
|
228 |
except Exception as e: logger.error(f"[Apify] Error processing success response for {video_url}: {e}", exc_info=True); return None
|
229 |
elif response.status_code == 400: logger.error(f"[Apify] Bad Request (400) for {video_url}. Check payload. Resp:{response.text[:200]}"); return None
|
230 |
-
elif response.status_code == 401: logger.error("[Apify] Auth error (401). Check token."); return None
|
231 |
else: logger.error(f"[Apify] Unexpected status {response.status_code} for {video_url}. Resp:{response.text[:200]}"); return None
|
232 |
except httpx.TimeoutException: logger.error(f"[Apify] Timeout running actor for {video_url}"); return None
|
233 |
except httpx.RequestError as e: logger.error(f"[Apify] Request error running actor for {video_url}: {e}"); return None
|
234 |
except Exception as e: logger.error(f"[Apify] Unexpected error during Apify REST call for {video_url}: {e}", exc_info=True); return None
|
235 |
|
236 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
|
237 |
-
|
238 |
-
global SUPADATA_API_KEY, APIFY_API_TOKEN # Access globally loaded keys
|
239 |
if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
|
240 |
logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
|
241 |
transcript_text = None
|
242 |
-
|
243 |
-
# 1. Primary: youtube-transcript-api
|
244 |
logger.info("[Primary YT] Attempting youtube-transcript-api...")
|
245 |
try:
|
246 |
transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
|
@@ -252,8 +209,6 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
|
|
252 |
if "No transcript found" in str(e): logger.warning(f"[Primary YT] No transcript found.")
|
253 |
elif "disabled" in str(e): logger.warning(f"[Primary YT] Transcripts disabled.")
|
254 |
transcript_text = None
|
255 |
-
|
256 |
-
# 2. Fallback 1: Supadata API
|
257 |
if transcript_text is None:
|
258 |
logger.info("[Fallback YT 1] Trying Supadata API...")
|
259 |
if SUPADATA_API_KEY:
|
@@ -261,8 +216,6 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
|
|
261 |
if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
|
262 |
else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
|
263 |
else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
|
264 |
-
|
265 |
-
# 3. Fallback 2: Apify REST API
|
266 |
if transcript_text is None:
|
267 |
logger.info("[Fallback YT 2] Trying Apify REST API...")
|
268 |
if APIFY_API_TOKEN:
|
@@ -270,25 +223,18 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
|
|
270 |
if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify REST for {video_url}"); return transcript_text
|
271 |
else: logger.warning(f"[Fallback YT 2] Apify REST failed or no content for {video_url}.")
|
272 |
else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
|
273 |
-
|
274 |
if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
|
275 |
-
return transcript_text
|
276 |
|
277 |
-
# --- Website Content Fetching ---
|
278 |
async def get_website_content(url: str) -> Optional[str]:
|
279 |
-
"""Primary: Scrapes website using httpx + BeautifulSoup (logic from Colab)."""
|
280 |
if not url: logger.error("get_website_content: No URL"); return None
|
281 |
logger.info(f"[Primary Web] Fetching website content for: {url}")
|
282 |
html_content = await fetch_url_content_for_scrape(url)
|
283 |
if not html_content: return None
|
284 |
try:
|
285 |
def parse_html(content):
|
286 |
-
# Use lxml if available, otherwise html.parser
|
287 |
soup = BeautifulSoup(content, DEFAULT_PARSER)
|
288 |
-
|
289 |
-
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "iframe", "img", "svg", "link", "meta", "noscript", "figure"]):
|
290 |
-
element.extract()
|
291 |
-
# Content finding logic from Colab script
|
292 |
main_content = soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(class_='content') or soup.find(id='main-content') or soup.find(class_='main-content') or soup.find(role='main')
|
293 |
target_element = main_content if main_content else soup.body
|
294 |
if not target_element: logger.warning(f"[Primary Web] Could not find body/main for parsing {url}"); return None
|
@@ -296,94 +242,52 @@ async def get_website_content(url: str) -> Optional[str]:
|
|
296 |
text = " ".join(lines)
|
297 |
if not text: logger.warning(f"[Primary Web] Extracted text empty after clean for {url}"); return None
|
298 |
return text
|
299 |
-
|
300 |
text_content = await asyncio.to_thread(parse_html, html_content)
|
301 |
if text_content: logger.info(f"[Primary Web] Success scrape for {url} (final len: {len(text_content)})"); return text_content
|
302 |
else: return None
|
303 |
except Exception as e: logger.error(f"[Primary Web] Error scraping/parsing {url}: {e}", exc_info=True); return None
|
304 |
|
305 |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
306 |
-
"""Fallback: Fetches website content using urltotext.com API (Colab endpoint)."""
|
307 |
if not url: logger.error("[Fallback Web API] No URL"); return None
|
308 |
if not api_key: logger.error("[Fallback Web API] urltotext.com API key missing."); return None
|
309 |
logger.info(f"[Fallback Web API] Attempting fetch for: {url} using urltotext.com API")
|
310 |
-
# Endpoint and payload from Colab script
|
311 |
api_endpoint = "https://urltotext.com/api/v1/urltotext/"
|
312 |
payload = { "url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False }
|
313 |
-
headers = { "Authorization": f"Token {api_key}", "Content-Type": "application/json" }
|
314 |
try:
|
315 |
-
async with httpx.AsyncClient(timeout=45.0) as client:
|
316 |
logger.debug(f"[Fallback Web API] Sending request to urltotext.com API for {url}")
|
317 |
response = await client.post(api_endpoint, headers=headers, json=payload)
|
318 |
logger.debug(f"[Fallback Web API] Received status {response.status_code} from urltotext.com API for {url}")
|
319 |
if response.status_code == 200:
|
320 |
try:
|
321 |
data = response.json()
|
322 |
-
content = data.get("data", {}).get("content")
|
323 |
-
credits = data.get("credits_used", "N/A")
|
324 |
-
warning = data.get("data", {}).get("warning")
|
325 |
if warning: logger.warning(f"[Fallback Web API] urltotext.com API Warning for {url}: {warning}")
|
326 |
if content: logger.info(f"[Fallback Web API] Success via urltotext.com API for {url}. Len: {len(content)}. Credits: {credits}"); return content.strip()
|
327 |
else: logger.warning(f"[Fallback Web API] urltotext.com API success but content empty for {url}. Resp: {data}"); return None
|
328 |
except json.JSONDecodeError: logger.error(f"[Fallback Web API] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None
|
329 |
except Exception as e: logger.error(f"[Fallback Web API] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None
|
330 |
-
# Error codes from Colab script check
|
331 |
elif response.status_code in [400, 401, 402, 403, 422, 500]: logger.error(f"[Fallback Web API] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
332 |
else: logger.error(f"[Fallback Web API] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
333 |
except httpx.TimeoutException: logger.error(f"[Fallback Web API] Timeout connecting to urltotext.com API for {url}"); return None
|
334 |
except httpx.RequestError as e: logger.error(f"[Fallback Web API] Request error connecting to urltotext.com API for {url}: {e}"); return None
|
335 |
except Exception as e: logger.error(f"[Fallback Web API] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
|
336 |
|
337 |
-
# --- Summarization Function
|
338 |
async def generate_summary(text: str, summary_type: str) -> str:
|
339 |
-
|
340 |
-
global OPENROUTER_API_KEY, OPENROUTER_MODEL # Use globally loaded config
|
341 |
logger.info(f"Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
|
342 |
if not OPENROUTER_API_KEY: logger.error("OpenRouter key missing for generate_summary."); return "Error: AI model configuration key missing."
|
343 |
-
|
344 |
-
|
345 |
-
if summary_type == "paragraph":
|
346 |
-
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n"
|
347 |
-
"• Clear and simple language suitable for someone unfamiliar with the topic.\n"
|
348 |
-
"• Uses British English spellings throughout.\n"
|
349 |
-
"• Straightforward and understandable vocabulary; avoid complex terms.\n"
|
350 |
-
"• Presented as ONE SINGLE PARAGRAPH.\n"
|
351 |
-
"• No more than 85 words maximum; but does not have to be exactly 85.\n"
|
352 |
-
"• Considers the entire text content equally.\n"
|
353 |
-
"• Uses semicolons (;) instead of em dashes (– or —).\n\n"
|
354 |
-
"Here is the text to summarise:")
|
355 |
-
else: # points
|
356 |
-
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this Markdown format:\n\n"
|
357 |
-
"• For each distinct topic or section identified in the text, create a heading.\n"
|
358 |
-
"• Each heading MUST be enclosed in double asterisks for bolding (e.g., **Section Title**).\n"
|
359 |
-
"• Immediately following each heading, list the key points as a bulleted list.\n"
|
360 |
-
"• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n"
|
361 |
-
"• The text within each bullet point should NOT contain any bold formatting.\n"
|
362 |
-
"• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n"
|
363 |
-
"• Use British English spellings throughout.\n"
|
364 |
-
"• Avoid overly complex or advanced vocabulary.\n"
|
365 |
-
"• Keep bullet points concise.\n"
|
366 |
-
"• Ensure the entire summary takes no more than two minutes to read.\n"
|
367 |
-
"• Consider the entire text's content, not just the beginning or a few topics.\n"
|
368 |
-
"• Use semicolons (;) instead of em dashes (– or —).\n\n"
|
369 |
-
"Here is the text to summarise:")
|
370 |
-
|
371 |
-
# Limit input length (Colab script used 500k, adjust if needed)
|
372 |
MAX_INPUT_LENGTH = 500000
|
373 |
-
if len(text) > MAX_INPUT_LENGTH:
|
374 |
-
logger.warning(f"Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH}). Truncating.")
|
375 |
-
text = text[:MAX_INPUT_LENGTH] + "... (Content truncated)"
|
376 |
full_prompt = f"{prompt}\n\n{text}"
|
377 |
-
|
378 |
-
headers = { "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json" }
|
379 |
-
payload = { "model": OPENROUTER_MODEL, "messages": [{"role": "user", "content": full_prompt}] }
|
380 |
-
openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
|
381 |
-
|
382 |
try:
|
383 |
-
async with httpx.AsyncClient(timeout=60.0) as client:
|
384 |
-
logger.debug(f"Sending request to OpenRouter ({OPENROUTER_MODEL})...")
|
385 |
-
response = await client.post(openrouter_api_endpoint, headers=headers, json=payload)
|
386 |
-
logger.debug(f"Received status {response.status_code} from OpenRouter.")
|
387 |
if response.status_code == 200:
|
388 |
try:
|
389 |
data = response.json()
|
@@ -397,7 +301,6 @@ async def generate_summary(text: str, summary_type: str) -> str:
|
|
397 |
else: logger.error(f"Unexpected choices structure in OpenRouter resp: {data.get('choices')}. Full: {data}"); return "Sorry, could not parse AI response (choices)."
|
398 |
except json.JSONDecodeError: logger.error(f"Failed JSON decode OpenRouter. Status:{response.status_code}. Resp:{response.text[:500]}"); return "Sorry, failed to understand AI response."
|
399 |
except Exception as e: logger.error(f"Error processing OpenRouter success response: {e}", exc_info=True); return "Sorry, error processing AI response."
|
400 |
-
# Error handling matching Colab script
|
401 |
elif response.status_code == 401: logger.error("OpenRouter API key invalid (401)."); return "Error: AI model configuration key is invalid."
|
402 |
elif response.status_code == 402: logger.error("OpenRouter Payment Required (402)."); return "Sorry, AI service limits/payment issue."
|
403 |
elif response.status_code == 429: logger.warning("OpenRouter Rate Limit Exceeded (429)."); return "Sorry, AI model is busy. Try again."
|
@@ -407,122 +310,78 @@ async def generate_summary(text: str, summary_type: str) -> str:
|
|
407 |
except httpx.RequestError as e: logger.error(f"Request error connecting to OpenRouter API: {e}"); return "Sorry, error connecting to AI service."
|
408 |
except Exception as e: logger.error(f"Unexpected error in generate_summary (OpenRouter): {e}", exc_info=True); return "Sorry, unexpected error generating summary."
|
409 |
|
410 |
-
# --- Background Task Processing
|
411 |
async def process_summary_task(
|
412 |
user_id: int, chat_id: int, message_id_to_edit: Optional[int],
|
413 |
-
url: str, summary_type: str, bot_token: str
|
414 |
) -> None:
|
415 |
task_id = f"{user_id}-{message_id_to_edit or 'new'}"
|
416 |
logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
|
417 |
-
background_request: Optional[BaseRequest] = None
|
418 |
-
|
419 |
-
try: # Create background bot instance
|
420 |
-
background_request = HTTPXRequest( connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0 )
|
421 |
-
bot = Bot(token=bot_token, request=background_request)
|
422 |
except Exception as e: logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True); return
|
423 |
-
|
424 |
content = None; user_feedback_message = None; success = False
|
425 |
-
status_message_id = message_id_to_edit
|
426 |
-
message_to_delete_later_id : Optional[int] = None # Track ID of new status message
|
427 |
-
|
428 |
try:
|
429 |
-
# --- Inform User Processing Started ---
|
430 |
processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nThis might take a moment..."
|
431 |
if status_message_id:
|
432 |
-
try:
|
433 |
-
|
434 |
-
|
435 |
-
logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
|
436 |
-
except Exception as e:
|
437 |
-
logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new.")
|
438 |
-
status_message_id = None # Will trigger sending new message
|
439 |
-
if not status_message_id: # Send new status message if needed
|
440 |
try:
|
441 |
status_message = await retry_bot_operation( bot.send_message, chat_id=chat_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN )
|
442 |
if status_message: message_to_delete_later_id = status_message.message_id; logger.debug(f"[Task {task_id}] Sent new status message {message_to_delete_later_id}")
|
443 |
else: raise RuntimeError("Failed to send status message after retries.")
|
444 |
-
except Exception as e: logger.error(f"[Task {task_id}] CRITICAL: Failed to send new status message: {e}"); raise
|
445 |
-
|
446 |
-
# --- Main Fetching & Summarization ---
|
447 |
try:
|
448 |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
449 |
-
is_youtube = is_youtube_url(url)
|
450 |
-
logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}")
|
451 |
-
|
452 |
if is_youtube:
|
453 |
video_id = extract_youtube_id(url)
|
454 |
-
if video_id: content = await get_youtube_transcript(video_id, url)
|
455 |
else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
|
456 |
-
if not content and not user_feedback_message:
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
if not content: # Try fallback
|
461 |
logger.warning(f"[Task {task_id}] Primary web scrape failed for {url}. Trying fallback API.")
|
462 |
-
global URLTOTEXT_API_KEY
|
463 |
if URLTOTEXT_API_KEY:
|
464 |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
465 |
-
content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
|
466 |
if not content: user_feedback_message = "Sorry, I couldn't fetch content from that website using either method (blocked/inaccessible/empty?)."
|
467 |
-
else:
|
468 |
-
user_feedback_message = "Sorry, I couldn't fetch content from that website (blocked/inaccessible/empty?). The fallback method is not configured."
|
469 |
-
|
470 |
-
# --- Generate Summary ---
|
471 |
if content:
|
472 |
logger.info(f"[Task {task_id}] Content fetched (len:{len(content)}). Generating summary.")
|
473 |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
474 |
-
# Use new generate_summary function (keys accessed globally within it)
|
475 |
final_summary = await generate_summary(content, summary_type)
|
476 |
-
if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
|
477 |
-
user_feedback_message = final_summary # Use error from summary func
|
478 |
-
logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
|
479 |
else:
|
480 |
-
|
481 |
-
|
482 |
-
summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
|
483 |
-
# Send first/only part
|
484 |
-
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0],
|
485 |
-
parse_mode=ParseMode.MARKDOWN, link_preview_options={'is_disabled': True} )
|
486 |
-
# Send subsequent parts
|
487 |
for part in summary_parts[1:]: await asyncio.sleep(0.5); await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=ParseMode.MARKDOWN, link_preview_options={'is_disabled': True} )
|
488 |
-
success = True; logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts).")
|
489 |
-
|
490 |
-
|
491 |
-
# --- Send Feedback if Fetching or Summary Failed ---
|
492 |
-
elif user_feedback_message: # Only send if content failed AND message exists
|
493 |
-
logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}")
|
494 |
-
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} )
|
495 |
-
|
496 |
except Exception as e:
|
497 |
-
logger.error(f"[Task {task_id}] Unexpected error during processing: {e}", exc_info=True)
|
498 |
-
user_feedback_message = "Oops! Something went really wrong while processing your request. Please try again later."
|
499 |
-
# Ensure we send this feedback if an unexpected exception occurs
|
500 |
try: await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message )
|
501 |
except Exception: logger.error(f"[Task {task_id}] Failed to send unexpected error feedback.")
|
502 |
-
|
503 |
-
except Exception as outer_e: # Catch critical errors like failing to send status message
|
504 |
logger.critical(f"[Task {task_id}] Critical outer error: {outer_e}", exc_info=True)
|
505 |
try:
|
506 |
if bot: await retry_bot_operation( bot.send_message, chat_id=chat_id, text="❌ Critical internal error occurred." )
|
507 |
except Exception: logger.exception(f"[Task {task_id}] Failed even to send critical error message.")
|
508 |
-
|
509 |
finally:
|
510 |
-
# --- Cleanup ---
|
511 |
delete_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id
|
512 |
if delete_target_id and bot:
|
513 |
-
try:
|
514 |
-
# Delete the original button message OR the status message we sent
|
515 |
-
await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=delete_target_id)
|
516 |
-
logger.debug(f"[Task {task_id}] Deleted status/button message {delete_target_id}")
|
517 |
except Exception as del_e: logger.warning(f"[Task {task_id}] Failed to delete status/button message {delete_target_id}: {del_e}")
|
518 |
-
# Close background bot's httpx client
|
519 |
if background_request and hasattr(background_request, '_client') and background_request._client:
|
520 |
try: await background_request._client.aclose(); logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.")
|
521 |
except Exception as e: logger.warning(f"[Task {task_id}] Error closing background bot's client: {e}")
|
522 |
logger.info(f"[Task {task_id}] Task completed. Success: {success}")
|
523 |
|
524 |
-
|
525 |
-
# --- Telegram Bot Handlers (Unchanged structure, Colab text/logic adjusted) ---
|
526 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
527 |
user = update.effective_user; mention = user.mention_html()
|
528 |
if not user or not update.message: return
|
@@ -533,105 +392,72 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
|
|
533 |
user = update.effective_user
|
534 |
if not user or not update.message: return
|
535 |
logger.info(f"User {user.id} used /help.")
|
536 |
-
|
537 |
-
help_text = ( "🔍 How to use this bot:\n\n"
|
538 |
-
"1. Send me any YouTube video link or website URL.\n"
|
539 |
-
"2. I'll ask you how you want it summarized (paragraph or points).\n"
|
540 |
-
"3. Click the button for your choice.\n"
|
541 |
-
"4. Wait for the summary!\n\n"
|
542 |
-
"I'll try multiple methods to get content if the first one fails (especially for YouTube transcripts).\n\n"
|
543 |
-
"Commands:\n"
|
544 |
-
"`/start` - Display welcome message\n"
|
545 |
-
"`/help` - Show this help message" )
|
546 |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
|
547 |
|
548 |
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
549 |
if not update.message or not update.message.text: return
|
550 |
url = update.message.text.strip(); user = update.effective_user
|
551 |
if not user: return
|
552 |
-
|
553 |
-
if not (url.startswith('http://') or url.startswith('https://')) or '.' not in url[8:]:
|
554 |
-
logger.debug(f"Ignoring non-URL from {user.id}: {url}"); return
|
555 |
logger.info(f"User {user.id} sent potential URL: {url}")
|
556 |
-
context.user_data['url_to_summarize'] = url
|
557 |
-
context.user_data['original_message_id'] = update.message.message_id # Still useful potentially
|
558 |
-
# Keyboard text from Colab script
|
559 |
keyboard = [[ InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"), InlineKeyboardButton("Points Summary", callback_data="points") ]]
|
560 |
reply_markup = InlineKeyboardMarkup(keyboard)
|
561 |
-
|
562 |
-
await update.message.reply_text( f"Okay, I see this link:\n{url}\n\nHow would you like it summarized?",
|
563 |
-
reply_markup=reply_markup, disable_web_page_preview=True )
|
564 |
|
565 |
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
566 |
-
"""Handles button press, retrieves URL, and schedules background task."""
|
567 |
query = update.callback_query
|
568 |
if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
|
569 |
user = query.from_user; summary_type = query.data; query_id = query.id
|
570 |
try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id}")
|
571 |
except Exception as e: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True)
|
572 |
-
|
573 |
-
url = context.user_data.get('url_to_summarize')
|
574 |
-
message_id_to_edit = query.message.message_id # Use the message with the buttons
|
575 |
logger.info(f"User {user.id} chose '{summary_type}' for msg {message_id_to_edit}. URL in context: {'Yes' if url else 'No'}")
|
576 |
-
|
577 |
if not url:
|
578 |
logger.warning(f"No URL in context for user {user.id} (cb {query_id}).")
|
579 |
-
try:
|
580 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
581 |
return
|
582 |
|
583 |
-
|
584 |
-
context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None)
|
585 |
-
logger.debug(f"Cleared URL context for user {user.id}")
|
586 |
-
|
587 |
-
# Check essential keys needed for the task *before* scheduling
|
588 |
global TELEGRAM_TOKEN, OPENROUTER_API_KEY
|
589 |
if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing!"); try: await query.edit_message_text(text="❌ Bot config error.") except Exception: pass; return
|
590 |
if not OPENROUTER_API_KEY: logger.error("OpenRouter key missing!"); try: await query.edit_message_text(text="❌ AI config error.") except Exception: pass; return
|
591 |
-
|
592 |
logger.info(f"Scheduling task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit}")
|
593 |
-
|
594 |
-
asyncio.create_task( process_summary_task( user_id=user.id, chat_id=query.message.chat_id, message_id_to_edit=message_id_to_edit,
|
595 |
-
url=url, summary_type=summary_type, bot_token=TELEGRAM_TOKEN ), name=f"SummaryTask-{user.id}-{message_id_to_edit}" )
|
596 |
|
597 |
-
# --- Error Handler, Bot Setup, Lifespan, Routes (Largely Unchanged, Ensure Keys Read) ---
|
598 |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
|
599 |
-
|
600 |
-
|
601 |
-
ignore_errors = (AttributeError, ) # Example: Ignore cleanup errors if handled in finally blocks
|
602 |
-
if isinstance(context.error, ignore_errors) and "object has no attribute" in str(context.error): # Be more specific
|
603 |
-
logger.debug(f"Ignoring known/handled error in error_handler: {context.error}")
|
604 |
-
return
|
605 |
logger.error("Exception while handling an update:", exc_info=context.error)
|
606 |
|
|
|
607 |
async def setup_bot_config() -> Application:
|
608 |
-
"
|
609 |
-
logger.info("Configuring Telegram Application...")
|
610 |
-
global TELEGRAM_TOKEN # Ensure global token is accessible
|
611 |
if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
|
612 |
custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 )
|
613 |
application = Application.builder().token(TELEGRAM_TOKEN).request(custom_request).build()
|
614 |
-
application.add_handler(CommandHandler("start", start))
|
615 |
-
application.add_handler(
|
616 |
-
application.
|
617 |
-
application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
|
618 |
-
application.add_error_handler(error_handler)
|
619 |
-
logger.info("Telegram application handlers configured.")
|
620 |
-
return application
|
621 |
|
|
|
622 |
@contextlib.asynccontextmanager
|
623 |
async def lifespan(app: Starlette):
|
624 |
-
|
625 |
-
|
626 |
-
logger.info("ASGI Lifespan: Startup initiated...")
|
627 |
-
# Essential key check already happened globally, but double-check token
|
628 |
if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.")
|
629 |
try:
|
630 |
-
ptb_app = await setup_bot_config()
|
631 |
-
await ptb_app.initialize()
|
632 |
-
bot_info = await ptb_app.bot.get_me()
|
633 |
-
logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})")
|
634 |
-
# --- Webhook setup (unchanged from previous version) ---
|
635 |
current_webhook_info = await ptb_app.bot.get_webhook_info()
|
636 |
if current_webhook_info and current_webhook_info.url:
|
637 |
logger.info(f"Found existing webhook: {current_webhook_info.url}. Deleting...")
|
@@ -639,81 +465,64 @@ async def lifespan(app: Starlette):
|
|
639 |
if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Webhook deleted.")
|
640 |
else: logger.warning("Failed delete webhook (API returned False).")
|
641 |
except Exception as e: logger.warning(f"Could not delete webhook: {e}"); await asyncio.sleep(1)
|
642 |
-
space_host = os.environ.get("SPACE_HOST")
|
643 |
-
webhook_path = "/webhook"; full_webhook_url = None
|
644 |
if space_host:
|
645 |
-
protocol = "https://"; host = space_host.split('://')[-1]
|
646 |
-
full_webhook_url = f"{protocol}{host.rstrip('/')}{webhook_path}"
|
647 |
if full_webhook_url:
|
648 |
-
logger.info(f"Setting webhook: {full_webhook_url}")
|
649 |
-
set_webhook_args = { "url": full_webhook_url, "allowed_updates": Update.ALL_TYPES, "drop_pending_updates": True }
|
650 |
if WEBHOOK_SECRET: set_webhook_args["secret_token"] = WEBHOOK_SECRET; logger.info("Using webhook secret.")
|
651 |
await asyncio.sleep(1.0)
|
652 |
try:
|
653 |
-
await ptb_app.bot.set_webhook(**set_webhook_args)
|
654 |
-
webhook_info = await ptb_app.bot.get_webhook_info()
|
655 |
if webhook_info.url == full_webhook_url: logger.info(f"Webhook set: URL='{webhook_info.url}', Secret={bool(WEBHOOK_SECRET)}")
|
656 |
else: logger.error(f"Webhook URL mismatch! Expected '{full_webhook_url}', Got '{webhook_info.url}'"); raise RuntimeError("Webhook URL mismatch.")
|
657 |
-
await ptb_app.start()
|
658 |
-
logger.info("PTB Application started (webhook mode).")
|
659 |
except Exception as e: logger.error(f"FATAL: Failed set webhook: {e}", exc_info=True); raise RuntimeError(f"Failed set webhook: {e}") from e
|
660 |
else: logger.critical("Could not construct webhook URL."); raise RuntimeError("Webhook URL undetermined.")
|
661 |
else: logger.critical("SPACE_HOST missing."); raise RuntimeError("SPACE_HOST env var missing.")
|
662 |
-
|
663 |
-
logger.info("ASGI Lifespan: Startup complete.")
|
664 |
-
yield # App runs
|
665 |
except Exception as startup_err:
|
666 |
logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
|
667 |
if ptb_app:
|
668 |
if ptb_app.running: await ptb_app.stop()
|
669 |
await ptb_app.shutdown()
|
670 |
raise
|
671 |
-
finally:
|
672 |
logger.info("ASGI Lifespan: Shutdown initiated...")
|
673 |
if ptb_app:
|
674 |
if ptb_app.running: logger.info("Stopping PTB..."); await ptb_app.stop()
|
675 |
-
logger.info("Shutting down PTB..."); await ptb_app.shutdown()
|
676 |
-
logger.info("PTB Application shut down.")
|
677 |
else: logger.info("PTB application not initialized or failed.")
|
678 |
logger.info("ASGI Lifespan: Shutdown complete.")
|
679 |
|
|
|
680 |
async def health_check(request: Request) -> PlainTextResponse:
|
681 |
-
|
682 |
-
global OPENROUTER_MODEL, APIFY_ACTOR_ID, _apify_token_exists
|
683 |
-
bot_status = "Not Initialized"
|
684 |
if ptb_app and ptb_app.bot:
|
685 |
try:
|
686 |
if ptb_app.running: bot_info = await ptb_app.bot.get_me(); bot_status = f"Running (@{bot_info.username})"
|
687 |
else: bot_status = "Initialized/Not running"
|
688 |
except Exception as e: bot_status = f"Error checking status: {e}"
|
689 |
-
# Include model/actor info in health check
|
690 |
return PlainTextResponse(f"TG Bot Summarizer - Status: {bot_status}\nModel: {OPENROUTER_MODEL}\nApify Actor: {APIFY_ACTOR_ID if _apify_token_exists else 'N/A (No Token)'}")
|
691 |
|
692 |
async def telegram_webhook(request: Request) -> Response:
|
693 |
-
|
694 |
-
global WEBHOOK_SECRET # Access global
|
695 |
if not ptb_app: logger.error("Webhook recv but PTB not initialized."); return PlainTextResponse('Bot not initialized', status_code=503)
|
696 |
if not ptb_app.running: logger.warning("Webhook recv but PTB not running."); return PlainTextResponse('Bot not running', status_code=503)
|
697 |
try:
|
698 |
-
# Secret check (unchanged)
|
699 |
if WEBHOOK_SECRET:
|
700 |
token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
|
701 |
if token_header != WEBHOOK_SECRET: logger.warning(f"Webhook invalid secret. Header: '{token_header}'"); return Response(content="Invalid secret token", status_code=403)
|
702 |
-
update_data = await request.json()
|
703 |
-
update
|
704 |
-
logger.debug(f"Processing update_id: {update.update_id} via webhook")
|
705 |
-
await ptb_app.process_update(update)
|
706 |
-
return Response(status_code=200) # OK
|
707 |
except json.JSONDecodeError: logger.error("Webhook invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
|
708 |
except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200) # OK despite error
|
709 |
|
710 |
-
# --- Create Starlette ASGI Application
|
711 |
-
app = Starlette( debug=False, lifespan=lifespan, routes=[
|
712 |
-
Route("/", endpoint=health_check, methods=["GET"]),
|
713 |
-
Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )
|
714 |
logger.info("Starlette ASGI application created with native routes.")
|
715 |
|
716 |
-
# --- Development Server Block
|
717 |
if __name__ == '__main__':
|
718 |
import uvicorn
|
719 |
logger.warning("Running in development mode using Uvicorn directly")
|
|
|
1 |
+
# main.py (Correcting SyntaxError at line 580)
|
2 |
import os
|
3 |
import re
|
4 |
import logging
|
|
|
30 |
from telegram.request import HTTPXRequest, BaseRequest
|
31 |
|
32 |
# --- Other Libraries ---
|
33 |
+
import httpx
|
34 |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
35 |
from bs4 import BeautifulSoup
|
36 |
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, before_sleep_log
|
|
|
40 |
except ImportError:
|
41 |
DEFAULT_PARSER = 'html.parser'
|
42 |
|
|
|
|
|
43 |
# --- Logging Setup ---
|
44 |
logging.basicConfig(
|
45 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
46 |
level=logging.INFO
|
47 |
)
|
48 |
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
|
49 |
logging.getLogger("telegram.ext").setLevel(logging.INFO)
|
50 |
logging.getLogger('telegram.bot').setLevel(logging.INFO)
|
51 |
logging.getLogger("urllib3").setLevel(logging.INFO)
|
|
|
69 |
|
70 |
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
|
71 |
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY')
|
72 |
+
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')
|
73 |
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
|
74 |
+
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
|
75 |
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
|
76 |
|
|
|
77 |
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free")
|
78 |
+
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo/youtube-transcripts")
|
79 |
+
|
80 |
+
if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
|
81 |
+
if not OPENROUTER_API_KEY: logger.error("❌ ERROR: OPENROUTER_API_KEY not found. Summarization will fail.")
|
82 |
+
|
83 |
+
if not URLTOTEXT_API_KEY: logger.warning("⚠️ WARNING: URLTOTEXT_API_KEY not found.")
|
84 |
+
if not SUPADATA_API_KEY: logger.warning("⚠️ WARNING: SUPADATA_API_KEY not found.")
|
85 |
+
if not APIFY_API_TOKEN: logger.warning("⚠️ WARNING: APIFY_API_TOKEN not found.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
logger.info("Secret loading and configuration check finished.")
|
88 |
logger.info(f"Using OpenRouter Model: {OPENROUTER_MODEL}")
|
89 |
logger.info(f"Using Apify Actor (via REST): {APIFY_ACTOR_ID}")
|
90 |
|
91 |
+
_apify_token_exists = bool(APIFY_API_TOKEN)
|
92 |
|
93 |
+
# --- Retry Decorator ---
|
94 |
@retry( stop=stop_after_attempt(4), wait=wait_exponential(multiplier=1, min=2, max=15),
|
95 |
retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)),
|
96 |
before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True )
|
|
|
103 |
except TelegramError as e: logger.warning(f"TelegramError (will retry if applicable): {e}"); raise
|
104 |
except Exception as e: logger.error(f"Unexpected error during bot operation: {e}", exc_info=True); raise
|
105 |
|
106 |
+
# --- Helper Functions ---
|
107 |
def is_youtube_url(url):
|
108 |
youtube_regex = re.compile( r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/' r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE)
|
109 |
match = youtube_regex.search(url); logger.debug(f"is_youtube_url '{url}': {bool(match)}"); return bool(match)
|
|
|
113 |
if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id
|
114 |
else: logger.warning(f"Could not extract YT ID from {url}"); return None
|
115 |
|
116 |
+
# --- Content Fetching Functions ---
|
|
|
|
|
117 |
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
|
118 |
+
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
|
|
|
|
|
|
|
|
|
|
|
119 |
try:
|
120 |
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
|
121 |
logger.debug(f"[Web Scrape] Sending request to {url}")
|
|
|
123 |
logger.debug(f"[Web Scrape] Received response {response.status_code} from {url}")
|
124 |
response.raise_for_status()
|
125 |
content_type = response.headers.get('content-type', '').lower()
|
126 |
+
if 'html' not in content_type: logger.warning(f"[Web Scrape] Non-HTML content type from {url}: {content_type}"); return None
|
127 |
+
try: return response.text
|
|
|
|
|
128 |
except Exception as e: logger.error(f"[Web Scrape] Error decoding response for {url}: {e}"); return None
|
129 |
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape] HTTP error {e.response.status_code} fetching {url}: {e}")
|
130 |
except httpx.TimeoutException: logger.error(f"[Web Scrape] Timeout error fetching {url}")
|
131 |
except httpx.TooManyRedirects: logger.error(f"[Web Scrape] Too many redirects fetching {url}")
|
132 |
+
except httpx.RequestError as e: logger.error(f"[Web Scrape] Request error fetching {url}: {e}")
|
133 |
except Exception as e: logger.error(f"[Web Scrape] Unexpected error fetching {url}: {e}", exc_info=True)
|
134 |
return None
|
135 |
|
|
|
136 |
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
|
|
|
137 |
if not video_id: logger.error("[Supadata] No video_id provided"); return None
|
138 |
if not api_key: logger.error("[Supadata] API key missing."); return None
|
139 |
logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}")
|
|
|
140 |
api_endpoint = "https://api.supadata.net/v1/youtube/transcript"
|
141 |
+
params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key}
|
|
|
142 |
try:
|
143 |
async with httpx.AsyncClient(timeout=30.0) as client:
|
144 |
response = await client.get(api_endpoint, headers=headers, params=params)
|
145 |
logger.debug(f"[Supadata] Status code {response.status_code} for {video_id}")
|
146 |
if response.status_code == 200:
|
147 |
try:
|
|
|
148 |
try: data = response.json()
|
149 |
except json.JSONDecodeError: data = None
|
150 |
content = None
|
151 |
if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
|
152 |
+
if not content and response.text: content = response.text
|
153 |
+
if content and isinstance(content, str): logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}"); return content.strip()
|
|
|
|
|
154 |
else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
|
155 |
except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None
|
156 |
+
elif response.status_code in [401, 403]: logger.error(f"[Supadata] Auth error ({response.status_code}). Check API key."); return None
|
157 |
elif response.status_code == 404: logger.warning(f"[Supadata] Not found (404) for {video_id}."); return None
|
158 |
else: logger.error(f"[Supadata] Unexpected status {response.status_code} for {video_id}. Resp: {response.text[:200]}"); return None
|
159 |
except httpx.TimeoutException: logger.error(f"[Supadata] Timeout connecting for {video_id}"); return None
|
160 |
+
except httpx.RequestError as e: logger.error(f"[Supadata] Request error for {video_id}: {e}"); return None
|
161 |
except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
|
162 |
|
163 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
|
164 |
+
global APIFY_ACTOR_ID
|
|
|
165 |
if not video_url: logger.error("[Apify] No video_url provided"); return None
|
166 |
if not api_token: logger.error("[Apify] API token missing."); return None
|
167 |
logger.info(f"[Apify] Attempting fetch via REST for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
|
168 |
api_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
|
169 |
params = {"token": api_token}
|
170 |
+
payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5, "channelHandleBoolean": False, "channelNameBoolean": False, "datePublishedBoolean": False, "relativeDateTextBoolean": False, }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
headers = {"Content-Type": "application/json"}
|
172 |
try:
|
173 |
+
async with httpx.AsyncClient(timeout=90.0) as client:
|
174 |
logger.debug(f"[Apify] Sending request to run actor {APIFY_ACTOR_ID} synchronously for {video_url}")
|
175 |
+
response = await client.post(api_endpoint, headers=headers, params=params, json=payload)
|
176 |
logger.debug(f"[Apify] Received status code {response.status_code} for {video_url}")
|
177 |
if response.status_code == 200:
|
178 |
try:
|
179 |
results = response.json()
|
180 |
if isinstance(results, list) and len(results) > 0:
|
181 |
item = results[0]
|
|
|
182 |
content = item.get("text") or item.get("transcript") or item.get("captions_concatenated")
|
183 |
+
if not content and item.get("captions") and isinstance(item["captions"], list): logger.info("[Apify] Processing 'captions' format."); content = " ".join(cap.get("text", "") for cap in item["captions"] if cap.get("text"))
|
184 |
+
if content and isinstance(content, str): logger.info(f"[Apify] Success via REST for {video_url}. Length: {len(content)}"); return content.strip()
|
|
|
|
|
|
|
|
|
185 |
else: logger.warning(f"[Apify] Actor success but transcript empty/not found for {video_url}. Item: {item}"); return None
|
186 |
else: logger.warning(f"[Apify] Actor success but dataset empty for {video_url}. Response: {results}"); return None
|
187 |
except json.JSONDecodeError: logger.error(f"[Apify] Failed JSON decode for {video_url}. Status:{response.status_code}. Resp:{response.text[:200]}"); return None
|
188 |
except Exception as e: logger.error(f"[Apify] Error processing success response for {video_url}: {e}", exc_info=True); return None
|
189 |
elif response.status_code == 400: logger.error(f"[Apify] Bad Request (400) for {video_url}. Check payload. Resp:{response.text[:200]}"); return None
|
190 |
+
elif response.status_code == 401: logger.error("[Apify] Auth error (401). Check token."); return None
|
191 |
else: logger.error(f"[Apify] Unexpected status {response.status_code} for {video_url}. Resp:{response.text[:200]}"); return None
|
192 |
except httpx.TimeoutException: logger.error(f"[Apify] Timeout running actor for {video_url}"); return None
|
193 |
except httpx.RequestError as e: logger.error(f"[Apify] Request error running actor for {video_url}: {e}"); return None
|
194 |
except Exception as e: logger.error(f"[Apify] Unexpected error during Apify REST call for {video_url}: {e}", exc_info=True); return None
|
195 |
|
196 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
|
197 |
+
global SUPADATA_API_KEY, APIFY_API_TOKEN
|
|
|
198 |
if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
|
199 |
logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
|
200 |
transcript_text = None
|
|
|
|
|
201 |
logger.info("[Primary YT] Attempting youtube-transcript-api...")
|
202 |
try:
|
203 |
transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
|
|
|
209 |
if "No transcript found" in str(e): logger.warning(f"[Primary YT] No transcript found.")
|
210 |
elif "disabled" in str(e): logger.warning(f"[Primary YT] Transcripts disabled.")
|
211 |
transcript_text = None
|
|
|
|
|
212 |
if transcript_text is None:
|
213 |
logger.info("[Fallback YT 1] Trying Supadata API...")
|
214 |
if SUPADATA_API_KEY:
|
|
|
216 |
if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
|
217 |
else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
|
218 |
else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
|
|
|
|
|
219 |
if transcript_text is None:
|
220 |
logger.info("[Fallback YT 2] Trying Apify REST API...")
|
221 |
if APIFY_API_TOKEN:
|
|
|
223 |
if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify REST for {video_url}"); return transcript_text
|
224 |
else: logger.warning(f"[Fallback YT 2] Apify REST failed or no content for {video_url}.")
|
225 |
else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
|
|
|
226 |
if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
|
227 |
+
return transcript_text
|
228 |
|
|
|
229 |
async def get_website_content(url: str) -> Optional[str]:
|
|
|
230 |
if not url: logger.error("get_website_content: No URL"); return None
|
231 |
logger.info(f"[Primary Web] Fetching website content for: {url}")
|
232 |
html_content = await fetch_url_content_for_scrape(url)
|
233 |
if not html_content: return None
|
234 |
try:
|
235 |
def parse_html(content):
|
|
|
236 |
soup = BeautifulSoup(content, DEFAULT_PARSER)
|
237 |
+
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "iframe", "img", "svg", "link", "meta", "noscript", "figure"]): element.extract()
|
|
|
|
|
|
|
238 |
main_content = soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(class_='content') or soup.find(id='main-content') or soup.find(class_='main-content') or soup.find(role='main')
|
239 |
target_element = main_content if main_content else soup.body
|
240 |
if not target_element: logger.warning(f"[Primary Web] Could not find body/main for parsing {url}"); return None
|
|
|
242 |
text = " ".join(lines)
|
243 |
if not text: logger.warning(f"[Primary Web] Extracted text empty after clean for {url}"); return None
|
244 |
return text
|
|
|
245 |
text_content = await asyncio.to_thread(parse_html, html_content)
|
246 |
if text_content: logger.info(f"[Primary Web] Success scrape for {url} (final len: {len(text_content)})"); return text_content
|
247 |
else: return None
|
248 |
except Exception as e: logger.error(f"[Primary Web] Error scraping/parsing {url}: {e}", exc_info=True); return None
|
249 |
|
250 |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
|
|
251 |
if not url: logger.error("[Fallback Web API] No URL"); return None
|
252 |
if not api_key: logger.error("[Fallback Web API] urltotext.com API key missing."); return None
|
253 |
logger.info(f"[Fallback Web API] Attempting fetch for: {url} using urltotext.com API")
|
|
|
254 |
api_endpoint = "https://urltotext.com/api/v1/urltotext/"
|
255 |
payload = { "url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False }
|
256 |
+
headers = { "Authorization": f"Token {api_key}", "Content-Type": "application/json" }
|
257 |
try:
|
258 |
+
async with httpx.AsyncClient(timeout=45.0) as client:
|
259 |
logger.debug(f"[Fallback Web API] Sending request to urltotext.com API for {url}")
|
260 |
response = await client.post(api_endpoint, headers=headers, json=payload)
|
261 |
logger.debug(f"[Fallback Web API] Received status {response.status_code} from urltotext.com API for {url}")
|
262 |
if response.status_code == 200:
|
263 |
try:
|
264 |
data = response.json()
|
265 |
+
content = data.get("data", {}).get("content"); credits = data.get("credits_used", "N/A"); warning = data.get("data", {}).get("warning")
|
|
|
|
|
266 |
if warning: logger.warning(f"[Fallback Web API] urltotext.com API Warning for {url}: {warning}")
|
267 |
if content: logger.info(f"[Fallback Web API] Success via urltotext.com API for {url}. Len: {len(content)}. Credits: {credits}"); return content.strip()
|
268 |
else: logger.warning(f"[Fallback Web API] urltotext.com API success but content empty for {url}. Resp: {data}"); return None
|
269 |
except json.JSONDecodeError: logger.error(f"[Fallback Web API] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None
|
270 |
except Exception as e: logger.error(f"[Fallback Web API] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None
|
|
|
271 |
elif response.status_code in [400, 401, 402, 403, 422, 500]: logger.error(f"[Fallback Web API] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
272 |
else: logger.error(f"[Fallback Web API] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
273 |
except httpx.TimeoutException: logger.error(f"[Fallback Web API] Timeout connecting to urltotext.com API for {url}"); return None
|
274 |
except httpx.RequestError as e: logger.error(f"[Fallback Web API] Request error connecting to urltotext.com API for {url}: {e}"); return None
|
275 |
except Exception as e: logger.error(f"[Fallback Web API] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
|
276 |
|
277 |
+
# --- Summarization Function ---
|
278 |
async def generate_summary(text: str, summary_type: str) -> str:
|
279 |
+
global OPENROUTER_API_KEY, OPENROUTER_MODEL
|
|
|
280 |
logger.info(f"Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
|
281 |
if not OPENROUTER_API_KEY: logger.error("OpenRouter key missing for generate_summary."); return "Error: AI model configuration key missing."
|
282 |
+
if summary_type == "paragraph": prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n" "• Clear and simple language suitable for someone unfamiliar with the topic.\n" "• Uses British English spellings throughout.\n" "• Straightforward and understandable vocabulary; avoid complex terms.\n" "• Presented as ONE SINGLE PARAGRAPH.\n" "• No more than 85 words maximum; but does not have to be exactly 85.\n" "• Considers the entire text content equally.\n" "• Uses semicolons (;) instead of em dashes (– or —).\n\n" "Here is the text to summarise:")
|
283 |
+
else: prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this Markdown format:\n\n" "• For each distinct topic or section identified in the text, create a heading.\n" "• Each heading MUST be enclosed in double asterisks for bolding (e.g., **Section Title**).\n" "• Immediately following each heading, list the key points as a bulleted list.\n" "• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n" "• The text within each bullet point should NOT contain any bold formatting.\n" "• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n" "• Use British English spellings throughout.\n" "• Avoid overly complex or advanced vocabulary.\n" "• Keep bullet points concise.\n" "• Ensure the entire summary takes no more than two minutes to read.\n" "• Consider the entire text's content, not just the beginning or a few topics.\n" "• Use semicolons (;) instead of em dashes (– or —).\n\n" "Here is the text to summarise:")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
284 |
MAX_INPUT_LENGTH = 500000
|
285 |
+
if len(text) > MAX_INPUT_LENGTH: logger.warning(f"Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH}). Truncating."); text = text[:MAX_INPUT_LENGTH] + "... (Content truncated)"
|
|
|
|
|
286 |
full_prompt = f"{prompt}\n\n{text}"
|
287 |
+
headers = { "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json" }; payload = { "model": OPENROUTER_MODEL, "messages": [{"role": "user", "content": full_prompt}] }; openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
|
|
|
|
|
|
|
|
|
288 |
try:
|
289 |
+
async with httpx.AsyncClient(timeout=60.0) as client:
|
290 |
+
logger.debug(f"Sending request to OpenRouter ({OPENROUTER_MODEL})..."); response = await client.post(openrouter_api_endpoint, headers=headers, json=payload); logger.debug(f"Received status {response.status_code} from OpenRouter.")
|
|
|
|
|
291 |
if response.status_code == 200:
|
292 |
try:
|
293 |
data = response.json()
|
|
|
301 |
else: logger.error(f"Unexpected choices structure in OpenRouter resp: {data.get('choices')}. Full: {data}"); return "Sorry, could not parse AI response (choices)."
|
302 |
except json.JSONDecodeError: logger.error(f"Failed JSON decode OpenRouter. Status:{response.status_code}. Resp:{response.text[:500]}"); return "Sorry, failed to understand AI response."
|
303 |
except Exception as e: logger.error(f"Error processing OpenRouter success response: {e}", exc_info=True); return "Sorry, error processing AI response."
|
|
|
304 |
elif response.status_code == 401: logger.error("OpenRouter API key invalid (401)."); return "Error: AI model configuration key is invalid."
|
305 |
elif response.status_code == 402: logger.error("OpenRouter Payment Required (402)."); return "Sorry, AI service limits/payment issue."
|
306 |
elif response.status_code == 429: logger.warning("OpenRouter Rate Limit Exceeded (429)."); return "Sorry, AI model is busy. Try again."
|
|
|
310 |
except httpx.RequestError as e: logger.error(f"Request error connecting to OpenRouter API: {e}"); return "Sorry, error connecting to AI service."
|
311 |
except Exception as e: logger.error(f"Unexpected error in generate_summary (OpenRouter): {e}", exc_info=True); return "Sorry, unexpected error generating summary."
|
312 |
|
313 |
+
# --- Background Task Processing ---
async def process_summary_task(
    user_id: int, chat_id: int, message_id_to_edit: Optional[int],
    url: str, summary_type: str, bot_token: str
) -> None:
    """Fetch content for ``url``, summarise it as ``summary_type``, and deliver the result.

    Runs as a fire-and-forget asyncio task (see handle_summary_type_callback), so it
    builds its own Bot instance with a private HTTPX request pool instead of reusing
    the webhook handler's. ``message_id_to_edit`` is the button message to reuse for
    progress updates; if editing it fails, a fresh status message is sent instead and
    cleaned up in the ``finally`` block.
    """
    task_id = f"{user_id}-{message_id_to_edit or 'new'}"
    logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
    # Dedicated bot/request objects: this task outlives the webhook request that spawned it.
    background_request: Optional[BaseRequest] = None; bot: Optional[Bot] = None
    try: background_request = HTTPXRequest( connect_timeout=15.0, read_timeout=60.0, write_timeout=60.0, pool_timeout=60.0 ); bot = Bot(token=bot_token, request=background_request)
    except Exception as e: logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True); return
    content = None; user_feedback_message = None; success = False
    # status_message_id: the message currently showing progress.
    # message_to_delete_later_id: a newly-sent status message (used only when editing
    # the original failed); whichever exists gets deleted in `finally`.
    status_message_id = message_id_to_edit; message_to_delete_later_id : Optional[int] = None
    try:
        processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nThis might take a moment..."
        if status_message_id:
            # Prefer editing the existing button message into a progress notice.
            try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None ); logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
            except Exception as e: logger.warning(f"[Task {task_id}] Could not edit original message {status_message_id}: {e}. Sending new."); status_message_id = None
        if not status_message_id:
            # Fall back to sending a brand-new status message.
            try:
                status_message = await retry_bot_operation( bot.send_message, chat_id=chat_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN )
                if status_message: message_to_delete_later_id = status_message.message_id; logger.debug(f"[Task {task_id}] Sent new status message {message_to_delete_later_id}")
                else: raise RuntimeError("Failed to send status message after retries.")
            except Exception as e: logger.error(f"[Task {task_id}] CRITICAL: Failed to send new status message: {e}"); raise
        try:
            await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
            is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}")
            if is_youtube:
                # YouTube path: extract the video id, then fetch a transcript.
                video_id = extract_youtube_id(url)
                if video_id: content = await get_youtube_transcript(video_id, url)
                else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
                if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
            else:
                # Website path: direct scrape first, then the fallback text-extraction API.
                content = await get_website_content(url)
                if not content:
                    logger.warning(f"[Task {task_id}] Primary web scrape failed for {url}. Trying fallback API.")
                    # NOTE(review): assumes URLTOTEXT_API_KEY is still assigned at module
                    # level; the HEAD diff suggests that config line may have been removed,
                    # in which case this read raises NameError — confirm.
                    global URLTOTEXT_API_KEY
                    if URLTOTEXT_API_KEY:
                        await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
                        content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
                        if not content: user_feedback_message = "Sorry, I couldn't fetch content from that website using either method (blocked/inaccessible/empty?)."
                    else: user_feedback_message = "Sorry, I couldn't fetch content from that website (blocked/inaccessible/empty?). The fallback method is not configured."
            if content:
                logger.info(f"[Task {task_id}] Content fetched (len:{len(content)}). Generating summary.")
                await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
                final_summary = await generate_summary(content, summary_type)
                # generate_summary signals failure in-band via "Error:"/"Sorry," prefixes.
                if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"): user_feedback_message = final_summary; logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
                else:
                    # Telegram messages max out at 4096 chars; split and send sequentially,
                    # pausing briefly between parts to be gentle on rate limits.
                    max_length = 4096; summary_parts = [final_summary[i:i+max_length] for i in range(0, len(final_summary), max_length)]
                    await retry_bot_operation( bot.send_message, chat_id=chat_id, text=summary_parts[0], parse_mode=ParseMode.MARKDOWN, link_preview_options={'is_disabled': True} )
                    for part in summary_parts[1:]: await asyncio.sleep(0.5); await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=ParseMode.MARKDOWN, link_preview_options={'is_disabled': True} )
                    success = True; logger.info(f"[Task {task_id}] Successfully sent summary ({len(summary_parts)} parts)."); user_feedback_message = None
            elif user_feedback_message: logger.warning(f"[Task {task_id}] Sending failure feedback: {user_feedback_message}"); await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} )
        except Exception as e:
            # Unexpected mid-processing failure: tell the user, but never re-raise here.
            logger.error(f"[Task {task_id}] Unexpected error during processing: {e}", exc_info=True); user_feedback_message = "Oops! Something went really wrong. Please try again later."
            try: await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message )
            except Exception: logger.error(f"[Task {task_id}] Failed to send unexpected error feedback.")
    except Exception as outer_e:
        # Failures outside the inner block (e.g. the status-message send re-raised).
        logger.critical(f"[Task {task_id}] Critical outer error: {outer_e}", exc_info=True)
        try:
            if bot: await retry_bot_operation( bot.send_message, chat_id=chat_id, text="❌ Critical internal error occurred." )
        except Exception: logger.exception(f"[Task {task_id}] Failed even to send critical error message.")
    finally:
        # Remove whichever progress message is still showing, then close the task's
        # private HTTPX client so pooled connections don't leak.
        delete_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id
        if delete_target_id and bot:
            try: await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=delete_target_id); logger.debug(f"[Task {task_id}] Deleted status/button message {delete_target_id}")
            except Exception as del_e: logger.warning(f"[Task {task_id}] Failed to delete status/button message {delete_target_id}: {del_e}")
        # NOTE(review): reaching into the private `_client` attribute of HTTPXRequest —
        # relies on PTB internals; verify against the installed python-telegram-bot version.
        if background_request and hasattr(background_request, '_client') and background_request._client:
            try: await background_request._client.aclose(); logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.")
            except Exception as e: logger.warning(f"[Task {task_id}] Error closing background bot's client: {e}")
        logger.info(f"[Task {task_id}] Task completed. Success: {success}")
|
383 |
|
384 |
+
# --- Telegram Bot Handlers ---
|
|
|
385 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
386 |
user = update.effective_user; mention = user.mention_html()
|
387 |
if not user or not update.message: return
|
|
|
392 |
user = update.effective_user
|
393 |
if not user or not update.message: return
|
394 |
logger.info(f"User {user.id} used /help.")
|
395 |
+
help_text = ( "🔍 How to use this bot:\n\n" "1. Send me any YouTube video link or website URL.\n" "2. I'll ask you how you want it summarized (paragraph or points).\n" "3. Click the button for your choice.\n" "4. Wait for the summary!\n\n" "I'll try multiple methods to get content if the first one fails (especially for YouTube transcripts).\n\n" "Commands:\n" "`/start` - Display welcome message\n" "`/help` - Show this help message" )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
396 |
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
|
397 |
|
398 |
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """If an incoming text message looks like a URL, stash it and offer summary-style buttons."""
    message = update.message
    if not message or not message.text:
        return
    user = update.effective_user
    if not user:
        return
    url = message.text.strip()
    # Cheap URL sniff: an http(s) scheme plus at least one dot after the scheme prefix.
    looks_like_url = url.startswith(('http://', 'https://')) and '.' in url[8:]
    if not looks_like_url:
        logger.debug(f"Ignoring non-URL from {user.id}: {url}")
        return
    logger.info(f"User {user.id} sent potential URL: {url}")
    # Remember the URL so the button-press callback can retrieve it later.
    context.user_data['url_to_summarize'] = url
    context.user_data['original_message_id'] = message.message_id
    buttons = [
        [
            InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"),
            InlineKeyboardButton("Points Summary", callback_data="points"),
        ]
    ]
    await message.reply_text(
        f"Okay, I see this link:\n{url}\n\nHow would you like it summarized?",
        reply_markup=InlineKeyboardMarkup(buttons),
        disable_web_page_preview=True,
    )
|
|
|
|
|
408 |
|
409 |
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle a summary-type button press and schedule the background summary task.

    Reads the URL stashed in ``context.user_data`` by handle_potential_url, clears
    that context, validates required configuration, and spawns process_summary_task
    so this webhook handler can return immediately.
    """
    query = update.callback_query
    if not query or not query.message or not query.from_user:
        logger.warning("Callback query missing data.")
        return
    user = query.from_user
    summary_type = query.data
    query_id = query.id
    # Acknowledge quickly so the client stops showing a loading spinner.
    try:
        await query.answer()
        logger.debug(f"Ack callback {query_id} from {user.id}")
    except Exception as e:
        logger.error(f"Error answering callback {query_id}: {e}", exc_info=True)
    url = context.user_data.get('url_to_summarize')
    message_id_to_edit = query.message.message_id
    logger.info(f"User {user.id} chose '{summary_type}' for msg {message_id_to_edit}. URL in context: {'Yes' if url else 'No'}")
    if not url:
        # Context can be lost (e.g. after a restart); ask the user to resend the link.
        logger.warning(f"No URL in context for user {user.id} (cb {query_id}).")
        try:
            await query.edit_message_text(text="Sorry, I couldn't find the URL associated with this request. Please send the link again.")
        except Exception as e:
            logger.error(f"Failed edit 'URL not found' msg: {e}")
            try:
                await context.bot.send_message(chat_id=user.id, text="Sorry, context lost. Send link again.")
            except Exception:
                pass  # Ignore if sending fallback also fails
        return
    context.user_data.pop('url_to_summarize', None)
    context.user_data.pop('original_message_id', None)
    logger.debug(f"Cleared URL context for user {user.id}")
    global TELEGRAM_TOKEN, OPENROUTER_API_KEY
    # FIX: the previous one-line forms `if ...: ...; try: ... except Exception: pass; return`
    # were invalid Python (a `try:` compound statement cannot follow a semicolon on the
    # same logical line); expanded into proper multi-line blocks with identical intent.
    if not TELEGRAM_TOKEN:
        logger.critical("TG TOKEN missing!")
        try:
            await query.edit_message_text(text="❌ Bot config error.")
        except Exception:
            pass
        return
    if not OPENROUTER_API_KEY:
        logger.error("OpenRouter key missing!")
        try:
            await query.edit_message_text(text="❌ AI config error.")
        except Exception:
            pass
        return
    logger.info(f"Scheduling task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit}")
    # Fire-and-forget: the task owns its own Bot instance, so we return immediately.
    asyncio.create_task(
        process_summary_task(
            user_id=user.id,
            chat_id=query.message.chat_id,
            message_id_to_edit=message_id_to_edit,
            url=url,
            summary_type=summary_type,
            bot_token=TELEGRAM_TOKEN,
        ),
        name=f"SummaryTask-{user.id}-{message_id_to_edit}",
    )
|
|
|
|
|
437 |
|
|
|
438 |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Log unhandled update-processing exceptions, skipping a known benign case."""
    err = context.error
    # AttributeError with this message is a known, already-handled situation; skip it.
    is_benign = isinstance(err, AttributeError) and "object has no attribute" in str(err)
    if is_benign:
        logger.debug(f"Ignoring known/handled error in error_handler: {err}")
        return
    logger.error("Exception while handling an update:", exc_info=err)
|
442 |
|
443 |
+
# --- Bot Setup ---
async def setup_bot_config() -> Application:
    """Build the PTB Application and register all command/message/callback handlers."""
    global TELEGRAM_TOKEN
    logger.info("Configuring Telegram Application...")
    if not TELEGRAM_TOKEN:
        raise ValueError("TELEGRAM_TOKEN missing.")
    # Generous timeouts for the shared webhook-handling request pool.
    custom_request = HTTPXRequest(
        connect_timeout=10.0,
        read_timeout=30.0,
        write_timeout=30.0,
        pool_timeout=60.0,
    )
    application = Application.builder().token(TELEGRAM_TOKEN).request(custom_request).build()
    application.add_handler(CommandHandler("start", start))
    application.add_handler(CommandHandler("help", help_command))
    # Any non-command text is treated as a potential URL; button presses go to the callback.
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
    application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
    application.add_error_handler(error_handler)
    logger.info("Telegram application handlers configured.")
    return application
|
|
|
|
|
|
|
|
|
452 |
|
453 |
+
# --- ASGI Lifespan ---
|
454 |
@contextlib.asynccontextmanager
|
455 |
async def lifespan(app: Starlette):
|
456 |
+
global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN
|
457 |
+
logger.info("ASGI Lifespan: Startup initiated...");
|
|
|
|
|
458 |
if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.")
|
459 |
try:
|
460 |
+
ptb_app = await setup_bot_config(); await ptb_app.initialize(); bot_info = await ptb_app.bot.get_me(); logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})")
|
|
|
|
|
|
|
|
|
461 |
current_webhook_info = await ptb_app.bot.get_webhook_info()
|
462 |
if current_webhook_info and current_webhook_info.url:
|
463 |
logger.info(f"Found existing webhook: {current_webhook_info.url}. Deleting...")
|
|
|
465 |
if await ptb_app.bot.delete_webhook(drop_pending_updates=True): logger.info("Webhook deleted.")
|
466 |
else: logger.warning("Failed delete webhook (API returned False).")
|
467 |
except Exception as e: logger.warning(f"Could not delete webhook: {e}"); await asyncio.sleep(1)
|
468 |
+
space_host = os.environ.get("SPACE_HOST"); webhook_path = "/webhook"; full_webhook_url = None
|
|
|
469 |
if space_host:
|
470 |
+
protocol = "https://"; host = space_host.split('://')[-1]; full_webhook_url = f"{protocol}{host.rstrip('/')}{webhook_path}"
|
|
|
471 |
if full_webhook_url:
|
472 |
+
logger.info(f"Setting webhook: {full_webhook_url}"); set_webhook_args = { "url": full_webhook_url, "allowed_updates": Update.ALL_TYPES, "drop_pending_updates": True }
|
|
|
473 |
if WEBHOOK_SECRET: set_webhook_args["secret_token"] = WEBHOOK_SECRET; logger.info("Using webhook secret.")
|
474 |
await asyncio.sleep(1.0)
|
475 |
try:
|
476 |
+
await ptb_app.bot.set_webhook(**set_webhook_args); webhook_info = await ptb_app.bot.get_webhook_info()
|
|
|
477 |
if webhook_info.url == full_webhook_url: logger.info(f"Webhook set: URL='{webhook_info.url}', Secret={bool(WEBHOOK_SECRET)}")
|
478 |
else: logger.error(f"Webhook URL mismatch! Expected '{full_webhook_url}', Got '{webhook_info.url}'"); raise RuntimeError("Webhook URL mismatch.")
|
479 |
+
await ptb_app.start(); logger.info("PTB Application started (webhook mode).")
|
|
|
480 |
except Exception as e: logger.error(f"FATAL: Failed set webhook: {e}", exc_info=True); raise RuntimeError(f"Failed set webhook: {e}") from e
|
481 |
else: logger.critical("Could not construct webhook URL."); raise RuntimeError("Webhook URL undetermined.")
|
482 |
else: logger.critical("SPACE_HOST missing."); raise RuntimeError("SPACE_HOST env var missing.")
|
483 |
+
logger.info("ASGI Lifespan: Startup complete."); yield
|
|
|
|
|
484 |
except Exception as startup_err:
|
485 |
logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
|
486 |
if ptb_app:
|
487 |
if ptb_app.running: await ptb_app.stop()
|
488 |
await ptb_app.shutdown()
|
489 |
raise
|
490 |
+
finally:
|
491 |
logger.info("ASGI Lifespan: Shutdown initiated...")
|
492 |
if ptb_app:
|
493 |
if ptb_app.running: logger.info("Stopping PTB..."); await ptb_app.stop()
|
494 |
+
logger.info("Shutting down PTB..."); await ptb_app.shutdown(); logger.info("PTB Application shut down.")
|
|
|
495 |
else: logger.info("PTB application not initialized or failed.")
|
496 |
logger.info("ASGI Lifespan: Shutdown complete.")
|
497 |
|
498 |
+
# --- Starlette Route Handlers ---
|
499 |
async def health_check(request: Request) -> PlainTextResponse:
    """GET / endpoint: report bot status, active model, and Apify configuration."""
    bot_status = "Not Initialized"
    if ptb_app and ptb_app.bot:
        try:
            if ptb_app.running:
                bot_info = await ptb_app.bot.get_me()
                bot_status = f"Running (@{bot_info.username})"
            else:
                bot_status = "Initialized/Not running"
        except Exception as e:
            bot_status = f"Error checking status: {e}"
    # FIX: APIFY_ACTOR_ID / _apify_token_exists appear to have been removed from the
    # module configuration, which would make the old direct references raise NameError
    # and break the health endpoint. Look them up defensively: output is unchanged
    # whenever the names still exist.
    apify_actor = globals().get("APIFY_ACTOR_ID")
    apify_token_exists = globals().get("_apify_token_exists", False)
    return PlainTextResponse(
        f"TG Bot Summarizer - Status: {bot_status}\n"
        f"Model: {OPENROUTER_MODEL}\n"
        f"Apify Actor: {apify_actor if apify_token_exists else 'N/A (No Token)'}"
    )
|
507 |
|
508 |
async def telegram_webhook(request: Request) -> Response:
    """POST /webhook endpoint: verify the secret token, decode the update, dispatch to PTB."""
    global WEBHOOK_SECRET
    if not ptb_app:
        logger.error("Webhook recv but PTB not initialized.")
        return PlainTextResponse('Bot not initialized', status_code=503)
    if not ptb_app.running:
        logger.warning("Webhook recv but PTB not running.")
        return PlainTextResponse('Bot not running', status_code=503)
    try:
        if WEBHOOK_SECRET:
            # Telegram echoes the secret back in this header; reject mismatches.
            token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
            if token_header != WEBHOOK_SECRET:
                logger.warning(f"Webhook invalid secret. Header: '{token_header}'")
                return Response(content="Invalid secret token", status_code=403)
        payload = await request.json()
        update = Update.de_json(data=payload, bot=ptb_app.bot)
        logger.debug(f"Processing update_id: {update.update_id} via webhook")
        await ptb_app.process_update(update)
        return Response(status_code=200)  # OK
    except json.JSONDecodeError:
        logger.error("Webhook invalid JSON.")
        return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
    except Exception as e:
        # Still return 200 so Telegram does not endlessly retry a poison update.
        logger.error(f"Error processing webhook update: {e}", exc_info=True)
        return Response(status_code=200)  # OK despite error
|
520 |
|
521 |
+
# --- Create Starlette ASGI Application ---
# Routes: GET /  -> health_check (status page); POST /webhook -> telegram_webhook.
# The lifespan context manager drives PTB startup/shutdown and webhook registration.
app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ] )
logger.info("Starlette ASGI application created with native routes.")
|
524 |
|
525 |
+
# --- Development Server Block ---
|
526 |
if __name__ == '__main__':
|
527 |
import uvicorn
|
528 |
logger.warning("Running in development mode using Uvicorn directly")
|