Update main.py
main.py
CHANGED
@@ -88,88 +88,76 @@ ptb_app: Optional[Application] = None
# --- Environment Variable Loading & Configuration ---
logger.info("Attempting to load secrets and configuration...")
def get_secret(secret_name):
- # (Function remains the same)
value = os.environ.get(secret_name)
- if value:
- status = "Found"; log_length = min(len(value), 8); value_start = value[:log_length]
- logger.info(f"Secret '{secret_name}': {status} (Value starts with: {value_start}...)")
else: status = "Not Found"; logger.warning(f"Secret '{secret_name}': {status}")
return value

TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
- OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY')
- URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')
- SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
- APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
- GEMINI_API_KEY = get_secret('GEMINI_API_KEY')

- # Models
- OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free")
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
- GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001")

# --- Configuration Checks ---
if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
- if not GEMINI_API_KEY: logger.error("❌ ERROR: GEMINI_API_KEY not found.
- if not OPENROUTER_API_KEY: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found.

_gemini_primary_enabled = _gemini_available and bool(GEMINI_API_KEY)
- if not _gemini_available: logger.warning("⚠️ WARNING: google-generativeai library missing. Gemini disabled.")
- elif not GEMINI_API_KEY: logger.warning("⚠️ WARNING: GEMINI_API_KEY not found or empty. Gemini disabled.")
_openrouter_fallback_enabled = bool(OPENROUTER_API_KEY)
- if not _openrouter_fallback_enabled: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback disabled.")
_crawl4ai_primary_web_enabled = _crawl4ai_available
- if not _crawl4ai_primary_web_enabled: logger.warning("⚠️ WARNING: crawl4ai library missing. Primary Web Scraper disabled.")
_urltotext_fallback_enabled = bool(URLTOTEXT_API_KEY)

logger.info("Secret loading and configuration check finished.")
logger.info(f"Primary Web Scraper: {'Crawl4AI' if _crawl4ai_primary_web_enabled else 'DISABLED'}")
- logger.info(f"Fallback Web Scraper 1: BeautifulSoup
logger.info(f"Fallback Web Scraper 2: urltotext.com API {'ENABLED' if _urltotext_fallback_enabled else 'DISABLED'}")
logger.info(f"Primary Summarizer: Gemini ({GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'})")
logger.info(f"Fallback Summarizer: OpenRouter ({OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'})")
- logger.info(f"Primary YT Transcript: youtube-transcript-api
logger.info(f"Fallback YT Transcript 1: Supadata API {'ENABLED' if SUPADATA_API_KEY else 'DISABLED'}")
logger.info(f"Fallback YT Transcript 2: Apify REST API {'ENABLED' if APIFY_API_TOKEN else 'DISABLED'}")
- _apify_token_exists = bool(APIFY_API_TOKEN) # Keep this for health check

if _gemini_primary_enabled:
- try: genai.configure(api_key=GEMINI_API_KEY); logger.info("Google GenAI client configured
- except Exception as e: logger.error(f"Failed

# --- Constants ---
MAX_SUMMARY_CHUNK_SIZE = 4000
MAX_INPUT_TOKEN_APPROX = 500000

# --- Retry Decorator ---
- @retry(
- stop=stop_after_attempt(4),
- wait=wait_exponential(multiplier=1, min=2, max=15),
- retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)),
- before_sleep=before_sleep_log(logger, logging.WARNING),
- reraise=True
- )
async def retry_bot_operation(func, *args, **kwargs):
try: return await func(*args, **kwargs)
except BadRequest as e:
- if any(err in str(e).lower() for err in
logger.error(f"Potentially critical BadRequest: {e}"); raise
except TelegramError as e:
- if isinstance(e, (TimedOut, NetworkError, RetryAfter)): logger.warning(f"
- else: logger.error(f"Unhandled
raise
- except Exception as e: logger.error(f"Unexpected error

# --- Helper Functions ---
def is_youtube_url(url):
@@ -182,424 +170,320 @@ def extract_youtube_id(url):
else: logger.warning(f"Could not extract YT ID from {url}"); return None

# --- Content Fetching Functions ---
- # (get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript) - No changes
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
- if not video_id: logger.error("[Supadata]
- api_endpoint = "https://api.supadata.ai/v1/youtube/transcript"; params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key}
try:
async with httpx.AsyncClient(timeout=30.0) as client:
- response = await client.get(
if response.status_code == 200:
try:
data = response.json() if response.text else None; content = None
if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
if not content and response.text: content = response.text
if content and isinstance(content, str): logger.info(f"[Supadata] Success: len {len(content)}"); return content.strip()
- else: logger.warning(f"[Supadata] Success but
except Exception as e: logger.error(f"[Supadata] Error processing success: {e}", exc_info=True); return None
- elif response.status_code in [401, 403]: logger.error(f"[Supadata] Auth error ({response.status_code}). Check
elif response.status_code == 404: logger.warning(f"[Supadata] Not found (404)."); return None
- else: logger.error(f"[Supadata] Unexpected status {response.status_code}. Resp:
- except httpx.TimeoutException: logger.error(f"[Supadata] Timeout"); return None
- except httpx.RequestError as e: logger.error(f"[Supadata] Request error: {e}"); return None
except Exception as e: logger.error(f"[Supadata] Unexpected error: {e}", exc_info=True); return None

async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
global APIFY_ACTOR_ID
- if not video_url: logger.error("[Apify
- payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5 }; headers = {"Content-Type": "application/json"}
try:
async with httpx.AsyncClient(timeout=120.0) as client:
- response = await client.post(
if response.status_code == 200:
try:
results = response.json(); content = None
if isinstance(results, list) and len(results) > 0:
- item = results[0]
if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
elif "text" in item and isinstance(item["text"], str): content = item["text"]
- elif response.status_code ==

async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
global SUPADATA_API_KEY, APIFY_API_TOKEN
- if not video_id: logger.error("
- logger.info(f"Fetching transcript for
try:
transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.list_transcripts(video_id).find_generated_transcript(['en', 'en-GB', 'en-US']).fetch )
- if transcript_list:
- else: logger.warning(f"[Primary YT]
- except TranscriptsDisabled: logger.warning(f"[Primary YT]
- logger.info("[Fallback YT 1]
- else: logger.warning("[Fallback YT 2] Apify API token unavailable.")
- if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
- return transcript_text

- # - Website Content Fetching -
- # (get_website_content_via_crawl4ai, fetch_url_content_for_scrape, get_website_content_bs4, get_website_content_via_api) - No changes from previous answer
async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
global _crawl4ai_primary_web_enabled
- if not _crawl4ai_primary_web_enabled: logger.error("[Crawl4AI
- logger.info(f"[Crawl4AI Primary] Attempting crawl: {url}")
try:
- async with AsyncWebCrawler(ignore_robots=True) as crawler:
result = await crawler.arun(url=url, crawler_strategy="playwright", timeout=90)
if result and result.markdown: content = result.markdown.strip()
- elif result and result.text: content = result.text.strip()
- except asyncio.TimeoutError: logger.error(f"[Crawl4AI Primary] Timeout"); return None
- except PermissionError as e: logger.error(f"[Crawl4AI Primary] Permission denied. Path: '{e.filename}'. Error: {e}", exc_info=True); return None # Should not happen now
- except Exception as e: logger.error(f"[Crawl4AI Primary] Unexpected error: {e}", exc_info=True); return None

async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
- headers = { 'User-Agent': '
try:
- async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as
- except Exception as e: logger.error(f"[Web Scrape BS4] Error reading response text: {e}"); return None
- except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape BS4] HTTP error {e.response.status_code}: {e}"); return None
- except httpx.TimeoutException: logger.error(f"[Web Scrape BS4] Timeout"); return None
- except httpx.TooManyRedirects: logger.error(f"[Web Scrape BS4] Too many redirects"); return None
- except httpx.RequestError as e: logger.error(f"[Web Scrape BS4] Request error: {e}"); return None
- except Exception as e: logger.error(f"[Web Scrape BS4] Unexpected error: {e}", exc_info=True); return None

async def get_website_content_bs4(url: str) -> Optional[str]:
- if not url: logger.error("[BS4
html_content = await fetch_url_content_for_scrape(url)
- if not html_content: logger.warning(f"[BS4
try:
def parse_html(content):
- soup = BeautifulSoup(content, DEFAULT_PARSER)
- selectors = ['main', 'article',
- if not target_element: logger.warning(f"[BS4 Fallback] No body/main found"); return None
- lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
- text = re.sub(r'\s{2,}', ' ', " ".join(lines)).strip()
- if not text: logger.warning(f"[BS4 Fallback] Extracted text empty"); return None
return text
- except Exception as e: logger.error(f"[BS4

async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
- if not url: logger.error("[API
- api_endpoint = "https://urltotext.com/api/v1/urltotext/"; payload = { "url": url, ... }; headers = { "Authorization": f"Token {api_key}", ... } # Keep details
try:
- async with httpx.AsyncClient(timeout=45.0) as
- except httpx.TimeoutException: logger.error(f"[API Fallback] Timeout"); return None
- except httpx.RequestError as e: logger.error(f"[API Fallback] Request error: {e}"); return None
- except Exception as e: logger.error(f"[API Fallback] Unexpected error: {e}", exc_info=True); return None

# --- Summarization Functions ---
- # (_call_gemini, _call_openrouter, generate_summary) - Only _call_gemini changed
async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
global GEMINI_MODEL, _gemini_primary_enabled
- if not _gemini_primary_enabled: logger.error("[Gemini
- if len(text) > MAX_INPUT_TOKEN_APPROX: logger.warning(f"[Gemini
- # *** MODIFIED: Disable safety settings ***
- safety_settings = { category: HarmBlockThreshold.BLOCK_NONE for category in HarmCategory }
- logger.info("[Gemini Primary] Safety settings disabled (BLOCK_NONE).")
- generation_config = genai.types.GenerationConfig( max_output_tokens=2048, temperature=0.7, )
try:
- model = genai.GenerativeModel(GEMINI_MODEL)
- # *** MODIFIED: Convert to string BEFORE upper() ***
- finish_reason_str = str(finish_reason_val).upper() if finish_reason_val is not None else "UNSPECIFIED"
- logger.debug(f"[Gemini Primary] Finish reason value: {finish_reason_val} -> {finish_reason_str}")
- candidate_safety_ratings_str = "N/A"
- if hasattr(candidate, 'safety_ratings') and candidate.safety_ratings:
- candidate_safety_ratings_str = ', '.join([f"{r.category.name}: {r.probability.name}" for r in candidate.safety_ratings])
- # *** MODIFIED: Correctly check success/failure reasons ***
- success_reasons = ["STOP", "MAX_TOKENS"]
- # Check if the *string representation* ends with a success reason
- is_success = any(finish_reason_str.endswith(reason) for reason in success_reasons)
- if is_success:
- logger.info(f"[Gemini Primary] Generation finished acceptably. Reason: {finish_reason_str}")
- if finish_reason_str.endswith("MAX_TOKENS"): logger.warning("[Gemini Primary] Output may be truncated.")
- else: # Treat everything else (SAFETY, RECITATION, OTHER, UNSPECIFIED) as failure
- error_msg = f"Error: Gemini generation failed or finished unexpectedly. Reason: {finish_reason_str}. Safety: {candidate_safety_ratings_str}"
- logger.error(f"[Gemini Primary] {error_msg}")
- return None, error_msg # Return specific error
- # 3. Extract text if finish reason was acceptable
- summary_text = ""; text_extracted = False
- try:
- summary_text = response.text.strip(); text_extracted = True
- except Exception as e: # Broader catch if response.text fails unexpectedly
- logger.warning(f"[Gemini Primary] Error accessing response.text: {e}. Trying parts.")
- if candidate.content and candidate.content.parts:
- summary_text = "".join(part.text for part in candidate.content.parts if hasattr(part, "text")).strip()
- text_extracted = True
- if not text_extracted or not summary_text:
- logger.warning(f"[Gemini Primary] Empty summary despite finish reason '{finish_reason_str}'.")
- return None, "Error: AI generated an empty summary."
- logger.info(f"[Gemini Primary] Summary extracted successfully (len: {len(summary_text)}).")
- return summary_text, None
- except AttributeError as e: logger.error(f"[Gemini Primary] Attribute error parsing response: {e}.", exc_info=True); return None, f"Error: Failed to parse Gemini response ({e})."
- except Exception as e: logger.error(f"[Gemini Primary] Error during API call: {e}", exc_info=True); return None, f"Error: Failed communication with Gemini ({e})."

async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
- # ... (Keep existing implementation) ...
global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
- if not _openrouter_fallback_enabled: logger.error("[
- headers = { "Authorization": f"Bearer {OPENROUTER_API_KEY}", ... }; payload = { "model": OPENROUTER_MODEL, ... }; api_url = "https://openrouter.ai/api/v1/chat/completions"
try:
- async with httpx.AsyncClient(timeout=120.0) as
- else: logger.error(f"[OpenRouter Fallback] Invalid structure (choices). Data: {data}"); api_error = data.get("error", {}).get("message", "Unknown"); return None, f"Error: Fallback AI no summary. API msg: {api_error}"
- except Exception as e: logger.error(f"[OpenRouter Fallback] Error processing success: {e}", exc_info=True); return None, f"Error: Failed processing Fallback AI response ({e})."
- else: error_message = f"Error: Fallback AI ({OPENROUTER_MODEL}) status {response.status_code}."; try: error_details = response.json().get("error", {}).get("message", response.text[:200]); error_message += f" Details: {error_details}"; except Exception: error_message += f" Response: {response.text[:200]}"; logger.error(f"[OpenRouter Fallback] {error_message}"); return None, error_message
- except Exception as e: logger.error(f"[OpenRouter Fallback] Unexpected error: {e}", exc_info=True); return None, f"Error: Unexpected issue with Fallback AI ({e})."

async def generate_summary(text: str, summary_type: str) -> str:
- # ... (Keep existing implementation) ...
global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
- logger.info(f"[Summary
- logger.error("[Summary Generation] Reached end unexpectedly."); final_error = error_message or "Unknown error."; return f"Sorry, error occurred: {final_error}"

# --- Main Processing Task ---
- # (process_summary_task) - No changes needed from previous answer
async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
- task_id = f"{user_id}-{message_id_to_edit or 'new'}"
- try: background_request = HTTPXRequest(...); bot = Bot(token=bot_token, request=background_request) # Keep bot init
- except Exception as e: logger.critical(f"[Task {task_id}] Failed background bot init: {e}", exc_info=True); return
- try:
# Edit original msg to "Processing..."
- try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=
- except Exception as e: logger.warning(f"[Task {task_id}]
try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
except Exception: pass
- # --- Generate Summary ---
- if content and not user_feedback_message:
- logger.info(f"[Task {task_id}] Content fetched (len:{len(content)}). Generating summary..."); try: await retry_bot_operation(bot.send_chat_action, ...); except Exception: pass
final_summary = await generate_summary(content, summary_type)
- if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
else: # Success - Split & Send
for line in lines:
- logger.info(f"[Task {task_id}] Summary
- for i, part in enumerate(summary_parts[1:], start=2):
- await asyncio.sleep(0.5);
- try: await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, ...); logger.debug(f"[Task {task_id}] Sent part {i}/{len(summary_parts)}.")
- except Exception as part_err: user_feedback_message = f"Failed send part {i}."; success = False; logger.error(...); break
- if not user_feedback_message: success = True # Confirm overall success
- # --- Handle Failures ---
if not success:
- logger.warning(f"[Task {task_id}] Sending failure feedback: {
try: # Edit original msg with error
- logger.error(f"[Task {task_id}] Unexpected error in task: {e}", exc_info=True)
- success = False; user_feedback_message = "Oops! Unexpected error..."
- if bot: # Try sending final crash error
- try: edited_crash = False;
- if status_message_id: try: await retry_bot_operation( bot.edit_message_text, ...); edited_crash=True; except Exception: pass
- if not edited_crash: await retry_bot_operation( bot.send_message, ... )
- except Exception as final_err: logger.error(f"[Task {task_id}] Failed final crash error feedback: {final_err}")
finally: # Cleanup
- except Exception as close_err: logger.warning(f"[Task {task_id}] Error closing BG client: {close_err}")
- logger.info(f"[Task {task_id}] Task completed. Success: {success}")

# --- Telegram Handlers ---
- # (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler) - Only handle_summary_type_callback changed
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
user = update.effective_user; mention = user.mention_html()
if not user or not update.message: return
- logger.info(f"User {user.id} ({user.username or 'N/A'})
- await update.message.reply_html( f"👋

async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
user = update.effective_user
if not user or not update.message: return
- logger.info(f"User {user.id} ({user.username or 'N/A'})
- help_text = ( "🔍 **How:**\n1. Send link.\n2. Choose
"⚙️ **Tech:**\n"
"`/start`, `/help`" )
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
@@ -607,144 +491,109 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYP
if not update.message or not update.message.text: return
url = update.message.text.strip(); user = update.effective_user;
if not user: return
- if not url_pattern.match(url): logger.debug(...); await update.message.reply_text("Invalid URL..."); return
logger.info(f"User {user.id} sent URL: {url}")
context.user_data['url_to_summarize'] = url; context.user_data['original_message_id'] = update.message.message_id
- await update.message.reply_html( f"Link:\n<code>{escaped_url}</code>\n\nSummary type?", reply_markup=reply_markup, disable_web_page_preview=True )

async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
query = update.callback_query
if not query or not query.message or not query.from_user: logger.warning("Callback missing data."); return
- user = query.from_user; summary_type = query.data;
- try: await query.answer(); logger.debug(f"Ack
- logger.info(f"User {user.id} chose '{summary_type}' for msg {message_id_to_edit}. URL context: {'Yes' if url else 'No'}")
- if not url:
- logger.warning(f"No URL in context for user {user.id} (cb {query_id}).")
- try: await query.edit_message_text(text="Context lost. Please send URL again.", reply_markup=None)
- except Exception as e: logger.error(f"Failed edit 'URL not found': {e}")
- return

global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
- # *** MODIFIED: Expanded try/except blocks ***
if not TELEGRAM_TOKEN:
logger.critical("TG TOKEN missing!")
- try:
- logger.error(f"Failed to edit message for TOKEN error: {e}")
- return # Stop execution
if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
logger.critical("No AI models available!")
- try:
- logger.error(f"Failed to edit message for AI config error: {e}")
- return # Stop execution
- # Log warnings if only one AI is available
if not _gemini_primary_enabled: logger.warning("Primary AI unavailable, using fallback.")
elif not _openrouter_fallback_enabled: logger.warning("Fallback AI unavailable, using primary.")

- logger.info(f"Scheduling task
- asyncio.ensure_future( process_summary_task( user_id=user.id, chat_id=query.message.chat_id, message_id_to_edit=
- context.user_data.pop('url_to_summarize', None)
- logger.debug(f"Cleared context for user {user.id} after scheduling.")

async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
ignore_errors = (AttributeError, BadRequest, TimedOut, NetworkError, RetryAfter)
- if isinstance(context.error, ignore_errors):
- ignore_messages = ["message is not modified",
err_str = str(context.error).lower()
if any(msg in err_str for msg in ignore_messages) or isinstance(context.error, (TimedOut, NetworkError, RetryAfter)):
- logger.warning(f"Ignoring known/transient error: {context.error}")
- logger.error("Exception while handling update:", exc_info=context.error)

# --- Application Setup ---
- # (setup_bot_config) - No changes
async def setup_bot_config() -> Application:
- logger.info("Configuring Telegram
- if not TELEGRAM_TOKEN: raise ValueError("
- application.add_handler(CommandHandler("start", start))
- application.add_handler(CommandHandler("help", help_command))
url_filter = filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link") | filters.Regex(r'https?://[^\s]+'))
- application.add_error_handler(error_handler)
- logger.info("Telegram application handlers configured."); return application

- # --- ASGI Lifespan & Routes ---
- # (lifespan, health_check, telegram_webhook) - No changes
@contextlib.asynccontextmanager
async def lifespan(app: Starlette):
- global ptb_app,
- if not TELEGRAM_TOKEN: raise RuntimeError("
try:
- ptb_app = await setup_bot_config(); await ptb_app.initialize()
- logger.info(f"Webhook set & verified: URL='{webhook_info.url}', Secret={'YES' if WEBHOOK_SECRET else 'NO'}"); if webhook_info.last_error_message: logger.warning(...)
- await ptb_app.start(); logger.info("PTB started (webhook mode).")
- except Exception as e: logger.error(f"FATAL: Webhook setup error: {e}", exc_info=True); raise RuntimeError(f"Webhook setup failed: {e}") from e
- elif not webhook_delete_success: raise RuntimeError("Failed to delete previous webhook.")
- logger.info("ASGI Lifespan: Startup complete."); yield
- except Exception as startup_err: logger.critical(f"Startup failed: {startup_err}", exc_info=True); raise
finally: # Shutdown
if ptb_app: try: if ptb_app.running: await ptb_app.stop(); if ptb_app._initialized: await ptb_app.shutdown(); logger.info("PTB stopped/shutdown.") except Exception as e: logger.error(...)

async def health_check(request: Request) -> PlainTextResponse:
- global OPENROUTER_MODEL, ...; bot_status = "Not Initialized"; bot_username = "N/A" # Keep
if ptb_app and ptb_app.bot and ptb_app._initialized:
- try: wh_info = await ptb_app.bot.get_webhook_info() # Keep
- elif ptb_app.running: bot_status = f"Running (Webhook Status: {wh_info.url if wh_info else 'N/A'}...)"
- else: bot_status = "Initialized/Not running"
- except Exception as e: logger.error(f"Health check status error: {e}", exc_info=True); bot_status = f"Error checking status: {e}"
elif ptb_app: bot_status = "Initializing..."
- return PlainTextResponse("\n".join(

async def telegram_webhook(request: Request) -> Response:
- global WEBHOOK_SECRET, ptb_app
- if not ptb_app or not ptb_app._initialized or not ptb_app.running: status = ...; logger.error(...); return PlainTextResponse(f'Bot {status}', 503)
- if WEBHOOK_SECRET:
- try:
except json.JSONDecodeError: logger.error(...); return PlainTextResponse('Bad Request', 400)
- except Exception as e: logger.error(f"Webhook

# --- ASGI App Definition ---
app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", health_check), Route("/webhook", telegram_webhook, methods=["POST"]), ] )
@@ -753,9 +602,9 @@ logger.info("Starlette ASGI application created.")
# --- Development Runner ---
if __name__ == '__main__':
import uvicorn; logger.warning("Running DEV mode...") # Keep dev runner
- log_level =
try: from dotenv import load_dotenv; load_dotenv(); logger.info(".env loaded.")
except ImportError: logger.info(".env not loaded.")
- if not get_secret('TELEGRAM_TOKEN'): logger.critical("Local Dev:
if not get_secret('GEMINI_API_KEY'): logger.error("Local Dev: GEMINI_API_KEY missing.")
- uvicorn.run( "main:app", host='0.0.0.0', port=
88 |
# --- Environment Variable Loading & Configuration ---
|
89 |
logger.info("Attempting to load secrets and configuration...")
|
90 |
def get_secret(secret_name):
|
|
|
91 |
value = os.environ.get(secret_name)
|
92 |
+
if value: status = "Found"; log_length = min(len(value), 8); value_start = value[:log_length]; logger.info(f"Secret '{secret_name}': {status} (Value starts with: {value_start}...)")
|
|
|
|
|
93 |
else: status = "Not Found"; logger.warning(f"Secret '{secret_name}': {status}")
|
94 |
return value
|
95 |
|
96 |
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
|
97 |
+
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY')
|
98 |
+
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')
|
99 |
+
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
|
100 |
+
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
|
101 |
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
|
102 |
+
GEMINI_API_KEY = get_secret('GEMINI_API_KEY')
|
103 |
|
104 |
+
# Models
|
105 |
+
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free")
|
106 |
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
|
107 |
+
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001")
|
108 |
|
109 |
# --- Configuration Checks ---
|
110 |
if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
|
111 |
+
if not GEMINI_API_KEY: logger.error("❌ ERROR: GEMINI_API_KEY not found.")
|
112 |
+
if not OPENROUTER_API_KEY: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found.")
|
113 |
|
114 |
_gemini_primary_enabled = _gemini_available and bool(GEMINI_API_KEY)
|
|
|
|
|
|
|
115 |
_openrouter_fallback_enabled = bool(OPENROUTER_API_KEY)
|
|
|
|
|
116 |
_crawl4ai_primary_web_enabled = _crawl4ai_available
|
|
|
|
|
117 |
_urltotext_fallback_enabled = bool(URLTOTEXT_API_KEY)
|
118 |
+
_apify_token_exists = bool(APIFY_API_TOKEN)
|
119 |
+
|
120 |
+
if not _gemini_available: logger.warning("⚠️ WARNING: google-generativeai missing.")
|
121 |
+
elif not GEMINI_API_KEY: logger.warning("⚠️ WARNING: GEMINI_API_KEY missing.")
|
122 |
+
if not _crawl4ai_available: logger.warning("⚠️ WARNING: crawl4ai missing.")
|
123 |
|
124 |
+
if _urltotext_fallback_enabled: logger.info("ℹ️ INFO: urltotext.com API ENABLED.")
|
125 |
+
else: logger.info("ℹ️ INFO: urltotext.com API disabled.")
|
126 |
+
if not SUPADATA_API_KEY: logger.info("ℹ️ INFO: Supadata API disabled.")
|
127 |
+
if not APIFY_API_TOKEN: logger.info("ℹ️ INFO: Apify API disabled.")
|
128 |
+
if not WEBHOOK_SECRET: logger.info("ℹ️ INFO: Webhook security disabled.")
|
129 |
|
130 |
logger.info("Secret loading and configuration check finished.")
|
131 |
logger.info(f"Primary Web Scraper: {'Crawl4AI' if _crawl4ai_primary_web_enabled else 'DISABLED'}")
|
132 |
+
logger.info(f"Fallback Web Scraper 1: BeautifulSoup")
|
133 |
logger.info(f"Fallback Web Scraper 2: urltotext.com API {'ENABLED' if _urltotext_fallback_enabled else 'DISABLED'}")
|
134 |
logger.info(f"Primary Summarizer: Gemini ({GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'})")
|
135 |
logger.info(f"Fallback Summarizer: OpenRouter ({OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'})")
|
136 |
+
logger.info(f"Primary YT Transcript: youtube-transcript-api")
|
137 |
logger.info(f"Fallback YT Transcript 1: Supadata API {'ENABLED' if SUPADATA_API_KEY else 'DISABLED'}")
|
138 |
logger.info(f"Fallback YT Transcript 2: Apify REST API {'ENABLED' if APIFY_API_TOKEN else 'DISABLED'}")
|
|
|
139 |
|
140 |
if _gemini_primary_enabled:
|
141 |
+
try: genai.configure(api_key=GEMINI_API_KEY); logger.info("Google GenAI client configured.")
|
142 |
+
except Exception as e: logger.error(f"Failed config Google GenAI: {e}"); _gemini_primary_enabled = False
|
143 |
|
144 |
# --- Constants ---
|
145 |
MAX_SUMMARY_CHUNK_SIZE = 4000
|
146 |
MAX_INPUT_TOKEN_APPROX = 500000
|
147 |
|
148 |
# --- Retry Decorator ---
|
149 |
+
@retry( stop=stop_after_attempt(4), wait=wait_exponential(multiplier=1, min=2, max=15), retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)), before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True )
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
async def retry_bot_operation(func, *args, **kwargs):
|
151 |
try: return await func(*args, **kwargs)
|
152 |
except BadRequest as e:
|
153 |
+
ignore = ["message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user"]
|
154 |
+
if any(err in str(e).lower() for err in ignore): logger.warning(f"Ignoring non-critical BadRequest: {e}"); return None
|
155 |
logger.error(f"Potentially critical BadRequest: {e}"); raise
|
156 |
except TelegramError as e:
|
157 |
+
if isinstance(e, (TimedOut, NetworkError, RetryAfter)): logger.warning(f"TG transient error (retry): {e}")
|
158 |
+
else: logger.error(f"Unhandled TG Error: {e}")
|
159 |
raise
|
160 |
+
except Exception as e: logger.error(f"Unexpected error in bot op: {e}", exc_info=True); raise
|
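A minimal usage sketch for the tenacity-wrapped helper above (illustrative only; the bot instance and chat_id come from the surrounding task code): every Telegram call is routed through retry_bot_operation, so transient NetworkError/TimedOut/RetryAfter failures are retried up to 4 times with exponential backoff before the exception is re-raised.

async def notify_user(bot, chat_id: int) -> None:
    # Transient Telegram errors are retried by the @retry decorator above;
    # critical BadRequest and unexpected errors are logged and re-raised.
    await retry_bot_operation(bot.send_message, chat_id=chat_id, text="Summary ready.")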
161 |
|
162 |
# --- Helper Functions ---
|
163 |
def is_youtube_url(url):
|
|
|
170 |
else: logger.warning(f"Could not extract YT ID from {url}"); return None
|
171 |
|
172 |
# --- Content Fetching Functions ---
|
|
|
173 |
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
|
174 |
+
if not video_id or not api_key: logger.error("[Supadata] Missing video_id or API key"); return None
|
175 |
+
logger.info(f"[Supadata] Fetching: {video_id}")
|
176 |
+
api = "https://api.supadata.ai/v1/youtube/transcript"; params={"v": video_id, "f": "text"}; headers={"X-API-Key": api_key} # Adjusted params slightly based on potential common patterns, check Supadata docs if needed
|
|
|
177 |
try:
|
178 |
async with httpx.AsyncClient(timeout=30.0) as client:
|
179 |
+
response = await client.get(api, headers=headers, params=params)
|
180 |
+
logger.debug(f"[Supadata] Status: {response.status_code}")
|
181 |
if response.status_code == 200:
|
182 |
try:
|
183 |
data = response.json() if response.text else None; content = None
|
184 |
if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
|
185 |
if not content and response.text: content = response.text
|
186 |
if content and isinstance(content, str): logger.info(f"[Supadata] Success: len {len(content)}"); return content.strip()
|
187 |
+
else: logger.warning(f"[Supadata] Success but empty/invalid content. Resp:{response.text[:200]}"); return None
|
188 |
except Exception as e: logger.error(f"[Supadata] Error processing success: {e}", exc_info=True); return None
|
189 |
+
elif response.status_code in [401, 403]: logger.error(f"[Supadata] Auth error ({response.status_code}). Check key."); return None
|
190 |
elif response.status_code == 404: logger.warning(f"[Supadata] Not found (404)."); return None
|
191 |
+
else: logger.error(f"[Supadata] Unexpected status {response.status_code}. Resp:{response.text[:200]}"); return None
|
|
|
|
|
192 |
except Exception as e: logger.error(f"[Supadata] Unexpected error: {e}", exc_info=True); return None
|
193 |
|
194 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
|
195 |
global APIFY_ACTOR_ID
|
196 |
+
if not video_url or not api_token: logger.error("[Apify] Missing URL or token"); return None
|
197 |
+
logger.info(f"[Apify] Fetching: {video_url} (Actor: {APIFY_ACTOR_ID})")
|
198 |
+
api = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"; params = {"token": api_token}
|
199 |
+
payload = { "urls": [video_url], "outputFormat": "singleStringText", ... }; headers = {"Content-Type": "application/json"} # Keep payload
|
|
|
200 |
try:
|
201 |
async with httpx.AsyncClient(timeout=120.0) as client:
|
202 |
+
logger.debug(f"[Apify] POST Request...")
|
203 |
+
response = await client.post(api, headers=headers, params=params, json=payload)
|
204 |
+
logger.debug(f"[Apify] Status: {response.status_code}")
|
205 |
if response.status_code == 200:
|
206 |
try:
|
207 |
results = response.json(); content = None
|
208 |
if isinstance(results, list) and len(results) > 0:
|
209 |
+
item = results[0] # Parse item logic remains
|
210 |
if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
|
211 |
elif "text" in item and isinstance(item["text"], str): content = item["text"]
|
212 |
+
# ... other parsing options
|
213 |
+
if content and isinstance(content, str): logger.info(f"[Apify] Success: len {len(content)}"); return content.strip()
|
214 |
+
else: logger.warning(f"[Apify] Parsed but empty/invalid content."); return None
|
215 |
+
else: logger.warning(f"[Apify] Success but empty dataset."); return None
|
216 |
+
except Exception as e: logger.error(f"[Apify] Error processing success: {e}", exc_info=True); return None
|
217 |
+
elif response.status_code == 400: logger.error(f"[Apify] Bad Request (400). Resp:{response.text[:200]}"); return None
|
218 |
+
elif response.status_code == 401: logger.error("[Apify] Auth error (401). Check token."); return None
|
219 |
+
# *** MODIFIED: Expanded elif block for 404 ***
|
220 |
+
elif response.status_code == 404:
|
221 |
+
error_info = "Unknown 404 Error" # Default message
|
222 |
+
try:
|
223 |
+
# Try to get more specific error from JSON response
|
224 |
+
error_data = response.json()
|
225 |
+
error_info = error_data.get("error", {}).get("message", "No specific message in error object")
|
226 |
+
except Exception:
|
227 |
+
# If JSON parsing fails or structure is unexpected, use raw text
|
228 |
+
error_info = response.text[:200] # Use beginning of response text
|
229 |
+
logger.error(f"[Apify] Not Found (404). Error: '{error_info}'")
|
230 |
+
return None # Return None after logging
|
231 |
+
else: logger.error(f"[Apify] Unexpected status {response.status_code}. Resp:{response.text[:200]}"); return None
|
232 |
+
except Exception as e: logger.error(f"[Apify] Unexpected error: {e}", exc_info=True); return None
|
233 |
|
234 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
|
235 |
global SUPADATA_API_KEY, APIFY_API_TOKEN
|
236 |
+
if not video_id: logger.error("YT transcript: No video_id"); return None
|
237 |
+
logger.info(f"Fetching YT transcript for: {video_id}")
|
238 |
+
transcript = None
|
239 |
+
logger.info("[Primary YT] Trying youtube-transcript-api...")
|
240 |
try:
|
241 |
transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.list_transcripts(video_id).find_generated_transcript(['en', 'en-GB', 'en-US']).fetch )
|
242 |
+
if transcript_list: transcript = " ".join([i['text'] for i in transcript_list if 'text' in i]).strip()
|
243 |
+
if transcript: logger.info(f"[Primary YT] Success: len {len(transcript)}"); return transcript
|
244 |
+
else: logger.warning(f"[Primary YT] Empty transcript list/text"); transcript = None
|
245 |
+
except (TranscriptsDisabled, NoTranscriptFound) as e: logger.warning(f"[Primary YT] Known issue: {type(e).__name__}"); transcript = None
|
246 |
+
except Exception as e: logger.warning(f"[Primary YT] Error: {e}"); transcript = None
|
247 |
+
|
248 |
+
if transcript is None and SUPADATA_API_KEY:
|
249 |
+
logger.info("[Fallback YT 1] Trying Supadata..."); transcript = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
|
250 |
+
if transcript: logger.info("[Fallback YT 1] Success via Supadata."); return transcript
|
251 |
+
else: logger.warning("[Fallback YT 1] Supadata failed.")
|
252 |
+
elif transcript is None: logger.warning("[Fallback YT 1] Supadata key missing.")
|
253 |
+
|
254 |
+
if transcript is None and APIFY_API_TOKEN:
|
255 |
+
logger.info("[Fallback YT 2] Trying Apify..."); transcript = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
|
256 |
+
if transcript: logger.info("[Fallback YT 2] Success via Apify."); return transcript
|
257 |
+
else: logger.warning("[Fallback YT 2] Apify failed.")
|
258 |
+
elif transcript is None: logger.warning("[Fallback YT 2] Apify token missing.")
|
259 |
+
|
260 |
+
if transcript is None: logger.error(f"All YT methods failed: {video_id}"); return None
|
261 |
+
return transcript # Should already be stripped if successful
|
262 |
+
|
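An illustrative driver for the transcript fallback chain above (the wrapper name is hypothetical; the helpers it calls are the ones defined in main.py): get_youtube_transcript tries youtube-transcript-api first, then Supadata, then Apify, and returns None only if all three fail.

async def demo_transcript(url: str) -> None:
    vid = extract_youtube_id(url)  # helper defined earlier in main.py
    if not vid:
        return
    text = await get_youtube_transcript(vid, url)
    logger.info(f"Transcript length: {len(text) if text else 0}")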
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
|
264 |
global _crawl4ai_primary_web_enabled
|
265 |
+
if not _crawl4ai_primary_web_enabled or not url: logger.error(f"[Crawl4AI] Disabled or no URL."); return None
|
266 |
+
logger.info(f"[Crawl4AI] Crawling: {url}")
|
|
|
267 |
try:
|
268 |
+
async with AsyncWebCrawler(ignore_robots=True) as crawler:
|
269 |
+
logger.info(f"[Crawl4AI] Initialized (ignore_robots=True).")
|
270 |
result = await crawler.arun(url=url, crawler_strategy="playwright", timeout=90)
|
271 |
+
content = None
|
272 |
if result and result.markdown: content = result.markdown.strip()
|
273 |
+
elif result and result.text: content = result.text.strip()
|
274 |
+
if content: logger.info(f"[Crawl4AI] Success: len {len(content)}"); return content
|
275 |
+
else: logger.warning(f"[Crawl4AI] Crawl success but empty content."); return None
|
276 |
+
except Exception as e: logger.error(f"[Crawl4AI] Error: {e}", exc_info=True); return None
|
|
|
|
|
|
|
277 |
|
278 |
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
|
279 |
+
headers = { 'User-Agent': 'Mozilla/5.0 ...', ... } # Keep headers
|
280 |
try:
|
281 |
+
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as c:
|
282 |
+
r = await c.get(url); logger.debug(f"[BS4 Fetch] Status {r.status_code}"); r.raise_for_status()
|
283 |
+
ct = r.headers.get('content-type','').lower()
|
284 |
+
if 'html' not in ct: logger.warning(f"[BS4 Fetch] Non-HTML: {ct}"); return None
|
285 |
+
try: return r.text
|
286 |
+
except Exception as e: logger.error(f"[BS4 Fetch] Read error: {e}"); return None
|
287 |
+
except Exception as e: logger.error(f"[BS4 Fetch] Error: {e}", exc_info=True); return None
|
|
|
|
|
|
|
|
|
|
|
|
|
288 |
|
289 |
async def get_website_content_bs4(url: str) -> Optional[str]:
|
290 |
+
if not url: logger.error("[BS4] No URL"); return None
|
291 |
+
logger.info(f"[BS4] Fetching & parsing: {url}")
|
292 |
html_content = await fetch_url_content_for_scrape(url)
|
293 |
+
if not html_content: logger.warning(f"[BS4] Fetch failed"); return None
|
294 |
try:
|
295 |
def parse_html(content):
|
296 |
+
soup = BeautifulSoup(content, DEFAULT_PARSER) # Parse logic remains
|
297 |
+
for el in soup(["script", "style", ...]): el.extract()
|
298 |
+
selectors = ['main', 'article', ...]; target = None
|
299 |
+
for sel in selectors: try: target=soup.select_one(sel); except Exception: continue; if target: break
|
300 |
+
if not target: target = soup.body
|
301 |
+
if not target: logger.warning("[BS4] No body/main"); return None
|
302 |
+
text = re.sub(r'\s{2,}', ' ', " ".join(l.strip() for l in target.get_text('\n', strip=True).splitlines() if l.strip())).strip()
|
303 |
+
if not text: logger.warning("[BS4] Empty text"); return None
|
|
|
|
|
|
|
|
|
304 |
return text
|
305 |
+
text = await asyncio.to_thread(parse_html, html_content)
|
306 |
+
if text: logger.info(f"[BS4] Success: len {len(text)}"); return text
|
307 |
+
else: logger.warning("[BS4] Parsing empty"); return None
|
308 |
+
except Exception as e: logger.error(f"[BS4] Parse error: {e}", exc_info=True); return None
|
309 |
|
310 |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
311 |
+
if not url or not api_key: logger.error("[API] Missing URL or key"); return None
|
312 |
+
logger.info(f"[API] Fetching via urltotext.com: {url}")
|
313 |
+
api = "https://urltotext.com/api/v1/urltotext/"; payload={...}; headers={...} # Keep details
|
|
|
314 |
try:
|
315 |
+
async with httpx.AsyncClient(timeout=45.0) as c:
|
316 |
+
logger.debug("[API] Sending request...")
|
317 |
+
r = await c.post(api, headers=headers, json=payload); logger.debug(f"[API] Status {r.status_code}")
|
318 |
+
if r.status_code == 200:
|
319 |
+
try: # Parse success response
|
320 |
+
data=r.json(); content=data.get("data",{}).get("content"); credits=...; warning=...
|
321 |
+
if warning: logger.warning(f"[API] Warning: {warning}")
|
322 |
+
if content: logger.info(f"[API] Success: len {len(content)}, Credits: {credits}"); return content.strip()
|
323 |
+
else: logger.warning(f"[API] Success but empty content. Resp:{data}"); return None
|
324 |
+
except Exception as e: logger.error(f"[API] Error processing success: {e}", exc_info=True); return None
|
325 |
+
elif r.status_code == 402: logger.error(f"[API] Error 402 (Credits?). Resp:{r.text[:200]}"); return None
|
326 |
+
elif r.status_code == 400 and "url" in r.text.lower(): logger.error(f"[API] Error 400 (Bad URL?). Resp:{r.text[:200]}"); return None
|
327 |
+
# ... other error codes
|
328 |
+
else: logger.error(f"[API] Unexpected status {r.status_code}. Resp:{r.text[:200]}"); return None
|
329 |
+
except Exception as e: logger.error(f"[API] Unexpected error: {e}", exc_info=True); return None
|
|
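A minimal sketch of the web-content fallback order that process_summary_task (further below) applies with these three helpers: Crawl4AI first, then the BeautifulSoup scraper, then the urltotext.com API when a key is configured. The wrapper name is illustrative and not part of main.py.

async def fetch_page_text(url: str) -> Optional[str]:
    content = await get_website_content_via_crawl4ai(url)
    if not content:
        content = await get_website_content_bs4(url)
    if not content and _urltotext_fallback_enabled:
        content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
    return content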
|
|
|
|
|
330 |
|
331 |
# --- Summarization Functions ---
|
|
|
332 |
async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
|
333 |
global GEMINI_MODEL, _gemini_primary_enabled
|
334 |
+
if not _gemini_primary_enabled: logger.error("[Gemini] Disabled."); return None, "Error: Primary AI unavailable."
|
335 |
+
if len(text) > MAX_INPUT_TOKEN_APPROX: logger.warning(f"[Gemini] Truncating input..."); text = text[:MAX_INPUT_TOKEN_APPROX]
|
336 |
+
logger.info(f"[Gemini] Generating {summary_type} summary ({GEMINI_MODEL}). Input len: {len(text)}")
|
337 |
+
if summary_type == "paragraph": prompt = f"Summarise paragraph...\n{text}\nSummary:"
|
338 |
+
elif summary_type == "points": prompt = f"Summarise points...\n{text}\nSummary:"
|
339 |
+
else: logger.error(f"[Gemini] Invalid type: {summary_type}"); return None, f"Error: Invalid summary type."
|
340 |
+
safety_settings = { category: HarmBlockThreshold.BLOCK_NONE for category in HarmCategory }; logger.info("[Gemini] Safety settings disabled.")
|
341 |
+
gen_config = genai.types.GenerationConfig( max_output_tokens=2048, temperature=0.7 )
|
|
|
|
|
|
|
|
|
|
|
|
|
342 |
try:
|
343 |
+
model = genai.GenerativeModel(GEMINI_MODEL); logger.debug("[Gemini] Sending request...")
|
344 |
+
response = await model.generate_content_async( prompt, generation_config=gen_config, safety_settings=safety_settings )
|
345 |
+
if not response.candidates: # Check prompt feedback
|
346 |
+
block_reason=str(response.prompt_feedback.block_reason or "N/A") if hasattr(response,'prompt_feedback') else "Unknown"
|
347 |
+
logger.error(f"[Gemini] Response blocked (Prompt). Reason: {block_reason}"); return None, f"Error: Gemini blocked prompt ({block_reason})."
|
348 |
+
candidate = response.candidates[0] # Check candidate finish reason
|
349 |
+
finish_val = candidate.finish_reason; finish_str = str(finish_val).upper() if finish_val else "UNSPECIFIED"; logger.debug(f"[Gemini] Finish reason: {finish_val} -> {finish_str}")
|
350 |
+
ratings = ", ".join([f"{r.category.name}:{r.probability.name}" for r in candidate.safety_ratings]) if hasattr(candidate,'safety_ratings') and candidate.safety_ratings else "N/A"
|
351 |
+
success = any(finish_str.endswith(reason) for reason in ["STOP", "MAX_TOKENS"])
|
352 |
+
if not success: logger.error(f"[Gemini] Failed/Unexpected finish. Reason: {finish_str}. Safety: {ratings}"); return None, f"Error: Gemini failed ({finish_str})."
|
353 |
+
if finish_str.endswith("MAX_TOKENS"): logger.warning("[Gemini] Output truncated.")
|
354 |
+
summary = ""; extracted = False # Extract text
|
355 |
+
try: summary = response.text.strip(); extracted = True
|
356 |
+
except Exception as e: logger.warning(f"[Gemini] response.text error: {e}. Trying parts."); if candidate.content and candidate.content.parts: summary="".join(p.text for p in candidate.content.parts if hasattr(p,"text")).strip(); extracted = True
|
357 |
+
if not extracted or not summary: logger.warning(f"[Gemini] Empty summary despite finish '{finish_str}'."); return None, "Error: AI generated empty summary."
|
358 |
+
logger.info(f"[Gemini] Summary extracted: len {len(summary)}."); return summary, None
|
359 |
+
except Exception as e: logger.error(f"[Gemini] Error: {e}", exc_info=True); return None, f"Error: Failed communication with Gemini ({e})."
|
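The finish-reason handling above stringifies the value before comparing, so enum and string forms are treated alike; the same check restated as a standalone helper (illustrative name, identical logic):

def is_acceptable_finish(reason) -> bool:
    # str() first, then upper(): works whether the SDK hands back an enum or a plain string.
    reason_str = str(reason).upper() if reason is not None else "UNSPECIFIED"
    return any(reason_str.endswith(ok) for ok in ("STOP", "MAX_TOKENS"))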
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
360 |
|
361 |
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
|
|
|
362 |
global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
|
363 |
+
if not _openrouter_fallback_enabled: logger.error("[OR] Disabled."); return None, "Error: Fallback AI unavailable."
|
364 |
+
if len(text) > 100000: logger.warning("[OR] Truncating input..."); text = text[:100000]
|
365 |
+
logger.info(f"[OR] Generating {summary_type} summary ({OPENROUTER_MODEL}). Input len: {len(text)}")
|
366 |
+
if summary_type == "paragraph": prompt = f"Summarise paragraph...\n{text}\nSummary:"
|
367 |
+
elif summary_type == "points": prompt = f"Summarise points...\n{text}\nSummary:"
|
368 |
+
else: logger.error(f"[OR] Invalid type: {summary_type}"); return None, "Error: Invalid summary type."
|
369 |
+
headers={...}; payload={...}; api="https://openrouter.ai/api/v1/chat/completions" # Keep details
|
|
|
370 |
try:
|
371 |
+
async with httpx.AsyncClient(timeout=120.0) as c:
|
372 |
+
logger.debug("[OR] Sending request..."); r = await c.post(api, headers=headers, json=payload); logger.debug(f"[OR] Status {r.status_code}")
|
373 |
+
if r.status_code == 200:
|
374 |
+
try: # Parse success
|
375 |
+
data=r.json(); choice=data.get("choices",[{}])[0]; msg=choice.get("message"); finish=choice.get("finish_reason","N/A")
|
376 |
+
if msg and msg.get("content"): summary = msg["content"].strip()
|
377 |
+
else: summary = None
|
378 |
+
if summary: logger.info(f"[OR] Success: len {len(summary)}, Finish: {finish}"); if finish=='length': logger.warning("..."); return summary, None
|
379 |
+
else: logger.warning(f"[OR] Empty summary content. Data:{data}"); return None, "Error: Fallback AI empty summary."
|
380 |
+
except Exception as e: logger.error(f"[OR] Error processing success: {e}", exc_info=True); return None, f"Error: Failed processing Fallback AI response ({e})."
|
381 |
+
else: # Handle errors
|
382 |
+
err_msg = f"Error: Fallback AI ({OPENROUTER_MODEL}) status {r.status_code}."; try: details=r.json().get("error",{}).get("message",r.text[:200]); err_msg+=f" Details:{details}"; except Exception: err_msg+=f" Resp:{r.text[:200]}"; logger.error(f"[OR] {err_msg}"); return None, err_msg
|
383 |
+
except Exception as e: logger.error(f"[OR] Unexpected error: {e}", exc_info=True); return None, f"Error: Unexpected issue with Fallback AI ({e})."
|
|
|
|
|
|
|
|
|
384 |
|
385 |
async def generate_summary(text: str, summary_type: str) -> str:
|
|
|
386 |
global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
|
387 |
+
logger.info(f"[Summary] Starting..."); err_msg = None
|
388 |
+
if _gemini_primary_enabled: # Try Gemini
|
389 |
+
logger.info(f"[Summary] Trying Gemini ({GEMINI_MODEL})")
|
390 |
+
summary, err = await _call_gemini(text, summary_type)
|
391 |
+
if summary: logger.info("[Summary] Success with Gemini."); return summary
|
392 |
+
else: logger.warning(f"[Summary] Gemini failed: {err}. Falling back."); err_msg = f"Primary AI failed: {err}"
|
393 |
+
else: logger.warning("[Summary] Gemini disabled."); err_msg = "Primary AI unavailable."
|
394 |
+
if _openrouter_fallback_enabled: # Try OpenRouter
|
395 |
+
logger.info(f"[Summary] Trying OpenRouter ({OPENROUTER_MODEL})")
|
396 |
+
summary, err = await _call_openrouter(text, summary_type)
|
397 |
+
if summary: logger.info("[Summary] Success with OpenRouter."); return summary
|
398 |
+
else: logger.error(f"[Summary] OpenRouter also failed: {err}"); if err_msg: return f"{err_msg}\nFallback failed: {err}"; else: return f"Fallback AI failed: {err}"
|
399 |
+
else: logger.error("[Summary] OpenRouter disabled."); if err_msg: return f"{err_msg}\nFallback unavailable."; else: return "Error: Both AI unavailable."
|
400 |
+
logger.error("[Summary] Reached end unexpectedly."); final_err = err_msg or "Unknown error."; return f"Sorry, error: {final_err}"
|
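Callers tell a real summary apart from a failure by the "Error:"/"Sorry," prefixes that generate_summary returns when both AIs are unavailable or fail; a minimal sketch of that check (the wrapper is illustrative, process_summary_task below performs the same test):

async def summarize_or_report(content: str, summary_type: str = "paragraph") -> str:
    result = await generate_summary(content, summary_type)
    if result.startswith("Error:") or result.startswith("Sorry,"):
        logger.warning(f"Summary generation failed: {result}")
    return result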
|
|
401 |
|
402 |
# --- Main Processing Task ---
async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
    task_id = f"{user_id}-{message_id_to_edit or 'new'}"; logger.info(f"[Task {task_id}] Starting: {url}")
    bot: Optional[Bot] = None; content: Optional[str] = None; feedback: Optional[str] = None; success = False; original_msg_id = message_id_to_edit
    try: bot = Bot(token=bot_token, request=HTTPXRequest(...)) # Keep bot init
    except Exception as e: logger.critical(f"[Task {task_id}] Failed bot init: {e}", exc_info=True); return
    try: # Main task logic
        # Edit original msg to "Processing..."
        proc_msg = f"Generating '{summary_type}' summary...\nThis might take a moment..."
        if original_msg_id:
            try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=original_msg_id, text=proc_msg, ...); logger.debug(f"[Task {task_id}] Edited original msg {original_msg_id}")
            except Exception as e: logger.warning(f"[Task {task_id}] Failed edit original msg {original_msg_id}: {e}.")
        try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
        except Exception: pass

        # Get Content (YT or Web)
        is_yt = is_youtube_url(url); logger.debug(f"[Task {task_id}] Type: {'YT' if is_yt else 'Web'}")
        if is_yt: # YT Logic
            vid = extract_youtube_id(url)
            if vid: content = await get_youtube_transcript(vid, url)
            else: feedback = "Invalid YT URL."
            if not content and not feedback: feedback = "Could not get YT transcript."
        else: # Web Logic
            logger.info(f"[Task {task_id}] Trying Crawl4AI..."); content = await get_website_content_via_crawl4ai(url)
            if not content:
                logger.warning(f"[Task {task_id}] Crawl4AI fail. Trying BS4...")
                try: await retry_bot_operation(bot.send_chat_action,...)
                except Exception: pass
                content = await get_website_content_bs4(url)
            if not content: logger.warning(f"[Task {task_id}] BS4 fail. Trying API..."); global URLTOTEXT_API_KEY, _urltotext_fallback_enabled
            if not content and _urltotext_fallback_enabled:
                try: await retry_bot_operation(bot.send_chat_action,...)
                except Exception: pass
                content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
                if not content: feedback = "Fetch failed (Crawl4AI/BS4/API fail/credits?)."
            elif not content: feedback = "Fetch failed (Crawl4AI/BS4 fail, API disabled)."
            if not content and not feedback: feedback = "Could not fetch website content."

        # Generate Summary
        if content and not feedback:
            logger.info(f"[Task {task_id}] Got content (len:{len(content)}). Generating summary...")
            try: await retry_bot_operation(bot.send_chat_action,...)
            except Exception: pass
            final_summary = await generate_summary(content, summary_type)
            if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"): feedback = final_summary; logger.warning(f"[Task {task_id}] Summary gen failed: {feedback}"); success = False
            else: # Success - Split & Send
                parts = []; current = ""; lines = final_summary.splitlines(True) # Keep splitting
                for line in lines:
                    if len(current) + len(line) > MAX_SUMMARY_CHUNK_SIZE:
                        if current.strip(): parts.append(current.strip())
                        current = line[:MAX_SUMMARY_CHUNK_SIZE] if len(line) > MAX_SUMMARY_CHUNK_SIZE else line
                    else: current += line
                if current.strip(): parts.append(current.strip())
                if not parts: parts.append("Summary empty."); logger.warning(...)
                logger.info(f"[Task {task_id}] Summary OK (len:{len(final_summary)}). Sending {len(parts)} part(s).")
                edited = False # Edit original msg with first part
                if original_msg_id:
                    try: await retry_bot_operation(bot.edit_message_text, chat_id, original_msg_id, parts[0], ...); logger.debug(f"[Task {task_id}] Edited original msg {original_msg_id} part 1."); edited = True
                    except Exception as e: logger.warning(f"[Task {task_id}] Failed edit original msg {original_msg_id} part 1: {e}. Sending new.")
                if not edited: # Send first part as a new message if needed
                    sent = await retry_bot_operation(bot.send_message, chat_id, parts[0], ...)
                    if not sent: feedback = "Failed send summary."; success = False; logger.error(...)
                if not feedback and len(parts) > 1: # Send remaining parts
                    for i, part in enumerate(parts[1:], 2):
                        await asyncio.sleep(0.5)
                        try: await retry_bot_operation(bot.send_message, chat_id, part, ...); logger.debug(f"[Task {task_id}] Sent part {i}/{len(parts)}.")
                        except Exception as e: feedback = f"Failed send part {i}."; success = False; logger.error(...); break
                if not feedback: success = True # Confirm success

        # Handle Failures
        if not success:
            if not feedback: feedback = "Unknown error."; logger.error(...)
            logger.warning(f"[Task {task_id}] Sending failure feedback: {feedback}")
            try: # Edit original msg with error
                edited_err = False
                if original_msg_id:
                    try: await retry_bot_operation(bot.edit_message_text, chat_id, original_msg_id, feedback, ...); logger.debug(f"[Task {task_id}] Edited original msg {original_msg_id} with error."); edited_err = True
                    except Exception as e: logger.warning(f"[Task {task_id}] Failed edit original msg {original_msg_id} with error: {e}. Sending new.")
                if not edited_err: await retry_bot_operation(bot.send_message, chat_id, feedback, ...); logger.debug(f"[Task {task_id}] Sent error new msg.")
            except Exception as e: logger.error(f"[Task {task_id}] Failed even sending error feedback: {e}")

    except Exception as e: # Catch-all for task
        logger.error(f"[Task {task_id}] Unexpected task error: {e}", exc_info=True); success = False; feedback = "Oops! Unexpected error..."
        if bot:
            try:
                edited = False
                if original_msg_id:
                    try: await retry_bot_operation(bot.edit_message_text,...); edited = True
                    except Exception: pass
                if not edited: await retry_bot_operation(bot.send_message,...)
            except Exception as final_e: logger.error(f"[Task {task_id}] Failed final crash error feedback: {final_e}")
    finally: # Cleanup
        if bot and bot.request and hasattr(bot.request, '_client') and bot.request._client:
            try: await bot.request._client.aclose(); logger.debug(f"[Task {task_id}] BG client closed.")
            except Exception as e: logger.warning(f"[Task {task_id}] Error closing BG client: {e}")
        logger.info(f"[Task {task_id}] Completed. Success: {success}")

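# Hypothetical illustration (assumption): `retry_bot_operation` is defined earlier in main.py and
# is not shown in this part of the diff. A minimal sketch of what such a retry wrapper could look
# like, using the PTB error classes already imported for error_handler below; the real helper's
# signature and behaviour may differ.
async def _example_retry_bot_operation(func, *args, max_attempts: int = 3, **kwargs):
    """Sketch: retry a bot call on transient Telegram/network errors with a short backoff."""
    for attempt in range(1, max_attempts + 1):
        try:
            return await func(*args, **kwargs)
        except RetryAfter as e:
            await asyncio.sleep(e.retry_after + 0.5)  # Respect Telegram flood control
        except (TimedOut, NetworkError):
            if attempt == max_attempts: raise
            await asyncio.sleep(1.0 * attempt)  # Simple linear backoff
    return None
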
# --- Telegram Handlers ---
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    user = update.effective_user
    if not user or not update.message: return
    mention = user.mention_html()
    logger.info(f"User {user.id} ({user.username or 'N/A'}) /start.")
    await update.message.reply_html( f"👋 {mention}! Send YT/website link to summarise." )

async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    user = update.effective_user
    if not user or not update.message: return
    logger.info(f"User {user.id} ({user.username or 'N/A'}) /help.")
    help_text = ( "🔍 **How:**\n1. Send link.\n2. Choose type.\n3. Wait.\n\n"
                  "⚙️ **Tech:**\n"
                  f"• Web: `Crawl4AI` (ignores robots), `BS4`, `urltotext`.\n"
                  f"• YT: `youtube-transcript-api`, `Supadata`, `Apify`.\n"
                  f"• AI: `{GEMINI_MODEL}`, `{OPENROUTER_MODEL}`.\n\n"
                  "`/start`, `/help`" )
    await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)

async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    if not update.message or not update.message.text: return
    url = update.message.text.strip(); user = update.effective_user
    if not user: return
    if not re.match(r'https?://[^\s/$.?#].[^\s]*', url, re.I): logger.debug(...); await update.message.reply_text("Invalid URL..."); return
    logger.info(f"User {user.id} sent URL: {url}")
    context.user_data['url_to_summarize'] = url; context.user_data['original_message_id'] = update.message.message_id
    keys = [[ InlineKeyboardButton("Para", callback_data="paragraph"), InlineKeyboardButton("Points", callback_data="points") ]]
    markup = InlineKeyboardMarkup(keys)
    await update.message.reply_html( f"Link:\n<code>{html.escape(url)}</code>\n\nSummary type?", reply_markup=markup, disable_web_page_preview=True )

async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    query = update.callback_query
    if not query or not query.message or not query.from_user: logger.warning("Callback missing data."); return
    user = query.from_user; summary_type = query.data; qid = query.id
    try: await query.answer(); logger.debug(f"Ack cb {qid} from {user.id}")
    except Exception as e: logger.warning(f"Err answering cb {qid}: {e}")
    url = context.user_data.get('url_to_summarize'); msg_id = query.message.message_id
    logger.info(f"User {user.id} chose '{summary_type}' for msg {msg_id}. URL context: {'Yes' if url else 'No'}")
    if not url:
        logger.warning(f"No URL context user {user.id} (cb {qid}).")
        try: await query.edit_message_text("Context lost...", reply_markup=None)
        except Exception as e: logger.error(f"Failed edit 'URL not found': {e}")
        return

    global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
    # Expanded configuration checks
    if not TELEGRAM_TOKEN:
        logger.critical("TG TOKEN missing!")
        try: await query.edit_message_text("❌ Bot config error (Token).", reply_markup=None)
        except Exception as e: logger.error(f"Failed edit msg for TOKEN error: {e}")
        return
    if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
        logger.critical("No AI models available!")
        try: await query.edit_message_text("❌ AI config error (Models).", reply_markup=None)
        except Exception as e: logger.error(f"Failed edit msg for AI config error: {e}")
        return
    if not _gemini_primary_enabled: logger.warning("Primary AI unavailable, using fallback.")
    elif not _openrouter_fallback_enabled: logger.warning("Fallback AI unavailable, using primary.")

    logger.info(f"Scheduling task user {user.id}...")
    asyncio.ensure_future( process_summary_task( user_id=user.id, chat_id=query.message.chat_id, message_id_to_edit=msg_id, url=url, summary_type=summary_type, bot_token=TELEGRAM_TOKEN ) )
    context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None)
    logger.debug(f"Cleared context user {user.id} post-schedule.")

async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    ignore_errors = (AttributeError, BadRequest, TimedOut, NetworkError, RetryAfter)
    if isinstance(context.error, ignore_errors): # Keep ignore logic
        ignore_messages = ["message is not modified", ...]
        err_str = str(context.error).lower()
        if any(msg in err_str for msg in ignore_messages) or isinstance(context.error, (TimedOut, NetworkError, RetryAfter)):
            logger.warning(f"Ignoring known/transient error: {context.error}"); return
    logger.error("Exception handling update:", exc_info=context.error)

# --- Application Setup ---
# (setup_bot_config, lifespan, health_check, telegram_webhook, ASGI App Def, Dev Runner) - No changes needed
async def setup_bot_config() -> Application:
    global TELEGRAM_TOKEN
    logger.info("Configuring Telegram App...")
    if not TELEGRAM_TOKEN: raise ValueError("TG TOKEN missing.")
    req = HTTPXRequest(...); app = ( Application.builder().token(TELEGRAM_TOKEN).request(req).build() )
    app.add_handler(CommandHandler("start", start)); app.add_handler(CommandHandler("help", help_command))
    url_filter = filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link") | filters.Regex(r'https?://[^\s]+'))
    app.add_handler(MessageHandler(url_filter, handle_potential_url)); app.add_handler(CallbackQueryHandler(handle_summary_type_callback))
    app.add_error_handler(error_handler); logger.info("TG handlers configured."); return app

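# Illustrative sketch (assumption): the HTTPXRequest settings are elided as "(...)" in this diff.
# python-telegram-bot's HTTPXRequest accepts timeout/pool options along these lines; the actual
# values used by the bot are not shown here.
_example_request = HTTPXRequest(
    connect_timeout=10.0,    # time allowed to establish the connection
    read_timeout=30.0,       # time allowed to wait for Telegram's response
    write_timeout=30.0,      # time allowed to send the request body
    connection_pool_size=8,  # concurrent connections kept in the pool
)
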
@contextlib.asynccontextmanager
async def lifespan(app: Starlette):
    global ptb_app, ...; logger.info("Lifespan: Startup...") # Keep lifespan logic
    if not TELEGRAM_TOKEN: raise RuntimeError("TG token missing.")
    try:
        ptb_app = await setup_bot_config(); await ptb_app.initialize(); bot_info = await ptb_app.bot.get_me(); logger.info(f"Bot init: @{bot_info.username}")
        # Webhook setup logic remains the same
        current_info = await ptb_app.bot.get_webhook_info(); deleted_ok = True
        if current_info and current_info.url:
            logger.info(f"Deleting webhook: {current_info.url}...")
            try: deleted_ok = await ptb_app.bot.delete_webhook(...); logger.info("WH deleted." if deleted_ok else "WH delete fail.")
            except Exception as e: logger.warning(f"WH delete err: {e}"); deleted_ok = False
        host = os.environ.get("SPACE_HOST"); path = "/webhook"
        if not host: raise RuntimeError("SPACE_HOST missing.")
        wh_url = f"https://{host.split('://')[-1].rstrip('/')}{path}"
        if wh_url and deleted_ok:
            logger.info(f"Setting webhook: {wh_url}"); args = {...}
            if WEBHOOK_SECRET: args["secret_token"] = WEBHOOK_SECRET
            await asyncio.sleep(1.0)
            try:
                if not await ptb_app.bot.set_webhook(**args): raise RuntimeError("set_webhook False.")
                await asyncio.sleep(1.5); info = await ptb_app.bot.get_webhook_info()
                if not (info and info.url == wh_url): raise RuntimeError(f"WH verify fail! Expected '{wh_url}', Got: {info}")
                logger.info(f"WH set & verified: URL='{info.url}', Secret={'YES' if WEBHOOK_SECRET else 'NO'}")
                if info.last_error_message: logger.warning(...)
                await ptb_app.start(); logger.info("PTB started (webhook).")
            except Exception as e: logger.error(f"FATAL: WH setup error: {e}", exc_info=True); raise RuntimeError(f"WH setup fail: {e}") from e
        elif not deleted_ok: raise RuntimeError("Failed previous WH delete.")
        logger.info("Lifespan: Startup complete."); yield
    except Exception as startup_err: logger.critical(f"Startup fail: {startup_err}", exc_info=True); raise
    finally: # Shutdown
        logger.info("Lifespan: Shutdown...")
        if ptb_app:
            try:
                if ptb_app.running: await ptb_app.stop()
                if ptb_app._initialized: await ptb_app.shutdown()
                logger.info("PTB stopped/shutdown.")
            except Exception as e: logger.error(...)
        logger.info("Lifespan: Shutdown complete.")

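# Illustrative sketch (assumption): the webhook arguments are elided as "{...}" above. With
# python-telegram-bot's Bot.set_webhook, a plausible shape for `args` would be the following;
# the actual keys used by this Space are not shown in the diff.
_example_webhook_args = {
    "url": "https://<SPACE_HOST>/webhook",              # built from SPACE_HOST at startup
    "allowed_updates": ["message", "callback_query"],   # only the updates this bot handles
    "drop_pending_updates": True,                       # ignore anything queued while offline
}
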
async def health_check(request: Request) -> PlainTextResponse:
    global OPENROUTER_MODEL, ...; bot_status = "Not Initialized"; bot_username = "N/A" # Keep health check logic
    if ptb_app and ptb_app.bot and ptb_app._initialized:
        try: wh_info = await ptb_app.bot.get_webhook_info(); ... # Keep status check
        except Exception as e: ...; bot_status = f"Error: {e}"
    elif ptb_app: bot_status = "Initializing..."
    info = [ f"=== Bot Status ===", f"Application: {bot_status}", "--- Services ---", f"Web Scraper: {'Crawl4AI (ignore_robots)' if _crawl4ai_primary_web_enabled else 'DISABLED'}", ...] # Keep health info format
    return PlainTextResponse("\n".join(info))

async def telegram_webhook(request: Request) -> Response:
    global WEBHOOK_SECRET, ptb_app # Keep webhook logic
    if not ptb_app or not ptb_app._initialized or not ptb_app.running: status = ...; logger.error(...); return PlainTextResponse(f'Bot {status}', 503)
    if WEBHOOK_SECRET: ... # Keep secret check
    try: data = await request.json(); update = Update.de_json(data, ptb_app.bot); logger.debug(...); await ptb_app.process_update(update); return Response(status_code=200)
    except json.JSONDecodeError: logger.error(...); return PlainTextResponse('Bad Request', 400)
    except Exception as e: logger.error(f"Webhook proc error: {e}", exc_info=True); return Response(status_code=200) # OK to TG

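# Illustrative sketch (assumption): the secret check elided as "..." above typically compares
# Telegram's X-Telegram-Bot-Api-Secret-Token header with WEBHOOK_SECRET; the exact handling in
# the original code is not shown.
def _example_check_webhook_secret(request: Request) -> bool:
    """Sketch: reject webhook calls whose secret header does not match WEBHOOK_SECRET."""
    received = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
    return bool(WEBHOOK_SECRET) and received == WEBHOOK_SECRET
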
# --- ASGI App Definition ---
app = Starlette( debug=False, lifespan=lifespan, routes=[ Route("/", health_check), Route("/webhook", telegram_webhook, methods=["POST"]), ] )

# --- Development Runner ---
if __name__ == '__main__':
    import uvicorn; logger.warning("Running DEV mode...") # Keep dev runner
    log_level = ...; port = ...
    try: from dotenv import load_dotenv; load_dotenv(); logger.info(".env loaded.")
    except ImportError: logger.info(".env not loaded.")
    if not get_secret('TELEGRAM_TOKEN'): logger.critical("Local Dev: TOKEN missing.") # Keep checks
    if not get_secret('GEMINI_API_KEY'): logger.error("Local Dev: GEMINI_API_KEY missing.")
    uvicorn.run( "main:app", host='0.0.0.0', port=port, log_level=log_level, reload=True )
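
# Illustrative sketch (assumption): the `log_level` / `port` values are elided as "..." above; for
# a local dev run they would typically be read from the environment with defaults like these. The
# variable and env-var names here are assumptions, not the original code.
_example_log_level = os.environ.get("LOG_LEVEL", "info")
_example_port = int(os.environ.get("PORT", 8000))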