Update main.py
main.py (CHANGED)
@@ -53,12 +53,13 @@ except ImportError:
 # --- Google Gemini ---
 try:
     import google.generativeai as genai
-    from google.generativeai.types import HarmCategory, HarmBlockThreshold
+    from google.generativeai.types import HarmCategory, HarmBlockThreshold, GenerateContentResponse
     _gemini_available = True
 except ImportError:
     genai = None
     HarmCategory = None
     HarmBlockThreshold = None
+    GenerateContentResponse = None # Add this for type hinting if needed
     _gemini_available = False
     # logger defined later
 
@@ -111,7 +112,8 @@ GEMINI_API_KEY = get_secret('GEMINI_API_KEY') # Primary Summarizer
 # Models (User can still configure via env vars)
 OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Model
 APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
-GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.0-flash-001")
+GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-1.5-flash-latest") # Use the 1.5 flash model directly
+# Using gemini-1.5-flash-latest is generally recommended over gemini-2.0-flash-001
 
 # --- Configuration Checks ---
 if not TELEGRAM_TOKEN: logger.critical("β FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
@@ -155,6 +157,10 @@ if _gemini_primary_enabled:
         logger.error(f"Failed to configure Google GenAI client: {e}")
         _gemini_primary_enabled = False
 
+# --- Constants ---
+MAX_SUMMARY_CHUNK_SIZE = 4000 # Max characters per Telegram message (allow buffer)
+MAX_INPUT_TOKEN_APPROX = 1000000 # Gemini 1.5 Flash context window (approx chars) - adjust if needed
+
 # --- Retry Decorator ---
 # (Remains the same)
 @retry(
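(Context for the constants introduced above: the 4000-character chunk size sits just under Telegram's 4096-character message cap, which the previous revision hard-coded as max_length; a tiny sketch of that headroom, with the 4096 figure taken from that earlier constant rather than re-checked here.)

TELEGRAM_MESSAGE_LIMIT = 4096   # the old max_length constant from the previous revision
CHUNK_HEADROOM = TELEGRAM_MESSAGE_LIMIT - MAX_SUMMARY_CHUNK_SIZE   # = 96 characters spare per part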
@@ -333,12 +339,37 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
         return None
 
     logger.info(f"[Crawl4AI Primary] Attempting to crawl URL: {url}")
+    # Define a writable cache directory (use /tmp in container environments)
+    # Create the directory path beforehand to avoid potential race conditions or permission issues within the library
+    cache_dir_path = "/tmp/.crawl4ai" # CHANGED: Use /tmp
+    try:
+        os.makedirs(cache_dir_path, exist_ok=True)
+        logger.info(f"[Crawl4AI Primary] Ensured cache directory exists: {cache_dir_path}")
+    except OSError as e:
+        logger.error(f"[Crawl4AI Primary] Failed to create cache directory {cache_dir_path}: {e}. Crawl may fail.")
+        # Don't return here, let the crawler try anyway, it might handle it internally or use default
+    except Exception as e:
+        logger.error(f"[Crawl4AI Primary] Unexpected error creating cache directory {cache_dir_path}: {e}")
+
+
     try:
-        # Use AsyncWebCrawler context manager
-        async with AsyncWebCrawler() as crawler:
+        # Use AsyncWebCrawler context manager with explicit cache_dir
+        # NOTE: Pass cache_dir here if the library supports it via __init__ or a config object
+        # Checking crawl4ai docs/source, AsyncWebCrawler doesn't directly take cache_dir in __init__.
+        # It seems to rely on environment variables or default home resolution.
+        # The PermissionError happens in RobotsParser -> get_home_folder -> os.makedirs.
+        # WORKAROUND: We might need to adjust the environment or hope setting HOME=/app in Dockerfile is enough
+        # *if* the library correctly uses HOME. Let's test *without* explicit cache_dir first,
+        # relying on HOME=/app and the prior os.makedirs call. If it still fails, we need a different approach.
+
+        # UPDATE: The traceback shows it uses utils.get_home_folder(). Let's stick with HOME=/app for now
+        # and see if the permission error was transient or specific to the '.models' subdir.
+        # If it persists, we might need to fork/modify crawl4ai or find another way to configure its paths.
+
+        # Let's *try* passing cache_dir anyway, maybe it's an undocumented/newer feature
+        async with AsyncWebCrawler(cache_dir=cache_dir_path) as crawler: # TRY passing cache_dir
+            logger.info(f"[Crawl4AI Primary] Initialized with explicit cache_dir: {cache_dir_path}")
             # Use arun for a single URL crawl
-            # We primarily want the Markdown output as it's designed for LLMs
-            # Add a reasonable timeout
             result = await crawler.arun(url=url, crawler_strategy="playwright", timeout=90) # 90 sec timeout
 
             if result and result.markdown:
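(Aside on the cache-directory comments above: a minimal, hedged sketch of the environment-based workaround they describe, i.e. pointing HOME at a writable location and pre-creating the cache folder before crawl4ai is imported. That crawl4ai resolves its folders from HOME is an assumption taken from the traceback discussed in those comments, not something this diff verifies.)

import os

# Assumption: crawl4ai derives its cache/model folders from the home directory,
# so HOME must point somewhere writable inside the container (e.g. /tmp).
os.environ.setdefault("HOME", "/tmp")
os.makedirs("/tmp/.crawl4ai", exist_ok=True)

# Import only after the environment is prepared, so import-time path resolution
# already sees the writable HOME.
from crawl4ai import AsyncWebCrawler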
@@ -363,9 +394,15 @@ async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
     except asyncio.TimeoutError:
         logger.error(f"[Crawl4AI Primary] Timeout occurred while crawling {url}")
         return None
+    except PermissionError as e: # Catch the specific error
+        logger.error(f"[Crawl4AI Primary] Permission denied during crawl for {url}. Likely filesystem issue in container. Error: {e}", exc_info=True)
+        return None # Fail gracefully for this method
     except Exception as e:
+        # Log type error if cache_dir isn't accepted
+        if "unexpected keyword argument 'cache_dir'" in str(e):
+            logger.error(f"[Crawl4AI Primary] AsyncWebCrawler does not accept 'cache_dir'. Remove this argument. Error: {e}")
+        else:
+            logger.error(f"[Crawl4AI Primary] Unexpected error during crawl for {url}: {e}", exc_info=True)
         return None
 
 
@@ -381,8 +418,8 @@ async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
             response.raise_for_status()
             content_type = response.headers.get('content-type', '').lower()
             if 'html' not in content_type: logger.warning(f"[Web Scrape BS4] Non-HTML content type from {url}: {content_type}"); return None
-            try: return response.text
-            except Exception as e: logger.error(f"[Web Scrape BS4] Error
+            try: return response.text # Use response.text to let httpx handle decoding
+            except Exception as e: logger.error(f"[Web Scrape BS4] Error getting response text for {url}: {e}"); return None
     except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape BS4] HTTP error {e.response.status_code} fetching {url}: {e}")
     except httpx.TimeoutException: logger.error(f"[Web Scrape BS4] Timeout error fetching {url}")
     except httpx.TooManyRedirects: logger.error(f"[Web Scrape BS4] Too many redirects fetching {url}")
@@ -392,6 +429,7 @@ async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
 
 async def get_website_content_bs4(url: str) -> Optional[str]:
     """Fetches and parses website content using BeautifulSoup (Fallback 1)."""
+    # ... (Keep existing implementation) ...
     if not url: logger.error("[BS4 Fallback] get_website_content_bs4: No URL"); return None
     logger.info(f"[BS4 Fallback] Attempting basic fetch & parse for: {url}")
     html_content = await fetch_url_content_for_scrape(url)
@@ -404,20 +442,26 @@ async def get_website_content_bs4(url: str) -> Optional[str]:
     def parse_html(content):
         soup = BeautifulSoup(content, DEFAULT_PARSER)
         # Remove common non-content elements
-        for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "iframe", "img", "svg", "link", "meta", "noscript", "figure"]):
+        for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]):
             element.extract()
-        # Try to find main content areas
-        target_element =
+        # Try to find main content areas more broadly
+        selectors = ['main', 'article', '[role="main"]', '#content', '.content', '#main-content', '.main-content', '#body', '.body', '#article-body', '.article-body']
+        target_element = None
+        for selector in selectors:
+            target_element = soup.select_one(selector)
+            if target_element: break
+
+        if not target_element: target_element = soup.body # Fallback to body
+        if not target_element: logger.warning(f"[BS4 Fallback] Could not find body/main for parsing {url}"); return None
+
+        # Extract text, clean up whitespace aggressively
         lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
-        text = " ".join(lines)
+        text = " ".join(lines) # Join lines with spaces
+
+        # Basic post-cleaning
+        text = re.sub(r'\s{2,}', ' ', text).strip() # Replace multiple spaces with single space
+
+        if not text: logger.warning(f"[BS4 Fallback] Extracted text is empty after cleaning for {url}"); return None
         return text
 
     text_content = await asyncio.to_thread(parse_html, html_content)
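(For reference, a minimal way to exercise the BeautifulSoup fallback above outside the bot; it assumes main.py is importable as a module with its logging and DEFAULT_PARSER setup already done, and the URL is only an example.)

import asyncio
from main import get_website_content_bs4   # hypothetical import path for local testing

text = asyncio.run(get_website_content_bs4("https://example.com"))
print((text or "<no content>")[:300])   # show the first few hundred characters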
@@ -434,7 +478,7 @@ async def get_website_content_bs4(url: str) -> Optional[str]:
 # Fallback 2: urltotext.com API
 async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
     """Fetches website content using urltotext.com API (Fallback 2)."""
-    # ... (Keep existing implementation
+    # ... (Keep existing implementation) ...
     if not url: logger.error("[API Fallback] No URL"); return None
     if not api_key: logger.error("[API Fallback] urltotext.com API key missing."); return None
     logger.info(f"[API Fallback] Attempting fetch for: {url} using urltotext.com API")
@@ -455,83 +499,286 @@ async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
                 else: logger.warning(f"[API Fallback] urltotext.com API success but content empty for {url}. Resp: {data}"); return None
             except json.JSONDecodeError: logger.error(f"[API Fallback] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None
             except Exception as e: logger.error(f"[API Fallback] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None
-        elif response.status_code
+        elif response.status_code == 402: # Specifically handle insufficient credits
+            logger.error(f"[API Fallback] Error 402 (Insufficient Credits) from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
+        elif response.status_code in [400, 401, 403, 422, 500]: logger.error(f"[API Fallback] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
         else: logger.error(f"[API Fallback] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
     except httpx.TimeoutException: logger.error(f"[API Fallback] Timeout connecting to urltotext.com API for {url}"); return None
     except httpx.RequestError as e: logger.error(f"[API Fallback] Request error connecting to urltotext.com API for {url}: {e}"); return None
     except Exception as e: logger.error(f"[API Fallback] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
 
 # --- Summarization Functions ---
-# (_call_gemini, _call_openrouter, generate_summary remain the same)
 async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
+    """ Calls the Google Gemini API to generate a summary. """
     global GEMINI_MODEL, _gemini_primary_enabled
     if not _gemini_primary_enabled:
         logger.error("[Gemini Primary] Called but is disabled.");
         return None, "Error: Primary AI service (Gemini) not configured/available."
+
+    # Truncate input text if it exceeds the approximate limit
+    if len(text) > MAX_INPUT_TOKEN_APPROX:
+        logger.warning(f"[Gemini Primary] Input text length ({len(text)}) exceeds limit ({MAX_INPUT_TOKEN_APPROX}). Truncating.")
+        text = text[:MAX_INPUT_TOKEN_APPROX]
+
     logger.info(f"[Gemini Primary] Generating {summary_type} summary using {GEMINI_MODEL}. Input length: {len(text)}")
-    # Define prompts (Keep existing prompts)
-    if summary_type == "paragraph": prompt = ("...") # Your existing paragraph prompt
-    else: prompt = ("...") # Your existing points prompt
-    # ... (rest of the Gemini call logic remains the same) ...
-    # Including length check, safety settings, API call, response handling
 
+    # Define prompts
+    if summary_type == "paragraph":
+        prompt = f"""Please summarise the following text into a concise paragraph. Focus on the main points and key information. Avoid unnecessary jargon or overly complex sentences.
+
+Text to summarise:
+---
+{text}
+---
+
+Concise Paragraph Summary:"""
+    elif summary_type == "points":
+        prompt = f"""Please summarise the following text into a list of key bullet points. Each point should capture a distinct main idea or important piece of information. Aim for clarity and conciseness.
+
+Text to summarise:
+---
+{text}
+---
+
+Key Bullet Points Summary:"""
+    else:
+        logger.error(f"[Gemini Primary] Invalid summary_type: {summary_type}")
+        return None, f"Error: Invalid summary type '{summary_type}' specified."
+
+    # Configure safety settings (adjust as needed)
+    safety_settings = {
+        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
+        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
+        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
+        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
+    }
+
+    # Configure generation settings (optional)
+    generation_config = genai.types.GenerationConfig(
+        # candidate_count=1, # Default is 1
+        # stop_sequences=["\n"],
+        max_output_tokens=2048, # Increased max tokens for potentially longer summaries from large inputs
+        temperature=0.7, # Adjust creativity vs factualness
+        # top_p=1.0, # Default
+        # top_k=None # Default
+    )
+
+    try:
+        model = genai.GenerativeModel(GEMINI_MODEL)
+        logger.debug(f"[Gemini Primary] Sending request to model {GEMINI_MODEL}")
+        response: GenerateContentResponse = await model.generate_content_async( # Use async version
+            prompt,
+            generation_config=generation_config,
+            safety_settings=safety_settings
+        )
+        logger.debug(f"[Gemini Primary] Received response. Finish reason: {response.candidates[0].finish_reason if response.candidates else 'N/A'}")
+
+        # Check for safety blocks or other issues in response
+        if not response.candidates:
+            block_reason = response.prompt_feedback.block_reason if hasattr(response, 'prompt_feedback') else 'Unknown'
+            error_msg = f"Error: Gemini response blocked or empty. Reason: {block_reason}"
+            logger.error(f"[Gemini Primary] {error_msg}")
+            return None, error_msg
+
+        # Check finish reason (e.g., MAX_TOKENS, SAFETY)
+        finish_reason = response.candidates[0].finish_reason
+        if finish_reason != genai.types.FinishReason.STOP and finish_reason != genai.types.FinishReason.MAX_TOKENS:
+            # Log safety ratings if available
+            safety_ratings_str = "N/A"
+            if hasattr(response.candidates[0], 'safety_ratings'):
+                safety_ratings_str = ', '.join([f"{r.category.name}: {r.probability.name}" for r in response.candidates[0].safety_ratings])
+            error_msg = f"Error: Gemini generation finished unexpectedly. Reason: {finish_reason.name}. Safety: {safety_ratings_str}"
+            logger.error(f"[Gemini Primary] {error_msg}")
+            # Return partial text if available and finish reason is MAX_TOKENS? Maybe not, could be truncated badly.
+            # If SAFETY, definitely return error.
+            if finish_reason == genai.types.FinishReason.SAFETY:
+                return None, error_msg # Return specific error for safety blocks
+            # For other reasons, maybe return partial, but safer to return error for now
+            # return response.text if hasattr(response, 'text') else None, error_msg # Optional: return partial text for RECITATION/OTHER
+            return None, f"Error: Gemini generation finished unexpectedly ({finish_reason.name})."
+
+        # Extract text
+        summary_text = response.text
+        if not summary_text or not summary_text.strip():
+            logger.warning("[Gemini Primary] Gemini returned an empty summary.")
+            return None, "Error: AI generated an empty summary."
+
+        logger.info(f"[Gemini Primary] Summary generated successfully (len: {len(summary_text)}).")
+        return summary_text.strip(), None
+
+    except Exception as e:
+        logger.error(f"[Gemini Primary] Error during API call to {GEMINI_MODEL}: {e}", exc_info=True)
+        # Check for specific Google API errors if needed
+        # from google.api_core import exceptions as google_exceptions
+        # if isinstance(e, google_exceptions.GoogleAPIError): ...
+        return None, f"Error: Failed to communicate with the primary AI service (Gemini). Details: {e}"
 
 
 async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
+    """ Calls the OpenRouter API to generate a summary. """
     global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
     if not _openrouter_fallback_enabled:
         logger.error("[OpenRouter Fallback] Called but is disabled.");
         return None, "Error: Fallback AI service (OpenRouter) not configured/available."
+
+    # OpenRouter models might have smaller context windows, truncate more aggressively if needed
+    # Example: 32k tokens ~ 120k chars. Deepseek is large though. Check model specifics if issues arise.
+    max_input_len_openrouter = 100000 # Adjust based on OPENROUTER_MODEL limits if known
+    if len(text) > max_input_len_openrouter:
+        logger.warning(f"[OpenRouter Fallback] Input text length ({len(text)}) exceeds approx limit ({max_input_len_openrouter}) for {OPENROUTER_MODEL}. Truncating.")
+        text = text[:max_input_len_openrouter]
+
     logger.info(f"[OpenRouter Fallback] Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
-    # Define prompts (Keep existing prompts)
-    if summary_type == "paragraph": prompt = ("...") # Your existing paragraph prompt
-    else: prompt = ("...") # Your existing points prompt
-    # ... (rest of the OpenRouter call logic remains the same) ...
-    # Including length check, headers, payload, API call, response handling
 
+    # Define prompts (similar structure to Gemini)
+    if summary_type == "paragraph":
+        prompt_content = f"""Please summarise the following text into a concise paragraph. Focus on the main points and key information.
+
+Text:
+---
+{text}
+---
+
+Concise Paragraph Summary:"""
+    elif summary_type == "points":
+        prompt_content = f"""Please summarise the following text into a list of key bullet points. Each point should capture a distinct main idea.
+
+Text:
+---
+{text}
+---
+
+Key Bullet Points Summary:"""
+    else:
+        logger.error(f"[OpenRouter Fallback] Invalid summary_type: {summary_type}")
+        return None, f"Error: Invalid summary type '{summary_type}' specified."
+
+    headers = {
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+        "Content-Type": "application/json",
+        "HTTP-Referer": "https://github.com/fmab777/telegram-summary-bot", # Optional: Identify your app
+        "X-Title": "Telegram Summary Bot", # Optional: Identify your app
+    }
+    payload = {
+        "model": OPENROUTER_MODEL,
+        "messages": [
+            {"role": "system", "content": "You are an expert summarizer. Provide summaries as requested."},
+            {"role": "user", "content": prompt_content}
+        ],
+        "max_tokens": 2048, # Adjust as needed
+        "temperature": 0.7,
+    }
+
+    api_url = "https://openrouter.ai/api/v1/chat/completions"
+
+    try:
+        async with httpx.AsyncClient(timeout=120.0) as client: # Longer timeout for potentially slower models
+            logger.debug(f"[OpenRouter Fallback] Sending request to {api_url} for model {OPENROUTER_MODEL}")
+            response = await client.post(api_url, headers=headers, json=payload)
+            logger.debug(f"[OpenRouter Fallback] Received status code {response.status_code}")
+
+            if response.status_code == 200:
+                try:
+                    data = response.json()
+                    if data.get("choices") and len(data["choices"]) > 0:
+                        message = data["choices"][0].get("message")
+                        if message and message.get("content"):
+                            summary_text = message["content"].strip()
+                            if summary_text:
+                                finish_reason = data["choices"][0].get("finish_reason", "N/A")
+                                logger.info(f"[OpenRouter Fallback] Summary generated successfully (len: {len(summary_text)}). Finish: {finish_reason}")
+                                # Check for length finish reason?
+                                if finish_reason == 'length':
+                                    logger.warning("[OpenRouter Fallback] Summary may be truncated due to max_tokens limit.")
+                                return summary_text, None
+                            else:
+                                logger.warning("[OpenRouter Fallback] OpenRouter returned an empty summary content.")
+                                return None, "Error: Fallback AI generated an empty summary."
+                        else:
+                            logger.error(f"[OpenRouter Fallback] Invalid response structure (missing message/content). Data: {data}")
+                            return None, "Error: Fallback AI returned an invalid response format."
+                    else:
+                        logger.error(f"[OpenRouter Fallback] Invalid response structure (missing choices). Data: {data}")
+                        # Check for error object in response
+                        api_error = data.get("error", {}).get("message", "Unknown API error")
+                        return None, f"Error: Fallback AI response missing summary. API msg: {api_error}"
+
+                except json.JSONDecodeError:
+                    logger.error(f"[OpenRouter Fallback] Failed to decode JSON response. Status: {response.status_code}, Text: {response.text[:500]}")
+                    return None, "Error: Fallback AI sent an invalid JSON response."
+                except Exception as e:
+                    logger.error(f"[OpenRouter Fallback] Error processing success response: {e}", exc_info=True)
+                    return None, f"Error: Failed to process Fallback AI response. Details: {e}"
+
+            else:
+                # Handle API errors (rate limits, auth, etc.)
+                error_message = f"Error: Fallback AI service ({OPENROUTER_MODEL}) returned status {response.status_code}."
+                try:
+                    error_details = response.json().get("error", {}).get("message", response.text[:200])
+                    error_message += f" Details: {error_details}"
+                except Exception:
+                    error_message += f" Response: {response.text[:200]}"
+                logger.error(f"[OpenRouter Fallback] {error_message}")
+                return None, error_message
+
+    except httpx.TimeoutException:
+        logger.error(f"[OpenRouter Fallback] Timeout connecting to OpenRouter API for {OPENROUTER_MODEL}")
+        return None, "Error: Timed out connecting to the fallback AI service."
+    except httpx.RequestError as e:
+        logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}")
+        return None, f"Error: Network error connecting to the fallback AI service. Details: {e}"
+    except Exception as e:
+        logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter API call: {e}", exc_info=True)
+        return None, f"Error: Unexpected issue with the fallback AI service. Details: {e}"
 
 
 async def generate_summary(text: str, summary_type: str) -> str:
+    """ Generates a summary using the primary AI (Gemini) and falling back to OpenRouter. """
     global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
     logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})")
-    final_summary: Optional[str] = None
+    final_summary: Optional[str] = None
+    error_message: Optional[str] = None # Accumulates errors
+
+    # --- Attempt Primary AI (Gemini) ---
     if _gemini_primary_enabled:
         logger.info(f"[Summary Generation] Attempting primary AI: Gemini ({GEMINI_MODEL})")
+        primary_summary, primary_error = await _call_gemini(text, summary_type)
+        if primary_summary:
+            logger.info(f"[Summary Generation] Success with primary AI (Gemini).")
+            return primary_summary # Return successful primary summary immediately
+        else:
+            logger.warning(f"[Summary Generation] Primary AI (Gemini) failed. Error: {primary_error}. Proceeding to fallback.")
+            error_message = f"Primary AI ({GEMINI_MODEL}) failed: {primary_error}" # Store primary error
     else:
         logger.warning("[Summary Generation] Primary AI (Gemini) disabled. Proceeding to fallback.")
         error_message = "Primary AI (Gemini) unavailable."
 
+    # --- Attempt Fallback AI (OpenRouter) ---
     if _openrouter_fallback_enabled:
         logger.info(f"[Summary Generation] Attempting fallback AI: OpenRouter ({OPENROUTER_MODEL})")
         fallback_summary, fallback_error = await _call_openrouter(text, summary_type)
-        if fallback_summary:
+        if fallback_summary:
+            logger.info(f"[Summary Generation] Success with fallback AI (OpenRouter).")
+            return fallback_summary # Return successful fallback summary
         else:
             logger.error(f"[Summary Generation] Fallback AI (OpenRouter) also failed. Error: {fallback_error}")
+            # Combine errors for final message
+            if error_message: # If primary also failed
+                return f"{error_message}\nFallback AI ({OPENROUTER_MODEL}) also failed: {fallback_error}"
+            else: # Should not happen if logic is correct, but fallback just in case
+                return f"Fallback AI ({OPENROUTER_MODEL}) failed: {fallback_error}"
     else:
         logger.error("[Summary Generation] Fallback AI (OpenRouter) is disabled. Cannot proceed.")
-        if error_message:
+        if error_message: # Primary failed AND fallback disabled
+            return f"{error_message}\nFallback AI is also unavailable."
+        else: # Primary disabled AND fallback disabled
+            return "Error: Both primary and fallback AI services are unavailable."
 
+    # This part should ideally not be reached if the logic above is sound
+    logger.error("[Summary Generation] Reached end of function unexpectedly. No summary generated.")
+    final_error = error_message or "Unknown summary generation error."
+    return f"Sorry, an error occurred: {final_error}"
 
 
 # --- Main Processing Task ---
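(A quick, hedged sketch of driving the primary/fallback chain above on its own; it assumes at least one of GEMINI_API_KEY or OPENROUTER_API_KEY is configured so that a backend is enabled, and the sample text is purely illustrative.)

import asyncio
from main import generate_summary   # hypothetical import path for local testing

sample_text = "Long article text to be summarised..."   # placeholder input
result = asyncio.run(generate_summary(sample_text, "points"))
print(result)   # either the summary or one of the error strings returned above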
@@ -573,7 +820,8 @@ async def process_summary_task(
                 message_id=status_message_id,
                 text=processing_message_text,
                 parse_mode=ParseMode.HTML, # Use HTML for escaped URL
-                reply_markup=None
+                reply_markup=None,
+                link_preview_options={'is_disabled': True} # Disable preview here too
             )
             logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
         except Exception as e:
@@ -586,17 +834,16 @@ async def process_summary_task(
                 bot.send_message,
                 chat_id=chat_id,
                 text=processing_message_text,
-                parse_mode=ParseMode.HTML # Use HTML for escaped URL
+                parse_mode=ParseMode.HTML, # Use HTML for escaped URL
+                link_preview_options={'is_disabled': True}
             )
             if status_message:
                 message_to_delete_later_id = status_message.message_id
                 logger.debug(f"[Task {task_id}] Sent new status message {message_to_delete_later_id}")
             else:
-                # This should ideally be caught by retry_bot_operation raising an error
                 raise RuntimeError("Failed to send status message after retries.")
         except Exception as e:
             logger.error(f"[Task {task_id}] CRITICAL: Failed to send new status message: {e}")
-            # Don't raise here, try to continue if possible, but log critical failure
             user_feedback_message = "Sorry, there was an issue starting the process."
             # Attempt to send final feedback later if possible
 
@@ -626,7 +873,7 @@ async def process_summary_task(
             logger.warning(f"[Task {task_id}] Crawl4AI failed for {url}. Attempting BeautifulSoup (Fallback 1)...")
             try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
             except Exception: pass
-            content = await get_website_content_bs4(url)
+            content = await get_website_content_bs4(url)
 
         if not content:
             logger.warning(f"[Task {task_id}] BeautifulSoup also failed for {url}. Attempting API (Fallback 2)...")
@@ -636,11 +883,18 @@ async def process_summary_task(
                 except Exception: pass
                 content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
                 if not content:
+                    # Check if the specific error was insufficient credits
+                    # Note: get_website_content_via_api already logs the specific error
+                    logger.error(f"[Task {task_id}] API fallback (urltotext) also failed for {url}.")
+                    user_feedback_message = "Sorry, I couldn't fetch content from that website using any method (Crawl4AI/BS4 failed, API failed or ran out of credits)." # Updated message
             else:
                 logger.warning(f"[Task {task_id}] API fallback is disabled. Cannot attempt Fallback 2.")
-                user_feedback_message = "Sorry, I couldn't fetch content from that website using
+                user_feedback_message = "Sorry, I couldn't fetch content from that website using Crawl4AI or BeautifulSoup, and the API fallback is not enabled." # Updated message
+
+        # Final check if all web methods failed
+        if not content and not user_feedback_message:
+            logger.error(f"[Task {task_id}] All website fetching methods seem to have failed without setting a specific user message.")
+            user_feedback_message = "Sorry, I couldn't fetch content from that website using any available method (blocked/inaccessible/empty?)."
 
 
         # --- Generate Summary if Content was Fetched ---
@@ -656,73 +910,107 @@ async def process_summary_task(
                 logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
             else:
                 # Success - Send the summary
-                max_length = 4096 # Telegram's message length limit
                 summary_parts = []
                 current_part = ""
-                # Split respecting
+                # Split respecting newlines, ensure no part exceeds MAX_SUMMARY_CHUNK_SIZE
+                lines = final_summary.splitlines(keepends=True)
+                for line in lines:
+                    # If adding the next line exceeds the limit, finalize the current part
+                    if len(current_part) + len(line) > MAX_SUMMARY_CHUNK_SIZE:
+                        if current_part.strip(): # Don't add empty parts
+                            summary_parts.append(current_part.strip())
+                        current_part = line # Start new part with the current line
+                        # If a single line itself is too long, truncate it (edge case)
+                        if len(current_part) > MAX_SUMMARY_CHUNK_SIZE:
+                            logger.warning(f"[Task {task_id}] Truncating overly long line in summary.")
+                            current_part = current_part[:MAX_SUMMARY_CHUNK_SIZE]
                     else:
+                        current_part += line
+
+                # Add the last part if it has content
+                if current_part.strip():
                     summary_parts.append(current_part.strip())
 
+                # If somehow splitting resulted in nothing (e.g., empty summary initially?)
+                if not summary_parts:
+                    summary_parts.append("Summary generated, but it appears to be empty.")
+                    logger.warning(f"[Task {task_id}] Summary was non-empty initially but splitting resulted in zero parts.")
+
 
+                logger.info(f"[Task {task_id}] Summary generated (orig len: {len(final_summary)}). Sending in {len(summary_parts)} part(s).")
+
+                # Determine the target message ID for the *first* part
+                # Prefer editing the "Processing..." message if we sent a new one
+                edit_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id
                 message_sent = False
+
+                if edit_target_id:
                     try:
                         # Try editing the status message first
                         await retry_bot_operation(
                             bot.edit_message_text,
                             chat_id=chat_id,
-                            message_id=
+                            message_id=edit_target_id,
                             text=summary_parts[0],
                             parse_mode=None, # Send as plain text initially, safer
                             link_preview_options={'is_disabled': True}
                         )
-                        logger.debug(f"[Task {task_id}] Edited message {
-                        # Prevent this message from being deleted later
-                        if message_to_delete_later_id ==
+                        logger.debug(f"[Task {task_id}] Edited message {edit_target_id} with first summary part.")
+                        # Prevent this message from being deleted later if it was the 'Processing...' one
+                        if message_to_delete_later_id == edit_target_id: message_to_delete_later_id = None
+                        # If it was the *original* button message that we are editing, keep status_message_id
+                        # so we know *not* to delete it in finally block if it's the only message left.
+                        # However, it's clearer to just prevent deletion if edited.
+                        if status_message_id == edit_target_id: status_message_id = None # Mark as handled
+
                         message_sent = True
                     except Exception as edit_err:
-                        logger.warning(f"[Task {task_id}] Failed to edit message {
+                        logger.warning(f"[Task {task_id}] Failed to edit message {edit_target_id} with summary: {edit_err}. Sending new message instead.")
                         # If edit fails, fall through to send a new message
 
                 if not message_sent:
+                    # Send the first part as a new message
+                    sent_msg = await retry_bot_operation(
                         bot.send_message,
                         chat_id=chat_id,
                         text=summary_parts[0],
                         parse_mode=None,
                         link_preview_options={'is_disabled': True}
+                    )
+                    if sent_msg:
+                        logger.debug(f"[Task {task_id}] Sent first summary part as new message {sent_msg.message_id}.")
+                    else: # Should be caught by retry, but log defensively
+                        logger.error(f"[Task {task_id}] Failed to send first summary part even as new message.")
+                        user_feedback_message = "Sorry, failed to send the summary." # Set error
+
+
+                # Send remaining parts (if any and first part succeeded)
+                if not user_feedback_message and len(summary_parts) > 1:
+                    for i, part in enumerate(summary_parts[1:], start=2):
+                        await asyncio.sleep(0.5) # Small delay between parts
+                        try:
+                            await retry_bot_operation(
+                                bot.send_message,
+                                chat_id=chat_id,
+                                text=part,
+                                parse_mode=None,
+                                link_preview_options={'is_disabled': True}
+                            )
+                            logger.debug(f"[Task {task_id}] Sent summary part {i}/{len(summary_parts)}.")
+                        except Exception as part_err:
+                            logger.error(f"[Task {task_id}] Failed to send summary part {i}: {part_err}")
+                            user_feedback_message = f"Sorry, failed to send part {i} of the summary."
+                            # Should we stop sending further parts? Yes.
+                            break # Stop sending remaining parts
+
+                # Determine overall success based on whether feedback message is set
+                if not user_feedback_message:
+                    success = True
+                    # user_feedback_message = None # Clear feedback message ONLY on full success
 
         # --- Handle Cases Where No Content Was Fetched or Summary Failed ---
-        if user_feedback_message:
-            logger.warning(f"[Task {task_id}] Sending failure feedback to user: {user_feedback_message}")
+        if user_feedback_message: # Check if any error occurred
+            logger.warning(f"[Task {task_id}] Sending failure/error feedback to user: {user_feedback_message}")
         try:
             # Try editing the status message first
             feedback_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id
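(The splitting loop above is easier to follow as a standalone helper; this is a sketch of equivalent logic under the same MAX_SUMMARY_CHUNK_SIZE idea, not code taken from the diff.)

def split_for_telegram(summary: str, limit: int = 4000) -> list:
    """Split a summary into chunks below Telegram's message limit, preferring newline boundaries."""
    parts, current = [], ""
    for line in summary.splitlines(keepends=True):
        if len(current) + len(line) > limit:
            if current.strip():
                parts.append(current.strip())
            current = line[:limit]   # start a new part; truncate a single overlong line
        else:
            current += line
    if current.strip():
        parts.append(current.strip())
    return parts or ["Summary generated, but it appears to be empty."]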
@@ -738,8 +1026,9 @@ async def process_summary_task(
                 reply_markup=None # Remove buttons
             )
             logger.debug(f"[Task {task_id}] Edited message {feedback_target_id} with failure feedback.")
+            # Prevent deletion if edited
             if message_to_delete_later_id == feedback_target_id: message_to_delete_later_id = None
+            if status_message_id == feedback_target_id: status_message_id = None
             message_sent = True
         except Exception as edit_err:
             logger.warning(f"[Task {task_id}] Failed to edit message {feedback_target_id} with failure feedback: {edit_err}. Sending new message instead.")
@@ -759,27 +1048,34 @@ async def process_summary_task(
         # Catch-all for unexpected errors during the main processing logic
         logger.error(f"[Task {task_id}] Unexpected error during processing: {e}", exc_info=True)
         user_feedback_message = "Oops! Something went wrong while processing your request. Please try again later."
+        if bot: # Ensure bot exists before trying to send
+            try:
+                # Attempt to send a final error message
+                await retry_bot_operation(
+                    bot.send_message,
+                    chat_id=chat_id,
+                    text=user_feedback_message
+                )
+            except Exception as final_err:
+                logger.error(f"[Task {task_id}] Failed to send the final unexpected error feedback: {final_err}")
 
     finally:
         # --- Cleanup ---
-        # Delete the "Processing..."
-        if delete_target_id and bot:
+        # Delete the temporary "Processing..." message if it exists and wasn't edited/handled
+        if message_to_delete_later_id and bot:
             try:
-                await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=
-                logger.debug(f"[Task {task_id}] Deleted
             except Exception as del_e:
 
         # Close the background bot's HTTP client
         if background_request and hasattr(background_request, '_client') and background_request._client:
@@ -794,8 +1090,7 @@
 
 # --- Telegram Handlers ---
 # (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler)
-# These
-# The core logic change is within process_summary_task.
 
 async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
     # ... (Keep existing implementation) ...
@@ -809,15 +1104,16 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
     user = update.effective_user
     if not user or not update.message: return
     logger.info(f"User {user.id} ({user.username or 'N/A'}) used /help.")
     help_text = ( "π **How to use:**\n\n"
                   "1. Send me any YouTube video link or website URL.\n"
                   "2. I'll ask how you want it summarised (paragraph or points).\n"
                   "3. Click the button for your choice.\n"
                   "4. Wait for the summary!\n\n"
                   "βοΈ **Behind the scenes:**\n"
-                  "β’ **Websites:** I
-                  "β’ **YouTube:** I use `youtube-transcript-api` first, then fall back to `Supadata` and `Apify` APIs if
-                  "β’ **Summaries:** Generated using Google `
                   "**Commands:**\n"
                   "`/start` - Display welcome message\n"
                   "`/help` - Show this help message" )
@@ -830,10 +1126,9 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
     if not user: return
     # Basic URL validation
     if not (url.startswith('http://') or url.startswith('https://')) or '.' not in url[8:]:
-        # Maybe add a reply here? "Please send a valid URL starting with http:// or https://"
         logger.debug(f"Ignoring non-URL from {user.id}: {url}")
         # Optionally reply to the user that it doesn't look like a valid URL
         return
     logger.info(f"User {user.id} ({user.username or 'N/A'}) sent potential URL: {url}")
     # Store URL and original message ID in user_data
@@ -850,7 +1145,7 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
     if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
     user = query.from_user; summary_type = query.data; query_id = query.id
     try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id} ({user.username or 'N/A'})")
-    except Exception as e: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True)
 
     url = context.user_data.get('url_to_summarize')
     message_id_to_edit = query.message.message_id # The message with the buttons
@@ -859,31 +1154,27 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
     if not url:
         logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Expired?")
         try:
-            # Edit the message where the button was clicked
             await query.edit_message_text(text="Sorry, I seem to have lost the context for that link. π€ Please send the URL again.", reply_markup=None)
         except BadRequest as e:
             if "message is not modified" in str(e).lower(): pass # Ignore if text is the same
             else: logger.error(f"Failed edit 'URL not found' msg: {e}")
-        except Exception as e:
-            # Do not proceed further
-            return
 
-    # Clear context *after* successfully
-    # context.user_data.pop('url_to_summarize', None)
-    # context.user_data.pop('original_message_id', None)
-    # logger.debug(f"Cleared URL context for user {user.id}") # Moved clearing to after task creation
 
     # Check necessary configurations before scheduling
     global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
     if not TELEGRAM_TOKEN:
         logger.critical("TG TOKEN missing! Cannot schedule task.")
-        try: await query.edit_message_text(text="β Bot configuration error (Token Missing). Cannot proceed.")
         except Exception: pass
         return
     if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
         logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid! Cannot summarize.")
-        try: await query.edit_message_text(text="β AI configuration error: No summarization models are available. Cannot proceed.")
         except Exception: pass
         return
     # Log warnings if one model is missing, but proceed if at least one is available
@@ -899,23 +1190,34 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
             message_id_to_edit=message_id_to_edit, # Pass the ID of the message with buttons
             url=url,
             summary_type=summary_type,
-            bot_token=TELEGRAM_TOKEN
         ),
         name=f"SummaryTask-{user.id}-{message_id_to_edit}"
     )
 
-    # Clear context AFTER scheduling the task to prevent race conditions
     context.user_data.pop('url_to_summarize', None)
     context.user_data.pop('original_message_id', None)
    logger.debug(f"Cleared URL context for user {user.id} after scheduling task.")
 
 async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
     # ... (Keep existing implementation) ...
     if isinstance(context.error, ignore_errors):
-        ignore_messages = ["message is not modified", "query is too old", "message to edit not found"]
         return
     logger.error("Exception while handling an update:", exc_info=context.error)
     # Consider notifying the user about unexpected errors if appropriate and possible
@@ -935,7 +1237,9 @@ async def setup_bot_config() -> Application:
     # Add Handlers
     application.add_handler(CommandHandler("start", start))
     application.add_handler(CommandHandler("help", help_command))
     application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
     # Error Handler
     application.add_error_handler(error_handler)
@@ -1004,8 +1308,12 @@ async def lifespan(app: Starlette):
         logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True)
         raise RuntimeError(f"Failed to set webhook: {e}") from e
     else:
 
     logger.info("ASGI Lifespan: Startup complete.");
     yield # Application runs here
@@ -1040,30 +1348,38 @@ async def lifespan(app: Starlette):
 
 
 async def health_check(request: Request) -> PlainTextResponse:
-    # ... (Keep existing implementation,
-    global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled, _crawl4ai_primary_web_enabled, _urltotext_fallback_enabled
     bot_status = "Not Initialized"
     bot_username = "N/A"
-    if ptb_app and ptb_app.bot:
         try:
             bot_info = await ptb_app.bot.get_me()
             bot_username = f"@{bot_info.username}"
-            bot_status = f"Running ({bot_username})"
         else: bot_status = "Initialized/Not running"
         except Exception as e: bot_status = f"Error checking status: {e}"
 
     health_info = [
-        f"
-        f"
        f"Fallback Web Scraper 1: BeautifulSoup",
-        f"Fallback Web Scraper 2: {'urltotext.com API' if _urltotext_fallback_enabled else '
-        f"Primary Summarizer: {'Gemini (' + GEMINI_MODEL + ')' if _gemini_primary_enabled else '
-        f"Fallback Summarizer: {'OpenRouter (' + OPENROUTER_MODEL + ')' if _openrouter_fallback_enabled else '
        f"Primary YT Transcript: youtube-transcript-api",
-        f"Fallback YT Transcript 1: {'Supadata API' if SUPADATA_API_KEY else '
-        f"Fallback YT Transcript 2: {'Apify (' + APIFY_ACTOR_ID + ')' if _apify_token_exists else '
    ]
    return PlainTextResponse("\n".join(health_info))
 
@@ -1124,6 +1440,18 @@ if __name__ == '__main__':
     log_level = os.environ.get("LOGGING_LEVEL", "info").lower()
     # Use the PORT env var for local running too, defaulting to 8080
     local_port = int(os.environ.get('PORT', 8080))
     uvicorn.run(
         "main:app",
         host='0.0.0.0',
53 |
# --- Google Gemini ---
|
54 |
try:
|
55 |
import google.generativeai as genai
|
56 |
+
from google.generativeai.types import HarmCategory, HarmBlockThreshold, GenerateContentResponse
|
57 |
_gemini_available = True
|
58 |
except ImportError:
|
59 |
genai = None
|
60 |
HarmCategory = None
|
61 |
HarmBlockThreshold = None
|
62 |
+
GenerateContentResponse = None # Add this for type hinting if needed
|
63 |
_gemini_available = False
|
64 |
# logger defined later
|
65 |
|
|
|
112 |
# Models (User can still configure via env vars)
|
113 |
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Fallback Model
|
114 |
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
|
115 |
+
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-1.5-flash-latest") # Use the 1.5 flash model directly
|
116 |
+
# Using gemini-1.5-flash-latest is generally recommended over gemini-2.0-flash-001
|
117 |
|
118 |
# --- Configuration Checks ---
|
119 |
if not TELEGRAM_TOKEN: logger.critical("β FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
|
|
|
157 |
logger.error(f"Failed to configure Google GenAI client: {e}")
|
158 |
_gemini_primary_enabled = False
|
159 |
|
160 |
+
# --- Constants ---
|
161 |
+
MAX_SUMMARY_CHUNK_SIZE = 4000 # Max characters per Telegram message (allow buffer)
|
162 |
+
MAX_INPUT_TOKEN_APPROX = 1000000 # Gemini 1.5 Flash context window (approx chars) - adjust if needed
|
163 |
+
|
164 |
# --- Retry Decorator ---
|
165 |
# (Remains the same)
|
166 |
@retry(
|
|
|
339 |
return None
|
340 |
|
341 |
logger.info(f"[Crawl4AI Primary] Attempting to crawl URL: {url}")
|
342 |
+
# Define a writable cache directory (use /tmp in container environments)
|
343 |
+
# Create the directory path beforehand to avoid potential race conditions or permission issues within the library
|
344 |
+
cache_dir_path = "/tmp/.crawl4ai" # CHANGED: Use /tmp
|
345 |
+
try:
|
346 |
+
os.makedirs(cache_dir_path, exist_ok=True)
|
347 |
+
logger.info(f"[Crawl4AI Primary] Ensured cache directory exists: {cache_dir_path}")
|
348 |
+
except OSError as e:
|
349 |
+
logger.error(f"[Crawl4AI Primary] Failed to create cache directory {cache_dir_path}: {e}. Crawl may fail.")
|
350 |
+
# Don't return here, let the crawler try anyway, it might handle it internally or use default
|
351 |
+
except Exception as e:
|
352 |
+
logger.error(f"[Crawl4AI Primary] Unexpected error creating cache directory {cache_dir_path}: {e}")
|
353 |
+
|
354 |
+
|
355 |
try:
|
356 |
+
# Use AsyncWebCrawler context manager with explicit cache_dir
|
357 |
+
# NOTE: Pass cache_dir here if the library supports it via __init__ or a config object
|
358 |
+
# Checking crawl4ai docs/source, AsyncWebCrawler doesn't directly take cache_dir in __init__.
|
359 |
+
# It seems to rely on environment variables or default home resolution.
|
360 |
+
# The PermissionError happens in RobotsParser -> get_home_folder -> os.makedirs.
|
361 |
+
# WORKAROUND: We might need to adjust the environment or hope setting HOME=/app in Dockerfile is enough
|
362 |
+
# *if* the library correctly uses HOME. Let's test *without* explicit cache_dir first,
|
363 |
+
# relying on HOME=/app and the prior os.makedirs call. If it still fails, we need a different approach.
|
364 |
+
|
365 |
+
# UPDATE: The traceback shows it uses utils.get_home_folder(). Let's stick with HOME=/app for now
|
366 |
+
# and see if the permission error was transient or specific to the '.models' subdir.
|
367 |
+
# If it persists, we might need to fork/modify crawl4ai or find another way to configure its paths.
|
368 |
+
|
369 |
+
# Let's *try* passing cache_dir anyway, maybe it's an undocumented/newer feature
|
370 |
+
async with AsyncWebCrawler(cache_dir=cache_dir_path) as crawler: # TRY passing cache_dir
|
371 |
+
logger.info(f"[Crawl4AI Primary] Initialized with explicit cache_dir: {cache_dir_path}")
|
372 |
# Use arun for a single URL crawl
|
|
|
|
|
373 |
result = await crawler.arun(url=url, crawler_strategy="playwright", timeout=90) # 90 sec timeout
|
374 |
|
375 |
if result and result.markdown:
|
|
|
394 |
except asyncio.TimeoutError:
|
395 |
logger.error(f"[Crawl4AI Primary] Timeout occurred while crawling {url}")
|
396 |
return None
|
397 |
+
except PermissionError as e: # Catch the specific error
|
398 |
+
logger.error(f"[Crawl4AI Primary] Permission denied during crawl for {url}. Likely filesystem issue in container. Error: {e}", exc_info=True)
|
399 |
+
return None # Fail gracefully for this method
|
400 |
except Exception as e:
|
401 |
+
# Log type error if cache_dir isn't accepted
|
402 |
+
if "unexpected keyword argument 'cache_dir'" in str(e):
|
403 |
+
logger.error(f"[Crawl4AI Primary] AsyncWebCrawler does not accept 'cache_dir'. Remove this argument. Error: {e}")
|
404 |
+
else:
|
405 |
+
logger.error(f"[Crawl4AI Primary] Unexpected error during crawl for {url}: {e}", exc_info=True)
|
406 |
return None
|
407 |
|
408 |
|
|
|
418 |
response.raise_for_status()
|
419 |
content_type = response.headers.get('content-type', '').lower()
|
420 |
if 'html' not in content_type: logger.warning(f"[Web Scrape BS4] Non-HTML content type from {url}: {content_type}"); return None
|
421 |
+
try: return response.text # Use response.text to let httpx handle decoding
|
422 |
+
except Exception as e: logger.error(f"[Web Scrape BS4] Error getting response text for {url}: {e}"); return None
|
423 |
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape BS4] HTTP error {e.response.status_code} fetching {url}: {e}")
|
424 |
except httpx.TimeoutException: logger.error(f"[Web Scrape BS4] Timeout error fetching {url}")
|
425 |
except httpx.TooManyRedirects: logger.error(f"[Web Scrape BS4] Too many redirects fetching {url}")
|
|
|
429 |
|
430 |
async def get_website_content_bs4(url: str) -> Optional[str]:
|
431 |
"""Fetches and parses website content using BeautifulSoup (Fallback 1)."""
|
432 |
+
# ... (Keep existing implementation) ...
|
433 |
if not url: logger.error("[BS4 Fallback] get_website_content_bs4: No URL"); return None
|
434 |
logger.info(f"[BS4 Fallback] Attempting basic fetch & parse for: {url}")
|
435 |
html_content = await fetch_url_content_for_scrape(url)
|
|
|
442 |
def parse_html(content):
|
443 |
soup = BeautifulSoup(content, DEFAULT_PARSER)
|
444 |
# Remove common non-content elements
|
445 |
+
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "video", "audio"]):
|
446 |
element.extract()
|
447 |
+
# Try to find main content areas more broadly
|
448 |
+
selectors = ['main', 'article', '[role="main"]', '#content', '.content', '#main-content', '.main-content', '#body', '.body', '#article-body', '.article-body']
|
449 |
+
target_element = None
|
450 |
+
for selector in selectors:
|
451 |
+
target_element = soup.select_one(selector)
|
452 |
+
if target_element: break
|
453 |
+
|
454 |
+
if not target_element: target_element = soup.body # Fallback to body
|
455 |
+
if not target_element: logger.warning(f"[BS4 Fallback] Could not find body/main for parsing {url}"); return None
|
456 |
+
|
457 |
+
# Extract text, clean up whitespace aggressively
|
458 |
lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
|
459 |
+
text = " ".join(lines) # Join lines with spaces
|
460 |
+
|
461 |
+
# Basic post-cleaning
|
462 |
+
text = re.sub(r'\s{2,}', ' ', text).strip() # Replace multiple spaces with single space
|
463 |
+
|
464 |
+
if not text: logger.warning(f"[BS4 Fallback] Extracted text is empty after cleaning for {url}"); return None
|
465 |
return text
|
466 |
|
467 |
text_content = await asyncio.to_thread(parse_html, html_content)
|
|
|
478 |
# Fallback 2: urltotext.com API
|
479 |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
480 |
"""Fetches website content using urltotext.com API (Fallback 2)."""
|
481 |
+
# ... (Keep existing implementation) ...
|
482 |
if not url: logger.error("[API Fallback] No URL"); return None
|
483 |
if not api_key: logger.error("[API Fallback] urltotext.com API key missing."); return None
|
484 |
logger.info(f"[API Fallback] Attempting fetch for: {url} using urltotext.com API")
|
|
|
499 |
else: logger.warning(f"[API Fallback] urltotext.com API success but content empty for {url}. Resp: {data}"); return None
|
500 |
except json.JSONDecodeError: logger.error(f"[API Fallback] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None
|
501 |
except Exception as e: logger.error(f"[API Fallback] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None
|
502 |
+
elif response.status_code == 402: # Specifically handle insufficient credits
|
503 |
+
logger.error(f"[API Fallback] Error 402 (Insufficient Credits) from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
504 |
+
elif response.status_code in [400, 401, 403, 422, 500]: logger.error(f"[API Fallback] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
505 |
else: logger.error(f"[API Fallback] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
|
506 |
except httpx.TimeoutException: logger.error(f"[API Fallback] Timeout connecting to urltotext.com API for {url}"); return None
|
507 |
except httpx.RequestError as e: logger.error(f"[API Fallback] Request error connecting to urltotext.com API for {url}: {e}"); return None
|
508 |
except Exception as e: logger.error(f"[API Fallback] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
|
509 |
|
510 |
# --- Summarization Functions ---
|
|
|
511 |
async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
|
512 |
+
""" Calls the Google Gemini API to generate a summary. """
|
513 |
global GEMINI_MODEL, _gemini_primary_enabled
|
514 |
if not _gemini_primary_enabled:
|
515 |
logger.error("[Gemini Primary] Called but is disabled.");
|
516 |
return None, "Error: Primary AI service (Gemini) not configured/available."
|
517 |
+
|
518 |
+
# Truncate input text if it exceeds the approximate limit
|
519 |
+
if len(text) > MAX_INPUT_TOKEN_APPROX:
|
520 |
+
logger.warning(f"[Gemini Primary] Input text length ({len(text)}) exceeds limit ({MAX_INPUT_TOKEN_APPROX}). Truncating.")
|
521 |
+
text = text[:MAX_INPUT_TOKEN_APPROX]
|
522 |
+
|
523 |
logger.info(f"[Gemini Primary] Generating {summary_type} summary using {GEMINI_MODEL}. Input length: {len(text)}")
|
|
|
|
|
|
|
|
|
|
|
524 |
|
525 |
+
# Define prompts
|
526 |
+
if summary_type == "paragraph":
|
527 |
+
prompt = f"""Please summarise the following text into a concise paragraph. Focus on the main points and key information. Avoid unnecessary jargon or overly complex sentences.
|
528 |
+
|
529 |
+
Text to summarise:
|
530 |
+
---
|
531 |
+
{text}
|
532 |
+
---
|
533 |
+
|
534 |
+
Concise Paragraph Summary:"""
|
535 |
+
elif summary_type == "points":
|
536 |
+
prompt = f"""Please summarise the following text into a list of key bullet points. Each point should capture a distinct main idea or important piece of information. Aim for clarity and conciseness.
|
537 |
+
|
538 |
+
Text to summarise:
|
539 |
+
---
|
540 |
+
{text}
|
541 |
+
---
|
542 |
+
|
543 |
+
Key Bullet Points Summary:"""
|
544 |
+
else:
|
545 |
+
logger.error(f"[Gemini Primary] Invalid summary_type: {summary_type}")
|
546 |
+
return None, f"Error: Invalid summary type '{summary_type}' specified."
|
547 |
+
|
548 |
+
# Configure safety settings (adjust as needed)
|
549 |
+
safety_settings = {
|
550 |
+
HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
|
551 |
+
HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
|
552 |
+
HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
|
553 |
+
HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
|
554 |
+
}
|
555 |
+
|
556 |
+
# Configure generation settings (optional)
|
557 |
+
generation_config = genai.types.GenerationConfig(
|
558 |
+
# candidate_count=1, # Default is 1
|
559 |
+
# stop_sequences=["\n"],
|
560 |
+
max_output_tokens=2048, # Increased max tokens for potentially longer summaries from large inputs
|
561 |
+
temperature=0.7, # Adjust creativity vs factualness
|
562 |
+
# top_p=1.0, # Default
|
563 |
+
# top_k=None # Default
|
564 |
+
)
|
565 |
+
|
566 |
+
try:
|
567 |
+
model = genai.GenerativeModel(GEMINI_MODEL)
|
568 |
+
logger.debug(f"[Gemini Primary] Sending request to model {GEMINI_MODEL}")
|
569 |
+
response: GenerateContentResponse = await model.generate_content_async( # Use async version
|
570 |
+
prompt,
|
571 |
+
generation_config=generation_config,
|
572 |
+
safety_settings=safety_settings
|
573 |
+
)
|
574 |
+
logger.debug(f"[Gemini Primary] Received response. Finish reason: {response.candidates[0].finish_reason if response.candidates else 'N/A'}")
|
575 |
+
|
576 |
+
# Check for safety blocks or other issues in response
|
577 |
+
if not response.candidates:
|
578 |
+
block_reason = response.prompt_feedback.block_reason if hasattr(response, 'prompt_feedback') else 'Unknown'
|
579 |
+
error_msg = f"Error: Gemini response blocked or empty. Reason: {block_reason}"
|
580 |
+
logger.error(f"[Gemini Primary] {error_msg}")
|
581 |
+
return None, error_msg
|
582 |
+
|
583 |
+
# Check finish reason (e.g., MAX_TOKENS, SAFETY)
|
584 |
+
finish_reason = response.candidates[0].finish_reason
|
585 |
+
if finish_reason != genai.types.FinishReason.STOP and finish_reason != genai.types.FinishReason.MAX_TOKENS:
|
586 |
+
# Log safety ratings if available
|
587 |
+
safety_ratings_str = "N/A"
|
588 |
+
if hasattr(response.candidates[0], 'safety_ratings'):
|
589 |
+
safety_ratings_str = ', '.join([f"{r.category.name}: {r.probability.name}" for r in response.candidates[0].safety_ratings])
|
590 |
+
error_msg = f"Error: Gemini generation finished unexpectedly. Reason: {finish_reason.name}. Safety: {safety_ratings_str}"
|
591 |
+
logger.error(f"[Gemini Primary] {error_msg}")
|
592 |
+
# Return partial text if available and finish reason is MAX_TOKENS? Maybe not, could be truncated badly.
|
593 |
+
# If SAFETY, definitely return error.
|
594 |
+
if finish_reason == genai.types.FinishReason.SAFETY:
|
595 |
+
return None, error_msg # Return specific error for safety blocks
|
596 |
+
# For other reasons, maybe return partial, but safer to return error for now
|
597 |
+
# return response.text if hasattr(response, 'text') else None, error_msg # Optional: return partial text for RECITATION/OTHER
|
598 |
+
return None, f"Error: Gemini generation finished unexpectedly ({finish_reason.name})."
|
599 |
+
|
600 |
+
|
601 |
+
# Extract text
|
602 |
+
summary_text = response.text
|
603 |
+
if not summary_text or not summary_text.strip():
|
604 |
+
logger.warning("[Gemini Primary] Gemini returned an empty summary.")
|
605 |
+
return None, "Error: AI generated an empty summary."
|
606 |
+
|
607 |
+
logger.info(f"[Gemini Primary] Summary generated successfully (len: {len(summary_text)}).")
|
608 |
+
return summary_text.strip(), None
|
609 |
+
|
610 |
+
except Exception as e:
|
611 |
+
logger.error(f"[Gemini Primary] Error during API call to {GEMINI_MODEL}: {e}", exc_info=True)
|
612 |
+
# Check for specific Google API errors if needed
|
613 |
+
# from google.api_core import exceptions as google_exceptions
|
614 |
+
# if isinstance(e, google_exceptions.GoogleAPIError): ...
|
615 |
+
return None, f"Error: Failed to communicate with the primary AI service (Gemini). Details: {e}"
|
616 |
|
617 |
|
618 |
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
|
619 |
+
""" Calls the OpenRouter API to generate a summary. """
|
620 |
global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
|
621 |
if not _openrouter_fallback_enabled:
|
622 |
logger.error("[OpenRouter Fallback] Called but is disabled.");
|
623 |
return None, "Error: Fallback AI service (OpenRouter) not configured/available."
|
624 |
+
|
625 |
+
# OpenRouter models might have smaller context windows, truncate more aggressively if needed
|
626 |
+
# Example: 32k tokens ~ 120k chars. Deepseek is large though. Check model specifics if issues arise.
|
627 |
+
max_input_len_openrouter = 100000 # Adjust based on OPENROUTER_MODEL limits if known
|
628 |
+
if len(text) > max_input_len_openrouter:
|
629 |
+
logger.warning(f"[OpenRouter Fallback] Input text length ({len(text)}) exceeds approx limit ({max_input_len_openrouter}) for {OPENROUTER_MODEL}. Truncating.")
|
630 |
+
text = text[:max_input_len_openrouter]
|
631 |
+
|
632 |
logger.info(f"[OpenRouter Fallback] Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
|
|
|
|
|
|
|
|
|
|
|
633 |
|
634 |
+
# Define prompts (similar structure to Gemini)
|
635 |
+
if summary_type == "paragraph":
|
636 |
+
prompt_content = f"""Please summarise the following text into a concise paragraph. Focus on the main points and key information.
|
637 |
+
|
638 |
+
Text:
|
639 |
+
---
|
640 |
+
{text}
|
641 |
+
---
|
642 |
+
|
643 |
+
Concise Paragraph Summary:"""
|
644 |
+
elif summary_type == "points":
|
645 |
+
prompt_content = f"""Please summarise the following text into a list of key bullet points. Each point should capture a distinct main idea.
|
646 |
+
|
647 |
+
Text:
|
648 |
+
---
|
649 |
+
{text}
|
650 |
+
---
|
651 |
+
|
652 |
+
Key Bullet Points Summary:"""
|
653 |
+
else:
|
654 |
+
logger.error(f"[OpenRouter Fallback] Invalid summary_type: {summary_type}")
|
655 |
+
return None, f"Error: Invalid summary type '{summary_type}' specified."
|
656 |
+
|
657 |
+
headers = {
|
658 |
+
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
|
659 |
+
"Content-Type": "application/json",
|
660 |
+
"HTTP-Referer": "https://github.com/fmab777/telegram-summary-bot", # Optional: Identify your app
|
661 |
+
"X-Title": "Telegram Summary Bot", # Optional: Identify your app
|
662 |
+
}
|
663 |
+
payload = {
|
664 |
+
"model": OPENROUTER_MODEL,
|
665 |
+
"messages": [
|
666 |
+
{"role": "system", "content": "You are an expert summarizer. Provide summaries as requested."},
|
667 |
+
{"role": "user", "content": prompt_content}
|
668 |
+
],
|
669 |
+
"max_tokens": 2048, # Adjust as needed
|
670 |
+
"temperature": 0.7,
|
671 |
+
}
|
672 |
+
|
673 |
+
api_url = "https://openrouter.ai/api/v1/chat/completions"
|
674 |
+
|
675 |
+
try:
|
676 |
+
async with httpx.AsyncClient(timeout=120.0) as client: # Longer timeout for potentially slower models
|
677 |
+
logger.debug(f"[OpenRouter Fallback] Sending request to {api_url} for model {OPENROUTER_MODEL}")
|
678 |
+
response = await client.post(api_url, headers=headers, json=payload)
|
679 |
+
logger.debug(f"[OpenRouter Fallback] Received status code {response.status_code}")
|
680 |
+
|
681 |
+
if response.status_code == 200:
|
682 |
+
try:
|
683 |
+
data = response.json()
|
684 |
+
if data.get("choices") and len(data["choices"]) > 0:
|
685 |
+
message = data["choices"][0].get("message")
|
686 |
+
if message and message.get("content"):
|
687 |
+
summary_text = message["content"].strip()
|
688 |
+
if summary_text:
|
689 |
+
finish_reason = data["choices"][0].get("finish_reason", "N/A")
|
690 |
+
logger.info(f"[OpenRouter Fallback] Summary generated successfully (len: {len(summary_text)}). Finish: {finish_reason}")
|
691 |
+
# Check for length finish reason?
|
692 |
+
if finish_reason == 'length':
|
693 |
+
logger.warning("[OpenRouter Fallback] Summary may be truncated due to max_tokens limit.")
|
694 |
+
return summary_text, None
|
695 |
+
else:
|
696 |
+
logger.warning("[OpenRouter Fallback] OpenRouter returned an empty summary content.")
|
697 |
+
return None, "Error: Fallback AI generated an empty summary."
|
698 |
+
else:
|
699 |
+
logger.error(f"[OpenRouter Fallback] Invalid response structure (missing message/content). Data: {data}")
|
700 |
+
return None, "Error: Fallback AI returned an invalid response format."
|
701 |
+
else:
|
702 |
+
logger.error(f"[OpenRouter Fallback] Invalid response structure (missing choices). Data: {data}")
|
703 |
+
# Check for error object in response
|
704 |
+
api_error = data.get("error", {}).get("message", "Unknown API error")
|
705 |
+
return None, f"Error: Fallback AI response missing summary. API msg: {api_error}"
|
706 |
+
|
707 |
+
except json.JSONDecodeError:
|
708 |
+
logger.error(f"[OpenRouter Fallback] Failed to decode JSON response. Status: {response.status_code}, Text: {response.text[:500]}")
|
709 |
+
return None, "Error: Fallback AI sent an invalid JSON response."
|
710 |
+
except Exception as e:
|
711 |
+
logger.error(f"[OpenRouter Fallback] Error processing success response: {e}", exc_info=True)
|
712 |
+
return None, f"Error: Failed to process Fallback AI response. Details: {e}"
|
713 |
+
|
714 |
+
else:
|
715 |
+
# Handle API errors (rate limits, auth, etc.)
|
716 |
+
error_message = f"Error: Fallback AI service ({OPENROUTER_MODEL}) returned status {response.status_code}."
|
717 |
+
try:
|
718 |
+
error_details = response.json().get("error", {}).get("message", response.text[:200])
|
719 |
+
error_message += f" Details: {error_details}"
|
720 |
+
except Exception:
|
721 |
+
error_message += f" Response: {response.text[:200]}"
|
722 |
+
logger.error(f"[OpenRouter Fallback] {error_message}")
|
723 |
+
return None, error_message
|
724 |
+
|
725 |
+
except httpx.TimeoutException:
|
726 |
+
logger.error(f"[OpenRouter Fallback] Timeout connecting to OpenRouter API for {OPENROUTER_MODEL}")
|
727 |
+
return None, "Error: Timed out connecting to the fallback AI service."
|
728 |
+
except httpx.RequestError as e:
|
729 |
+
logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}")
|
730 |
+
return None, f"Error: Network error connecting to the fallback AI service. Details: {e}"
|
731 |
+
except Exception as e:
|
732 |
+
logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter API call: {e}", exc_info=True)
|
733 |
+
return None, f"Error: Unexpected issue with the fallback AI service. Details: {e}"
|
734 |
|
735 |
|
736 |
async def generate_summary(text: str, summary_type: str) -> str:
|
737 |
+
""" Generates a summary using the primary AI (Gemini) and falling back to OpenRouter. """
|
738 |
global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
|
739 |
logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})")
|
740 |
+
final_summary: Optional[str] = None
|
741 |
+
error_message: Optional[str] = None # Accumulates errors
|
742 |
+
|
743 |
+
# --- Attempt Primary AI (Gemini) ---
|
744 |
if _gemini_primary_enabled:
|
745 |
logger.info(f"[Summary Generation] Attempting primary AI: Gemini ({GEMINI_MODEL})")
|
746 |
+
primary_summary, primary_error = await _call_gemini(text, summary_type)
|
747 |
+
if primary_summary:
|
748 |
+
logger.info(f"[Summary Generation] Success with primary AI (Gemini).")
|
749 |
+
return primary_summary # Return successful primary summary immediately
|
750 |
+
else:
|
751 |
+
logger.warning(f"[Summary Generation] Primary AI (Gemini) failed. Error: {primary_error}. Proceeding to fallback.")
|
752 |
+
error_message = f"Primary AI ({GEMINI_MODEL}) failed: {primary_error}" # Store primary error
|
753 |
else:
|
754 |
logger.warning("[Summary Generation] Primary AI (Gemini) disabled. Proceeding to fallback.")
|
755 |
error_message = "Primary AI (Gemini) unavailable."
|
756 |
|
757 |
+
# --- Attempt Fallback AI (OpenRouter) ---
|
758 |
if _openrouter_fallback_enabled:
|
759 |
logger.info(f"[Summary Generation] Attempting fallback AI: OpenRouter ({OPENROUTER_MODEL})")
|
760 |
fallback_summary, fallback_error = await _call_openrouter(text, summary_type)
|
761 |
+
if fallback_summary:
|
762 |
+
logger.info(f"[Summary Generation] Success with fallback AI (OpenRouter).")
|
763 |
+
return fallback_summary # Return successful fallback summary
|
764 |
else:
|
765 |
logger.error(f"[Summary Generation] Fallback AI (OpenRouter) also failed. Error: {fallback_error}")
|
766 |
+
# Combine errors for final message
|
767 |
+
if error_message: # If primary also failed
|
768 |
+
return f"{error_message}\nFallback AI ({OPENROUTER_MODEL}) also failed: {fallback_error}"
|
769 |
+
else: # Should not happen if logic is correct, but fallback just in case
|
770 |
+
return f"Fallback AI ({OPENROUTER_MODEL}) failed: {fallback_error}"
|
771 |
else:
|
772 |
logger.error("[Summary Generation] Fallback AI (OpenRouter) is disabled. Cannot proceed.")
|
773 |
+
if error_message: # Primary failed AND fallback disabled
|
774 |
+
return f"{error_message}\nFallback AI is also unavailable."
|
775 |
+
else: # Primary disabled AND fallback disabled
|
776 |
+
return "Error: Both primary and fallback AI services are unavailable."
|
777 |
|
778 |
+
# This part should ideally not be reached if the logic above is sound
|
779 |
+
logger.error("[Summary Generation] Reached end of function unexpectedly. No summary generated.")
|
780 |
+
final_error = error_message or "Unknown summary generation error."
|
781 |
+
return f"Sorry, an error occurred: {final_error}"
|
782 |
|
783 |
|
784 |
# --- Main Processing Task ---
|
|
|
820 |
message_id=status_message_id,
|
821 |
text=processing_message_text,
|
822 |
parse_mode=ParseMode.HTML, # Use HTML for escaped URL
|
823 |
+
reply_markup=None,
|
824 |
+
link_preview_options={'is_disabled': True} # Disable preview here too
|
825 |
)
|
826 |
logger.debug(f"[Task {task_id}] Edited message {status_message_id} to 'Processing'")
|
827 |
except Exception as e:
|
|
|
834 |
bot.send_message,
|
835 |
chat_id=chat_id,
|
836 |
text=processing_message_text,
|
837 |
+
parse_mode=ParseMode.HTML, # Use HTML for escaped URL
|
838 |
+
link_preview_options={'is_disabled': True}
|
839 |
)
|
840 |
if status_message:
|
841 |
message_to_delete_later_id = status_message.message_id
|
842 |
logger.debug(f"[Task {task_id}] Sent new status message {message_to_delete_later_id}")
|
843 |
else:
|
|
|
844 |
raise RuntimeError("Failed to send status message after retries.")
|
845 |
except Exception as e:
|
846 |
logger.error(f"[Task {task_id}] CRITICAL: Failed to send new status message: {e}")
|
|
|
847 |
user_feedback_message = "Sorry, there was an issue starting the process."
|
848 |
# Attempt to send final feedback later if possible
|
849 |
|
|
|
873 |
logger.warning(f"[Task {task_id}] Crawl4AI failed for {url}. Attempting BeautifulSoup (Fallback 1)...")
|
874 |
try: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
875 |
except Exception: pass
|
876 |
+
content = await get_website_content_bs4(url)
|
877 |
|
878 |
if not content:
|
879 |
logger.warning(f"[Task {task_id}] BeautifulSoup also failed for {url}. Attempting API (Fallback 2)...")
|
|
|
883 |
except Exception: pass
|
884 |
content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
|
885 |
if not content:
|
886 |
+
# Check if the specific error was insufficient credits
|
887 |
+
# Note: get_website_content_via_api already logs the specific error
|
888 |
+
logger.error(f"[Task {task_id}] API fallback (urltotext) also failed for {url}.")
|
889 |
+
user_feedback_message = "Sorry, I couldn't fetch content from that website using any method (Crawl4AI/BS4 failed, API failed or ran out of credits)." # Updated message
|
890 |
else:
|
891 |
logger.warning(f"[Task {task_id}] API fallback is disabled. Cannot attempt Fallback 2.")
|
892 |
+
user_feedback_message = "Sorry, I couldn't fetch content from that website using Crawl4AI or BeautifulSoup, and the API fallback is not enabled." # Updated message
|
893 |
+
|
894 |
+
# Final check if all web methods failed
|
895 |
+
if not content and not user_feedback_message:
|
896 |
+
logger.error(f"[Task {task_id}] All website fetching methods seem to have failed without setting a specific user message.")
|
897 |
+
user_feedback_message = "Sorry, I couldn't fetch content from that website using any available method (blocked/inaccessible/empty?)."
|
898 |
|
899 |
|
900 |
# --- Generate Summary if Content was Fetched ---
|
|
|
910 |
logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
|
911 |
else:
|
912 |
# Success - Send the summary
|
|
|
913 |
summary_parts = []
|
914 |
current_part = ""
|
915 |
+
# Split respecting newlines, ensure no part exceeds MAX_SUMMARY_CHUNK_SIZE
|
916 |
+
lines = final_summary.splitlines(keepends=True)
|
917 |
+
for line in lines:
|
918 |
+
# If adding the next line exceeds the limit, finalize the current part
|
919 |
+
if len(current_part) + len(line) > MAX_SUMMARY_CHUNK_SIZE:
|
920 |
+
if current_part.strip(): # Don't add empty parts
|
921 |
+
summary_parts.append(current_part.strip())
|
922 |
+
current_part = line # Start new part with the current line
|
923 |
+
# If a single line itself is too long, truncate it (edge case)
|
924 |
+
if len(current_part) > MAX_SUMMARY_CHUNK_SIZE:
|
925 |
+
logger.warning(f"[Task {task_id}] Truncating overly long line in summary.")
|
926 |
+
current_part = current_part[:MAX_SUMMARY_CHUNK_SIZE]
|
927 |
else:
|
928 |
+
current_part += line
|
929 |
+
|
930 |
+
# Add the last part if it has content
|
931 |
+
if current_part.strip():
|
932 |
summary_parts.append(current_part.strip())
|
933 |
|
934 |
+
# If somehow splitting resulted in nothing (e.g., empty summary initially?)
|
935 |
+
if not summary_parts:
|
936 |
+
summary_parts.append("Summary generated, but it appears to be empty.")
|
937 |
+
logger.warning(f"[Task {task_id}] Summary was non-empty initially but splitting resulted in zero parts.")
|
938 |
+
|
939 |
|
940 |
+
logger.info(f"[Task {task_id}] Summary generated (orig len: {len(final_summary)}). Sending in {len(summary_parts)} part(s).")
|
941 |
+
|
942 |
+
# Determine the target message ID for the *first* part
|
943 |
+
# Prefer editing the "Processing..." message if we sent a new one
|
944 |
+
edit_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id
|
945 |
message_sent = False
|
946 |
+
|
947 |
+
if edit_target_id:
|
948 |
try:
|
949 |
# Try editing the status message first
|
950 |
await retry_bot_operation(
|
951 |
bot.edit_message_text,
|
952 |
chat_id=chat_id,
|
953 |
+
message_id=edit_target_id,
|
954 |
text=summary_parts[0],
|
955 |
parse_mode=None, # Send as plain text initially, safer
|
956 |
link_preview_options={'is_disabled': True}
|
957 |
)
|
958 |
+
logger.debug(f"[Task {task_id}] Edited message {edit_target_id} with first summary part.")
|
959 |
+
# Prevent this message from being deleted later if it was the 'Processing...' one
|
960 |
+
if message_to_delete_later_id == edit_target_id: message_to_delete_later_id = None
|
961 |
+
# If it was the *original* button message that we are editing, keep status_message_id
|
962 |
+
# so we know *not* to delete it in finally block if it's the only message left.
|
963 |
+
# However, it's clearer to just prevent deletion if edited.
|
964 |
+
if status_message_id == edit_target_id: status_message_id = None # Mark as handled
|
965 |
+
|
966 |
message_sent = True
|
967 |
except Exception as edit_err:
|
968 |
+
logger.warning(f"[Task {task_id}] Failed to edit message {edit_target_id} with summary: {edit_err}. Sending new message instead.")
|
969 |
# If edit fails, fall through to send a new message
|
970 |
|
971 |
if not message_sent:
|
972 |
+
# Send the first part as a new message
|
973 |
+
sent_msg = await retry_bot_operation(
|
974 |
bot.send_message,
|
975 |
chat_id=chat_id,
|
976 |
text=summary_parts[0],
|
977 |
parse_mode=None,
|
978 |
link_preview_options={'is_disabled': True}
|
979 |
+
)
|
980 |
+
if sent_msg:
|
981 |
+
logger.debug(f"[Task {task_id}] Sent first summary part as new message {sent_msg.message_id}.")
|
982 |
+
else: # retry_bot_operation should raise on failure, but log defensively if it returns nothing
|
983 |
+
logger.error(f"[Task {task_id}] Failed to send first summary part even as new message.")
|
984 |
+
user_feedback_message = "Sorry, failed to send the summary." # Set error
|
985 |
+
|
986 |
+
|
987 |
+
# Send remaining parts (if any and first part succeeded)
|
988 |
+
if not user_feedback_message and len(summary_parts) > 1:
|
989 |
+
for i, part in enumerate(summary_parts[1:], start=2):
|
990 |
+
await asyncio.sleep(0.5) # Small delay between parts to avoid hitting Telegram rate limits
|
991 |
+
try:
|
992 |
+
await retry_bot_operation(
|
993 |
+
bot.send_message,
|
994 |
+
chat_id=chat_id,
|
995 |
+
text=part,
|
996 |
+
parse_mode=None,
|
997 |
+
link_preview_options={'is_disabled': True}
|
998 |
+
)
|
999 |
+
logger.debug(f"[Task {task_id}] Sent summary part {i}/{len(summary_parts)}.")
|
1000 |
+
except Exception as part_err:
|
1001 |
+
logger.error(f"[Task {task_id}] Failed to send summary part {i}: {part_err}")
|
1002 |
+
user_feedback_message = f"Sorry, failed to send part {i} of the summary."
|
1003 |
+
# Stop sending the remaining parts; continuing after a failed chunk would leave a gap in the summary.
|
1004 |
+
break # Stop sending remaining parts
|
1005 |
+
|
1006 |
+
# Determine overall success based on whether feedback message is set
|
1007 |
+
if not user_feedback_message:
|
1008 |
+
success = True
|
1009 |
+
# user_feedback_message = None # Clear feedback message ONLY on full success
|
1010 |
|
1011 |
# --- Handle Cases Where No Content Was Fetched or Summary Failed ---
|
1012 |
+
if user_feedback_message: # Check if any error occurred
|
1013 |
+
logger.warning(f"[Task {task_id}] Sending failure/error feedback to user: {user_feedback_message}")
|
1014 |
try:
|
1015 |
# Try editing the status message first
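# Mirror the success path: edit the existing status message where possible so the user sees a single final message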
|
1016 |
feedback_target_id = message_to_delete_later_id if message_to_delete_later_id else status_message_id
|
|
|
1026 |
reply_markup=None # Remove buttons
|
1027 |
)
|
1028 |
logger.debug(f"[Task {task_id}] Edited message {feedback_target_id} with failure feedback.")
|
1029 |
+
# Prevent deletion if edited
|
1030 |
if message_to_delete_later_id == feedback_target_id: message_to_delete_later_id = None
|
1031 |
+
if status_message_id == feedback_target_id: status_message_id = None
|
1032 |
message_sent = True
|
1033 |
except Exception as edit_err:
|
1034 |
logger.warning(f"[Task {task_id}] Failed to edit message {feedback_target_id} with failure feedback: {edit_err}. Sending new message instead.")
|
|
|
1048 |
# Catch-all for unexpected errors during the main processing logic
|
1049 |
logger.error(f"[Task {task_id}] Unexpected error during processing: {e}", exc_info=True)
|
1050 |
user_feedback_message = "Oops! Something went wrong while processing your request. Please try again later."
|
1051 |
+
if bot: # Ensure bot exists before trying to send
|
1052 |
+
try:
|
1053 |
+
# Attempt to send a final error message
|
1054 |
+
await retry_bot_operation(
|
1055 |
+
bot.send_message,
|
1056 |
+
chat_id=chat_id,
|
1057 |
+
text=user_feedback_message
|
1058 |
+
)
|
1059 |
+
except Exception as final_err:
|
1060 |
+
logger.error(f"[Task {task_id}] Failed to send the final unexpected error feedback: {final_err}")
|
1061 |
|
1062 |
finally:
|
1063 |
# --- Cleanup ---
|
1064 |
+
# Delete the temporary "Processing..." message if it exists and wasn't edited/handled
|
1065 |
+
if message_to_delete_later_id and bot:
|
|
|
1066 |
try:
|
1067 |
+
await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=message_to_delete_later_id)
|
1068 |
+
logger.debug(f"[Task {task_id}] Deleted temporary status message {message_to_delete_later_id}")
|
1069 |
except Exception as del_e:
|
1070 |
+
logger.warning(f"[Task {task_id}] Failed to delete temporary status message {message_to_delete_later_id}: {del_e}")
|
1071 |
+
|
1072 |
+
# Deliberately do NOT delete the original button message (status_message_id) here.
|
1073 |
+
# If it was edited with the final summary or an error, the code above already set it to None.
|
1074 |
+
# If it still holds an ID, editing failed and a new message was sent instead;
|
1075 |
+
# in that case the original is left alone rather than removing the user's context.
|
1076 |
+
# Deleting message_to_delete_later_id above covers the main cleanup case.
|
1079 |
|
1080 |
# Close the background bot's HTTP client
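# ('_client' is an internal attribute of the request backend, hence the defensive hasattr check before closing)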
|
1081 |
if background_request and hasattr(background_request, '_client') and background_request._client:
|
|
|
1090 |
|
1091 |
# --- Telegram Handlers ---
|
1092 |
# (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler)
|
1093 |
+
# These remain largely the same.
|
|
|
1094 |
|
1095 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
1096 |
# ... (Keep existing implementation) ...
|
|
|
1104 |
user = update.effective_user
|
1105 |
if not user or not update.message: return
|
1106 |
logger.info(f"User {user.id} ({user.username or 'N/A'}) used /help.")
|
1107 |
+
# Updated help text slightly
|
1108 |
help_text = ( "📖 **How to use:**\n\n"
|
1109 |
"1. Send me any YouTube video link or website URL.\n"
|
1110 |
"2. I'll ask how you want it summarised (paragraph or points).\n"
|
1111 |
"3. Click the button for your choice.\n"
|
1112 |
"4. Wait for the summary!\n\n"
|
1113 |
"βοΈ **Behind the scenes:**\n"
|
1114 |
+
f"β’ **Websites:** I try `Crawl4AI` (smart crawl), then `BeautifulSoup` (basic scrape), and `urltotext.com` API (if configured & credits available).\n"
|
1115 |
+
"β’ **YouTube:** I use `youtube-transcript-api` first, then fall back to `Supadata` and `Apify` APIs if needed.\n"
|
1116 |
+
f"β’ **Summaries:** Generated using Google `{GEMINI_MODEL}` (primary) or `{OPENROUTER_MODEL}` (fallback, if configured).\n\n"
|
1117 |
"**Commands:**\n"
|
1118 |
"`/start` - Display welcome message\n"
|
1119 |
"`/help` - Show this help message" )
|
|
|
1126 |
if not user: return
|
1127 |
# Basic URL validation
|
1128 |
if not (url.startswith('http://') or url.startswith('https://')) or '.' not in url[8:]:
|
|
|
1129 |
logger.debug(f"Ignoring non-URL from {user.id}: {url}")
|
1130 |
# Reply to the user that this doesn't look like a valid URL
|
1131 |
+
await update.message.reply_text("Hmm, that doesn't look like a valid web URL. Please make sure it starts with `http://` or `https://`.", parse_mode=ParseMode.MARKDOWN)
|
1132 |
return
|
1133 |
logger.info(f"User {user.id} ({user.username or 'N/A'}) sent potential URL: {url}")
|
1134 |
# Store URL and original message ID in user_data
|
|
|
1145 |
if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
|
1146 |
user = query.from_user; summary_type = query.data; query_id = query.id
|
1147 |
try: await query.answer(); logger.debug(f"Ack callback {query_id} from {user.id} ({user.username or 'N/A'})")
|
1148 |
+
except Exception as e: logger.error(f"Error answering callback {query_id}: {e}", exc_info=True) # Log but continue
|
1149 |
|
1150 |
url = context.user_data.get('url_to_summarize')
|
1151 |
message_id_to_edit = query.message.message_id # The message with the buttons
|
|
|
1154 |
if not url:
|
1155 |
logger.warning(f"No URL in context for user {user.id} (cb {query_id}). Expired?")
|
1156 |
try:
|
|
|
1157 |
await query.edit_message_text(text="Sorry, I seem to have lost the context for that link. π€ Please send the URL again.", reply_markup=None)
|
1158 |
except BadRequest as e:
|
1159 |
if "message is not modified" in str(e).lower(): pass # Ignore if text is the same
|
1160 |
else: logger.error(f"Failed edit 'URL not found' msg: {e}")
|
1161 |
+
except Exception as e: logger.error(f"Failed edit 'URL not found' msg: {e}")
|
1162 |
+
return # Do not proceed further
|
|
|
|
|
1163 |
|
1164 |
+
# Clear context *only after* successfully scheduling the task below
|
1165 |
+
# context.user_data.pop('url_to_summarize', None) # Moved clearing
|
1166 |
+
# context.user_data.pop('original_message_id', None) # Moved clearing
|
|
|
1167 |
|
1168 |
# Check necessary configurations before scheduling
|
1169 |
global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
|
1170 |
if not TELEGRAM_TOKEN:
|
1171 |
logger.critical("TG TOKEN missing! Cannot schedule task.")
|
1172 |
+
try: await query.edit_message_text(text="❌ Bot configuration error (Token Missing). Cannot proceed.", reply_markup=None)
|
1173 |
except Exception: pass
|
1174 |
return
|
1175 |
if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
|
1176 |
logger.critical("Neither Gemini nor OpenRouter API keys are configured/valid! Cannot summarize.")
|
1177 |
+
try: await query.edit_message_text(text="❌ AI configuration error: No summarization models are available. Cannot proceed.", reply_markup=None)
|
1178 |
except Exception: pass
|
1179 |
return
|
1180 |
# Log warnings if one model is missing, but proceed if at least one is available
|
|
|
1190 |
message_id_to_edit=message_id_to_edit, # Pass the ID of the message with buttons
|
1191 |
url=url,
|
1192 |
summary_type=summary_type,
|
1193 |
+
bot_token=TELEGRAM_TOKEN # Pass token explicitly
|
1194 |
),
|
1195 |
name=f"SummaryTask-{user.id}-{message_id_to_edit}"
|
1196 |
)
|
1197 |
|
1198 |
+
# Clear context AFTER scheduling the task to prevent race conditions if user clicks fast
|
1199 |
context.user_data.pop('url_to_summarize', None)
|
1200 |
context.user_data.pop('original_message_id', None)
|
1201 |
logger.debug(f"Cleared URL context for user {user.id} after scheduling task.")
|
1202 |
|
1203 |
+
# Optionally edit the button message *immediately* to give feedback before the task edits it again
|
1204 |
+
# This prevents the user clicking again while the task starts up.
|
1205 |
+
# try:
|
1206 |
+
# await query.edit_message_text(text=f"Okay, starting '{summary_type}' summary...", reply_markup=None)
|
1207 |
+
# except Exception as e:
|
1208 |
+
# logger.warning(f"Could not edit button message immediately after scheduling: {e}")
|
1209 |
+
# Left disabled: the task's "Processing..." message would overwrite this edit almost immediately.
|
1210 |
+
|
1211 |
+
|
1212 |
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
|
1213 |
# ... (Keep existing implementation) ...
|
1214 |
+
# Consider adding specific TelegramError types if needed
|
1215 |
+
ignore_errors = (AttributeError, BadRequest, TimedOut, NetworkError, RetryAfter) # Includes common transient Telegram/network errors
|
1216 |
if isinstance(context.error, ignore_errors):
|
1217 |
+
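# Substrings of benign Telegram errors (duplicate edits, expired callback queries, blocked bot, etc.)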
ignore_messages = ["message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user"]
|
1218 |
+
err_str = str(context.error).lower()
|
1219 |
+
if any(msg in err_str for msg in ignore_messages) or isinstance(context.error, (TimedOut, NetworkError, RetryAfter)):
|
1220 |
+
logger.warning(f"Ignoring known/handled/transient error in error_handler: {context.error}")
|
1221 |
return
|
1222 |
logger.error("Exception while handling an update:", exc_info=context.error)
|
1223 |
# Consider notifying the user about unexpected errors if appropriate and possible
|
|
|
1237 |
# Add Handlers
|
1238 |
application.add_handler(CommandHandler("start", start))
|
1239 |
application.add_handler(CommandHandler("help", help_command))
|
1240 |
+
# Use a slightly broader filter to catch URLs even without explicit entity type from Telegram
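# The Regex branch also catches plain-text URLs that Telegram did not tag as url/text_link entities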
|
1241 |
+
url_filter = filters.TEXT & ~filters.COMMAND & (filters.Entity("url") | filters.Entity("text_link") | filters.Regex(r'https?://[^\s]+'))
|
1242 |
+
application.add_handler(MessageHandler(url_filter, handle_potential_url))
|
1243 |
application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
|
1244 |
# Error Handler
|
1245 |
application.add_error_handler(error_handler)
|
|
|
1308 |
logger.error(f"FATAL: Failed to set webhook: {e}", exc_info=True)
|
1309 |
raise RuntimeError(f"Failed to set webhook: {e}") from e
|
1310 |
else:
|
1311 |
+
# The public URL could in principle be recovered from request headers in some environments,
|
1312 |
+
# but that is less reliable than SPACE_HOST, so a missing SPACE_HOST is treated as fatal.
|
1313 |
+
logger.warning("SPACE_HOST environment variable not found. Webhook URL cannot be determined reliably for setup.")
|
1314 |
+
# Raise here; falling back to polling mode would be the alternative if no webhook URL is available.
|
1315 |
+
raise RuntimeError("Webhook URL undetermined (SPACE_HOST missing).")
|
1316 |
+
|
1317 |
|
1318 |
logger.info("ASGI Lifespan: Startup complete.");
|
1319 |
yield # Application runs here
|
|
|
1348 |
|
1349 |
|
1350 |
async def health_check(request: Request) -> PlainTextResponse:
|
1351 |
+
# ... (Keep existing implementation, updated with model names) ...
|
1352 |
+
global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled, _crawl4ai_primary_web_enabled, _urltotext_fallback_enabled, SUPADATA_API_KEY
|
1353 |
bot_status = "Not Initialized"
|
1354 |
bot_username = "N/A"
|
1355 |
+
if ptb_app and ptb_app.bot and ptb_app.initialized: # Check if initialized
|
1356 |
try:
|
1357 |
+
# Check the webhook info first; it can be a more reliable liveness signal than get_me() alone
|
1358 |
+
wh_info = await ptb_app.bot.get_webhook_info()
|
1359 |
+
if ptb_app.running and wh_info and wh_info.url:
|
1360 |
bot_info = await ptb_app.bot.get_me()
|
1361 |
bot_username = f"@{bot_info.username}"
|
1362 |
+
bot_status = f"Running (Webhook OK, {bot_username})"
|
1363 |
+
elif ptb_app.running:
|
1364 |
+
bot_status = "Running (Webhook check failed or not set)"
|
1365 |
else: bot_status = "Initialized/Not running"
|
1366 |
except Exception as e: bot_status = f"Error checking status: {e}"
|
1367 |
+
elif ptb_app:
|
1368 |
+
bot_status = "Initializing..."
|
1369 |
+
|
1370 |
|
1371 |
health_info = [
|
1372 |
+
f"=== Telegram Summary Bot Status ===",
|
1373 |
+
f"Bot Application: {bot_status}",
|
1374 |
+
"--- Services ---",
|
1375 |
+
f"Primary Web Scraper: {'Crawl4AI' if _crawl4ai_primary_web_enabled else 'DISABLED (Lib Missing)'}",
|
1376 |
f"Fallback Web Scraper 1: BeautifulSoup",
|
1377 |
+
f"Fallback Web Scraper 2: {'urltotext.com API' if _urltotext_fallback_enabled else 'DISABLED (No Key)'}",
|
1378 |
+
f"Primary Summarizer: {'Gemini (' + GEMINI_MODEL + ')' if _gemini_primary_enabled else 'DISABLED (No Key/Lib)'}",
|
1379 |
+
f"Fallback Summarizer: {'OpenRouter (' + OPENROUTER_MODEL + ')' if _openrouter_fallback_enabled else 'DISABLED (No Key)'}",
|
1380 |
f"Primary YT Transcript: youtube-transcript-api",
|
1381 |
+
f"Fallback YT Transcript 1: {'Supadata API' if SUPADATA_API_KEY else 'DISABLED (No Key)'}",
|
1382 |
+
f"Fallback YT Transcript 2: {'Apify (' + APIFY_ACTOR_ID + ')' if _apify_token_exists else 'DISABLED (No Key)'}"
|
1383 |
]
|
1384 |
return PlainTextResponse("\n".join(health_info))
|
1385 |
|
|
|
1440 |
log_level = os.environ.get("LOGGING_LEVEL", "info").lower()
|
1441 |
# Use the PORT env var for local running too, defaulting to 8080
|
1442 |
local_port = int(os.environ.get('PORT', 8080))
|
1443 |
+
|
1444 |
+
# Make sure necessary env vars are loaded for local dev if not set system-wide
|
1445 |
+
# Example using python-dotenv if you add it to requirements-dev.txt
|
1446 |
+
# from dotenv import load_dotenv
|
1447 |
+
# load_dotenv()
|
1448 |
+
# logger.info("Loaded environment variables from .env file for local development.")
|
1449 |
+
|
1450 |
+
# Re-check required tokens after potential .env load
|
1451 |
+
if not get_secret('TELEGRAM_TOKEN'): logger.critical("Local Dev: TELEGRAM_TOKEN not found.")
|
1452 |
+
if not get_secret('GEMINI_API_KEY'): logger.error("Local Dev: GEMINI_API_KEY not found.")
|
1453 |
+
# Add checks for other keys as needed for local testing
|
1454 |
+
|
1455 |
uvicorn.run(
|
1456 |
"main:app",
|
1457 |
host='0.0.0.0',
|