Update main.py
main.py
CHANGED
@@ -1,4 +1,4 @@
-# main.py (
+# main.py (Full Code - Specific April 2025 Models: Llama 4 Scout & DeepSeek V3 Free)
import os
import re
import logging
@@ -99,16 +99,16 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')

-#
+# --- Model Configurations (Specific April 2025) ---
# Model Priority:
# 1. Groq Llama 4 Scout
# 2. Gemini 2.5 Pro Exp
# 3. Gemini 2.0 Flash
# 4. OpenRouter DeepSeek V3 Free
-GROQ_LLAMA4_MODEL = os.environ.get("GROQ_LLAMA4_MODEL", "meta-llama/llama-4-scout-17b-16e-instruct") #
+GROQ_LLAMA4_MODEL = os.environ.get("GROQ_LLAMA4_MODEL", "meta-llama/llama-4-scout-17b-16e-instruct") # Specific Llama 4 model
GEMINI_PRO_EXP_MODEL = os.environ.get("GEMINI_PRO_EXP_MODEL", "gemini-2.5-pro-exp-03-25")
GEMINI_FLASH_MODEL = os.environ.get("GEMINI_FLASH_MODEL", "gemini-2.0-flash-001")
-OPENROUTER_DEEPSEEK_MODEL = os.environ.get("OPENROUTER_DEEPSEEK_MODEL", "deepseek/deepseek-chat-v3-0324:free") #
+OPENROUTER_DEEPSEEK_MODEL = os.environ.get("OPENROUTER_DEEPSEEK_MODEL", "deepseek/deepseek-chat-v3-0324:free") # Specific DeepSeek model

APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts") # YT Default
APIFY_CRAWLER_ACTOR_ID = "apify/website-content-crawler" # Scrape Fallback 4
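Note on the priority list above: generate_summary (further down in this diff) walks these four models in order and falls through on failure. A minimal sketch of that try-in-order pattern, with stub (name, caller) pairs standing in for the real _call_groq/_call_gemini/_call_openrouter helpers, might look like this:

# Sketch only; the (name, caller) pairs are assumed to be the real helper coroutines.
from typing import Awaitable, Callable, Dict, List, Optional, Tuple

async def summarise_with_fallbacks(
    text: str,
    summary_type: str,
    callers: List[Tuple[str, Callable[[str, str], Awaitable[Tuple[Optional[str], Optional[str]]]]]],
) -> str:
    errors: Dict[str, Optional[str]] = {}
    for name, caller in callers:
        summary, error = await caller(text, summary_type)
        if summary:
            return summary
        errors[name] = error  # record why this tier failed and fall through to the next model
    return "Sorry, all summarisation models failed: " + "; ".join(f"{k}: {v}" for k, v in errors.items())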
@@ -184,14 +184,9 @@ def extract_youtube_id(url):
if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id
else: logger.warning(f"Could not extract YT ID from {url}"); return None

+
# --- Content Fetching Functions ---
-
-# fetch_url_content_for_scrape, get_website_content, get_website_content_via_api,
-# get_website_content_via_scrapers_proxy, get_website_content_via_ai_web_scraper,
-# _run_apify_actor_for_web_content, get_website_content_via_apify_crawler,
-# get_website_content_via_apify_text_scraper remain UNCHANGED. They are omitted here for brevity
-# but MUST be included in the final main.py file)
-# --- START OMITTED CONTENT FETCHING FUNCTIONS ---
+
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
if not video_id: logger.error("[Supadata] No video_id provided"); return None
if not api_key: logger.error("[Supadata] API key missing."); return None
@@ -204,10 +199,10 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
logger.debug(f"[Supadata] Status code {response.status_code} for {video_id}")
if response.status_code == 200:
try:
-data = response.json() if response.text else None
+data = response.json() if response.text else None
content = None
if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
-if not content and response.text: content = response.text
+if not content and response.text: content = response.text
if content and isinstance(content, str): logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}"); return content.strip()
else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
except json.JSONDecodeError: logger.warning(f"[Supadata] Received 200 but failed JSON decode for {video_id}. Using raw text if available. Response: {response.text[:200]}"); return response.text.strip() if response.text else None
@@ -294,6 +289,7 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
return transcript_text

async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
+"""Directly fetches URL content using httpx."""
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
try:
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
@@ -313,6 +309,7 @@ async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
return None

async def get_website_content(url: str) -> Optional[str]:
+"""Primary method: Fetches HTML directly and parses with BeautifulSoup."""
if not url: logger.error("[Web Scrape Primary] No URL provided"); return None
logger.info(f"[Web Scrape Primary] Attempting direct fetch and parse for: {url}")
html_content = await fetch_url_content_for_scrape(url)
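The new docstring on get_website_content names the primary approach: fetch the HTML directly, then parse it with BeautifulSoup. The parsing code itself is outside this hunk, so the following is only a generic sketch of visible-text extraction with bs4, not the function's actual body:

from bs4 import BeautifulSoup

def extract_visible_text(html: str) -> str:
    # Drop non-content tags, then collapse whitespace in the remaining text.
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    return " ".join(soup.get_text(separator=" ").split())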
@@ -336,6 +333,7 @@ async def get_website_content(url: str) -> Optional[str]:
except Exception as e: logger.error(f"[Web Scrape Primary] Unexpected error during parsing process for {url}: {e}", exc_info=True); return None

async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
+"""Fallback 1: Fetches website content using urltotext.com API."""
if not url: logger.error("[Web Scrape Fallback 1] No URL"); return None
if not api_key: logger.error("[Web Scrape Fallback 1] urltotext.com API key missing."); return None
logger.info(f"[Web Scrape Fallback 1] Attempting fetch for: {url} using urltotext.com API")
@@ -363,6 +361,7 @@ async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None

async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
+"""Fallback 2: Fetches website content using Scraper's Proxy Parser via RapidAPI."""
if not url: logger.error("[Web Scrape Fallback 2] No URL provided"); return None
if not api_key: logger.error("[Web Scrape Fallback 2] RapidAPI key missing."); return None
logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy Parser API")
@@ -396,6 +395,7 @@ async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
except Exception as e: logger.error(f"[Web Scrape Fallback 2] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None

async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]:
+"""Fallback 3: Fetches website content using AI Web Scraper via RapidAPI."""
if not url: logger.error("[Web Scrape Fallback 3] No URL provided"); return None
if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using AI Web Scraper API")
@@ -432,6 +432,7 @@ async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]:
except Exception as e: logger.error(f"[Web Scrape Fallback 3] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None

async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
+"""Generic function to run an Apify actor and get text content."""
if not url: logger.error(f"[{actor_name}] No URL provided"); return None
if not api_token: logger.error(f"[{actor_name}] API token missing."); return None
logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
@@ -474,11 +475,12 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
except Exception as e: logger.error(f"[{actor_name}] Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None

async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
+"""Fallback 4: Fetches website content using Apify Website Content Crawler."""
return await _run_apify_actor_for_web_content( url=url, api_token=api_token, actor_id=APIFY_CRAWLER_ACTOR_ID, actor_name="Apify Crawler" )

async def get_website_content_via_apify_text_scraper(url: str, api_token: str) -> Optional[str]:
+"""Fallback 5: Fetches website content using Apify Text Scraper Free."""
return await _run_apify_actor_for_web_content( url=url, api_token=api_token, actor_id=APIFY_TEXT_SCRAPER_ACTOR_ID, actor_name="Apify Text Scraper" )
-# --- END OMITTED CONTENT FETCHING FUNCTIONS ---


# --- Summarization Functions (Using Specific April 2025 Models) ---
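Taken together, the docstrings added in this region describe a six-step scrape ladder: direct fetch, urltotext.com, Scraper's Proxy, AI Web Scraper, then two Apify actors. A hedged sketch of walking that ladder until one source returns text follows; URLTOTEXT_API_KEY is a hypothetical name for however the urltotext.com key is stored, while RAPIDAPI_KEY and APIFY_API_TOKEN appear earlier in the file:

# Sketch only; assumes the fetcher coroutines above and configured keys/tokens.
from typing import Optional

async def fetch_website_text(url: str) -> Optional[str]:
    attempts = [
        ("Primary direct fetch", lambda: get_website_content(url)),
        ("Fallback 1: urltotext.com", lambda: get_website_content_via_api(url, URLTOTEXT_API_KEY)),  # hypothetical key name
        ("Fallback 2: Scraper's Proxy", lambda: get_website_content_via_scrapers_proxy(url, RAPIDAPI_KEY)),
        ("Fallback 3: AI Web Scraper", lambda: get_website_content_via_ai_web_scraper(url, RAPIDAPI_KEY)),
        ("Fallback 4: Apify Crawler", lambda: get_website_content_via_apify_crawler(url, APIFY_API_TOKEN)),
        ("Fallback 5: Apify Text Scraper", lambda: get_website_content_via_apify_text_scraper(url, APIFY_API_TOKEN)),
    ]
    for name, attempt in attempts:
        content = await attempt()
        if content:
            logger.info(f"Content obtained via {name} for {url}")
            return content
    return None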
@@ -516,7 +518,6 @@ PROMPT_POINTS = (
"Here is the text to summarise:"
)

-# <<< Uses the specific GROQ_LLAMA4_MODEL constant >>>
async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
"""Internal function to call Groq API (Primary - Llama 4 Scout). Returns (summary, error_message)."""
global GROQ_API_KEY, GROQ_LLAMA4_MODEL, _groq_enabled
@@ -526,9 +527,7 @@ async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
logger.info(f"[Groq Primary] Generating {summary_type} summary using {GROQ_LLAMA4_MODEL}. Input length: {len(text)}")

prompt = PROMPT_PARAGRAPH if summary_type == "paragraph" else PROMPT_POINTS
-
-# Input Length Check for Llama 4 Scout (16k context? Be conservative)
-MAX_INPUT_LENGTH_GROQ = 40000 # ~13k tokens
+MAX_INPUT_LENGTH_GROQ = 40000 # ~13k tokens for 16k context
if len(text) > MAX_INPUT_LENGTH_GROQ:
logger.warning(f"[Groq Primary] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_GROQ}). Truncating.");
text = text[:MAX_INPUT_LENGTH_GROQ] + "... (Content truncated)"
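The 40,000-character cap is a character-count proxy for a token budget (the comment estimates roughly 13k tokens, i.e. about 3 characters per token). As a standalone guard it reduces to something like:

def truncate_for_model(text: str, max_chars: int = 40000) -> str:
    # Crude pre-flight check: cap by characters as a rough proxy for tokens (~3 chars/token).
    if len(text) <= max_chars:
        return text
    return text[:max_chars] + "... (Content truncated)"

A tokenizer-based count would be more precise, but a character cap avoids pulling in another dependency.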
@@ -537,18 +536,14 @@ async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
try:
groq_client = Groq( api_key=GROQ_API_KEY, timeout=httpx.Timeout(120.0, connect=10.0) )
logger.info(f"[Groq Primary] Sending request to Groq ({GROQ_LLAMA4_MODEL})...")
-
chat_completion = await groq_client.chat.completions.create(
messages=[ { "role": "user", "content": full_prompt } ],
-model=GROQ_LLAMA4_MODEL,
-temperature=0.7, #
-max_tokens=2048, #
-top_p=1,
-stream=False,
-stop=None,
+model=GROQ_LLAMA4_MODEL,
+temperature=0.7, # Adjust from Groq default of 1 if needed
+max_tokens=2048, # Adjust from Groq default of 1024 if needed
+top_p=1, stream=False, stop=None,
)
logger.info("[Groq Primary] Received response from Groq.")
-
if chat_completion.choices and chat_completion.choices[0].message and chat_completion.choices[0].message.content:
summary = chat_completion.choices[0].message.content
finish_reason = chat_completion.choices[0].finish_reason
@@ -558,7 +553,6 @@ async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
logger.warning(f"[Groq Primary] Groq response structure unexpected or content empty. Response: {chat_completion.model_dump_json(indent=2)}")
finish_reason = chat_completion.choices[0].finish_reason if chat_completion.choices else 'N/A'
return None, f"Sorry, the primary AI model ({GROQ_LLAMA4_MODEL}) provided an empty or invalid response (Finish Reason: {finish_reason})."
-
except GroqError as ge:
logger.error(f"[Groq Primary] Groq API error: {ge.status_code} - {ge.message}", exc_info=False)
error_msg = f"Sorry, the primary AI service ({GROQ_LLAMA4_MODEL}) failed. API Error: {ge.status_code}."
@@ -575,7 +569,6 @@ async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
logger.error(f"[Groq Primary] Unexpected error during Groq API call: {e}", exc_info=True);
return None, f"Sorry, an unexpected error occurred while using the primary AI service ({GROQ_LLAMA4_MODEL})."

-
async def _call_gemini(text: str, summary_type: str, model_name: str) -> Tuple[Optional[str], Optional[str]]:
"""Internal function to call Gemini API. Returns (summary, error_message)."""
global _gemini_api_enabled
@@ -639,7 +632,6 @@ async def _call_gemini(text: str, summary_type: str, model_name: str) -> Tuple[Optional[str], Optional[str]]:
error_msg = f"Sorry, an unexpected error occurred while using the AI service ({model_name})."
return None, error_msg

-# <<< Uses the specific OPENROUTER_DEEPSEEK_MODEL constant >>>
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
"""Internal function to call OpenRouter API (Final Fallback - DeepSeek V3 Free). Returns (summary, error_message)."""
global OPENROUTER_API_KEY, OPENROUTER_DEEPSEEK_MODEL, _openrouter_fallback_enabled
@@ -655,17 +647,13 @@ async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
full_prompt = f"{prompt}\n\n{text}"

-# Use the direct httpx call as before, ensuring the correct model name is in the payload
headers = {
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
"Content-Type": "application/json",
-"HTTP-Referer": os.environ.get("YOUR_SITE_URL", "https://github.com/your-repo"),
-"X-Title": os.environ.get("YOUR_SITE_NAME", "TelegramSummariserBot")
-}
-payload = {
-"model": OPENROUTER_DEEPSEEK_MODEL, # <<< Use specific DeepSeek model name
-"messages": [{"role": "user", "content": full_prompt}],
+"HTTP-Referer": os.environ.get("YOUR_SITE_URL", "https://github.com/your-repo"),
+"X-Title": os.environ.get("YOUR_SITE_NAME", "TelegramSummariserBot")
}
+payload = { "model": OPENROUTER_DEEPSEEK_MODEL, "messages": [{"role": "user", "content": full_prompt}], }
openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
api_timeouts = httpx.Timeout(connect=10.0, read=60.0, write=10.0, pool=60.0)
response = None
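OpenRouter's chat completions endpoint follows the OpenAI request/response shape, which is why the payload above is just a model name and a messages list. A self-contained sketch of the same POST with httpx (the API key env var and example model name are taken from this file; the response parsing assumes the standard OpenAI-style shape):

import asyncio
import os
import httpx

async def openrouter_chat(prompt: str) -> str:
    # Single-turn chat completion against OpenRouter's OpenAI-compatible endpoint.
    headers = {
        "Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "deepseek/deepseek-chat-v3-0324:free",
        "messages": [{"role": "user", "content": prompt}],
    }
    async with httpx.AsyncClient(timeout=httpx.Timeout(60.0, connect=10.0)) as client:
        resp = await client.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]

# Example: print(asyncio.run(openrouter_chat("Summarise: ...")))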
@@ -675,7 +663,6 @@ async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
logger.info(f"[OpenRouter Fallback] Sending request to OpenRouter ({OPENROUTER_DEEPSEEK_MODEL}) with read timeout {api_timeouts.read}s...")
response = await client.post(openrouter_api_endpoint, headers=headers, json=payload)
logger.info(f"[OpenRouter Fallback] Received response from OpenRouter. Status code: {response.status_code}")
-
if response.status_code == 200:
try:
data = response.json()
@@ -701,12 +688,10 @@ async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
except Exception: pass
logger.error(f"[OpenRouter Fallback] Unexpected status {response.status_code}. Error: '{error_info}' Resp:{response.text[:500]}");
return None, f"Sorry, fallback AI service ({OPENROUTER_DEEPSEEK_MODEL}) returned unexpected status ({response.status_code})."
-
except httpx.TimeoutException as e: logger.error(f"[OpenRouter Fallback] Timeout error ({type(e)}) connecting/reading from OpenRouter API: {e}"); return None, f"Sorry, the fallback AI service ({OPENROUTER_DEEPSEEK_MODEL}) timed out."
except httpx.RequestError as e: logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}"); return None, "Sorry, there was an error connecting to the fallback AI model service."
except Exception as e: logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter call: {e}", exc_info=True); return None, "Sorry, an unexpected error occurred while using the fallback AI service."

-
async def generate_summary(text: str, summary_type: str) -> str:
"""
Generates summary using the specific model hierarchy (April 2025):
@@ -722,7 +707,7 @@ async def generate_summary(text: str, summary_type: str) -> str:
logger.info("[Summary Generation] Starting process with specific April 2025 model hierarchy.")
summary: Optional[str] = None
errors: Dict[str, Optional[str]] = {
-"Llama4Scout": None,
+"Llama4Scout": None,
"GeminiProExp": None,
"GeminiFlash": None,
"DeepSeekV3": None,
@@ -788,9 +773,6 @@ async def generate_summary(text: str, summary_type: str) -> str:


# --- Main Processing Logic ---
-# (process_summary_task remains UNCHANGED in its core logic, it correctly calls the updated generate_summary.
-# Omitted here for brevity, but MUST be included in the final file.)
-# --- START OMITTED process_summary_task ---
async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
"""Handles the entire process: fetching content (with ALL fallbacks) and summarizing."""
task_id = f"{user_id}-{message_id_to_edit or 'new'}"; logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
@@ -903,13 +885,9 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
try: await background_request._client.aclose(); logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.")
except Exception as e: logger.warning(f"[Task {task_id}] Error closing background bot's client: {e}")
logger.info(f"[Task {task_id}] Task completed. Success: {success}")
-# --- END OMITTED process_summary_task ---


# --- Telegram Handlers ---
-# (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler
-# remain UNCHANGED. Omitted here for brevity, but include in final file.)
-# --- START OMITTED TELEGRAM HANDLERS ---
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
user = update.effective_user; mention = user.mention_html()
if not user or not update.message: return
@@ -958,7 +936,6 @@ async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
else:
logger.debug(f"Ignoring non-URL message from {user.id}: {url[:100]}")

-
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
query = update.callback_query
if not query or not query.message or not query.from_user: logger.warning("Callback query missing data."); return
@@ -1016,14 +993,9 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
"""Log Errors caused by Updates."""
logger.error("Exception while handling an update:", exc_info=context.error)
-# --- END OMITTED TELEGRAM HANDLERS ---


# --- Application Setup & Web Framework ---
-# (setup_bot_config, lifespan, telegram_webhook, app definition
-# remain UNCHANGED. health_check is modified below.
-# Omitted here for brevity, include in final file.)
-# --- START OMITTED APP SETUP/WEB FRAMEWORK (excluding health_check) ---
async def setup_bot_config() -> Application:
logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
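setup_bot_config builds the python-telegram-bot Application, but its body is not shown in this hunk. For orientation only, in PTB v20+ the handlers named in this file are typically wired up roughly as follows; the command names and filter choices here are assumptions, not the file's actual code:

# Sketch only; assumes python-telegram-bot v20+ and the handler coroutines defined above.
from telegram.ext import Application, CallbackQueryHandler, CommandHandler, MessageHandler, filters

def build_application(token: str) -> Application:
    application = Application.builder().token(token).build()
    application.add_handler(CommandHandler("start", start))
    application.add_handler(CommandHandler("help", help_command))
    # Plain text messages are screened for a URL by handle_potential_url
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_potential_url))
    application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
    application.add_error_handler(error_handler)
    return application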
@@ -1093,32 +1065,6 @@ async def lifespan(app: Starlette):
else: logger.info("PTB application was not fully initialized or failed during startup. No shutdown actions needed.")
logger.info("ASGI Lifespan: Shutdown complete.")

-async def telegram_webhook(request: Request) -> Response:
-"""Handles incoming updates from Telegram."""
-global WEBHOOK_SECRET
-if not ptb_app: logger.error("Webhook received but PTB application not initialized."); return PlainTextResponse('Bot not initialized', status_code=503)
-if not ptb_app.running: logger.warning("Webhook received but PTB application not running."); return PlainTextResponse('Bot not running, cannot process update', status_code=503)
-if WEBHOOK_SECRET:
-token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
-if token_header != WEBHOOK_SECRET: logger.warning(f"Webhook received with invalid secret token. Header: '{token_header}'"); return Response(content="Invalid secret token", status_code=403)
-try:
-update_data = await request.json(); update = Update.de_json(data=update_data, bot=ptb_app.bot)
-logger.debug(f"Processing update_id: {update.update_id} via webhook"); await ptb_app.process_update(update)
-return Response(status_code=200)
-except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
-except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200)
-
-# --- Starlette App Definition ---
-# Note: health_check is defined below
-app = Starlette(
-debug=False,
-lifespan=lifespan,
-routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ]
-)
-logger.info("Starlette ASGI application created with health check and webhook routes.")
-# --- END OMITTED APP SETUP/WEB FRAMEWORK ---
-
-# <<< CHANGE: Updated health check response for specific models >>>
async def health_check(request: Request) -> PlainTextResponse:
"""Simple health check endpoint."""
global GROQ_LLAMA4_MODEL, GEMINI_PRO_EXP_MODEL, GEMINI_FLASH_MODEL, OPENROUTER_DEEPSEEK_MODEL, APIFY_ACTOR_ID
@@ -1158,6 +1104,32 @@ async def health_check(request: Request) -> PlainTextResponse:
f"Web Scrape 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled'}"
)

+async def telegram_webhook(request: Request) -> Response:
+"""Handles incoming updates from Telegram."""
+global WEBHOOK_SECRET
+if not ptb_app: logger.error("Webhook received but PTB application not initialized."); return PlainTextResponse('Bot not initialized', status_code=503)
+if not ptb_app.running: logger.warning("Webhook received but PTB application not running."); return PlainTextResponse('Bot not running, cannot process update', status_code=503)
+if WEBHOOK_SECRET:
+token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
+if token_header != WEBHOOK_SECRET: logger.warning(f"Webhook received with invalid secret token. Header: '{token_header}'"); return Response(content="Invalid secret token", status_code=403)
+try:
+update_data = await request.json(); update = Update.de_json(data=update_data, bot=ptb_app.bot)
+logger.debug(f"Processing update_id: {update.update_id} via webhook"); await ptb_app.process_update(update)
+return Response(status_code=200)
+except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
+except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200)
+
+# --- Starlette App Definition ---
+app = Starlette(
+debug=False, # Keep False for production/Hugging Face
+lifespan=lifespan,
+routes=[
+Route("/", endpoint=health_check, methods=["GET"]),
+Route("/webhook", endpoint=telegram_webhook, methods=["POST"]),
+]
+)
+logger.info("Starlette ASGI application created with health check and webhook routes.")
+
# --- Development Server (if run directly) ---
if __name__ == '__main__':
import uvicorn
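The relocated telegram_webhook handler validates the X-Telegram-Bot-Api-Secret-Token header; Telegram only sends that header when the same secret was supplied to setWebhook. A hedged sketch of registering the webhook follows; the Space URL is a placeholder, and the token/secret names match this file even though the real code loads them via get_secret:

import os
import httpx

def register_webhook() -> None:
    # Point Telegram at this app's /webhook route and share the secret it should echo back.
    token = os.environ["TELEGRAM_TOKEN"]
    secret = os.environ["WEBHOOK_SECRET"]
    webhook_url = "https://your-space.hf.space/webhook"  # placeholder URL
    resp = httpx.post(
        f"https://api.telegram.org/bot{token}/setWebhook",
        json={"url": webhook_url, "secret_token": secret},
    )
    resp.raise_for_status()
    print(resp.json())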