Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# main.py (Updated for
|
2 |
import os
|
3 |
import re
|
4 |
import logging
|
@@ -7,7 +7,7 @@ import json
|
|
7 |
import html
|
8 |
import contextlib
|
9 |
import traceback
|
10 |
-
import urllib.parse
|
11 |
from typing import Optional, Dict, Any, Tuple
|
12 |
|
13 |
# --- Frameworks ---
|
@@ -45,15 +45,14 @@ except ImportError:
|
|
45 |
try:
|
46 |
import google.generativeai as genai
|
47 |
from google.generativeai.types import HarmCategory, HarmBlockThreshold
|
48 |
-
_gemini_sdk_available = True
|
49 |
except ImportError:
|
50 |
genai = None
|
51 |
HarmCategory = None
|
52 |
HarmBlockThreshold = None
|
53 |
_gemini_sdk_available = False
|
54 |
-
# logger will be defined later, log warning after logger setup
|
55 |
|
56 |
-
# --- Groq SDK ---
|
57 |
try:
|
58 |
from groq import Groq, GroqError
|
59 |
_groq_sdk_available = True
|
@@ -72,11 +71,11 @@ logging.getLogger('gunicorn.error').setLevel(logging.INFO)
|
|
72 |
logging.getLogger('uvicorn').setLevel(logging.INFO)
|
73 |
logging.getLogger('starlette').setLevel(logging.INFO)
|
74 |
if _gemini_sdk_available: logging.getLogger("google.ai.generativelanguage").setLevel(logging.WARNING)
|
75 |
-
if _groq_sdk_available: logging.getLogger("groq").setLevel(logging.INFO)
|
76 |
logger = logging.getLogger(__name__)
|
77 |
logger.info(f"Logging configured. Using BS4 parser: {DEFAULT_PARSER}")
|
78 |
if not _gemini_sdk_available: logger.warning("google-generativeai library not found. Gemini functionality disabled.")
|
79 |
-
if not _groq_sdk_available: logger.warning("groq library not found. Groq functionality disabled.")
|
80 |
|
81 |
|
82 |
# --- Global variable for PTB app ---
|
@@ -91,27 +90,27 @@ def get_secret(secret_name):
|
|
91 |
return value
|
92 |
|
93 |
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
|
94 |
-
GROQ_API_KEY = get_secret('GROQ_API_KEY') #
|
95 |
-
GEMINI_API_KEY = get_secret('GEMINI_API_KEY') #
|
96 |
-
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY') #
|
97 |
-
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')
|
98 |
-
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
|
99 |
-
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
|
100 |
-
RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
|
101 |
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
|
102 |
|
103 |
-
# <<< CHANGE:
|
104 |
# Model Priority:
|
105 |
-
# 1. Groq Llama 4
|
106 |
-
# 2. Gemini 2.5 Pro
|
107 |
# 3. Gemini 2.0 Flash
|
108 |
-
# 4. OpenRouter DeepSeek
|
109 |
-
|
110 |
-
|
111 |
-
GEMINI_FLASH_MODEL = os.environ.get("GEMINI_FLASH_MODEL", "gemini-
|
112 |
-
|
113 |
|
114 |
-
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts") #
|
115 |
APIFY_CRAWLER_ACTOR_ID = "apify/website-content-crawler" # Scrape Fallback 4
|
116 |
APIFY_TEXT_SCRAPER_ACTOR_ID = "karamelo/text-scraper-free" # Scrape Fallback 5
|
117 |
|
@@ -120,23 +119,21 @@ if not TELEGRAM_TOKEN: logger.critical("β FATAL: TELEGRAM_TOKEN not found.");
|
|
120 |
|
121 |
# Summarizer Availability Checks
|
122 |
_groq_enabled = _groq_sdk_available and bool(GROQ_API_KEY)
|
123 |
-
_gemini_api_enabled = _gemini_sdk_available and bool(GEMINI_API_KEY)
|
124 |
_openrouter_fallback_enabled = bool(OPENROUTER_API_KEY)
|
125 |
|
126 |
if not _groq_enabled:
|
127 |
-
if not _groq_sdk_available: logger.error("β ERROR: groq library missing. Groq (Llama) disabled.")
|
128 |
-
elif not GROQ_API_KEY: logger.error("β ERROR: GROQ_API_KEY not found. Primary summarization (Groq Llama) will fail.")
|
129 |
if not _gemini_api_enabled:
|
130 |
if not _gemini_sdk_available: logger.warning("β οΈ WARNING: google-generativeai library missing. Gemini disabled.")
|
131 |
-
elif not GEMINI_API_KEY: logger.warning("β οΈ WARNING: GEMINI_API_KEY not found. Gemini summarization fallbacks (Pro/Flash) will fail.")
|
132 |
if not _openrouter_fallback_enabled: logger.warning("β οΈ WARNING: OPENROUTER_API_KEY not found. Final fallback summarization (DeepSeek) will fail.")
|
133 |
|
134 |
if not _groq_enabled and not _gemini_api_enabled and not _openrouter_fallback_enabled:
|
135 |
logger.critical("β FATAL: No summarization models are configured or enabled. Bot cannot function.")
|
136 |
-
# Depending on deployment, might want to raise RuntimeError here
|
137 |
-
# raise RuntimeError("No summarization models configured.")
|
138 |
elif not _groq_enabled:
|
139 |
-
logger.warning("β οΈ Primary summarizer (Groq Llama) is disabled. Will start with Gemini Pro.")
|
140 |
|
141 |
# Scraper Availability Checks (Warnings only)
|
142 |
if not RAPIDAPI_KEY: logger.warning("β οΈ WARNING: RAPIDAPI_KEY not found. RapidAPI scraping fallbacks (2 & 3) will be unavailable.")
|
@@ -146,25 +143,25 @@ if not SUPADATA_API_KEY: logger.warning("Optional secret 'SUPADATA_API_KEY' not
|
|
146 |
if not WEBHOOK_SECRET: logger.info("Optional secret 'WEBHOOK_SECRET' not found. Webhook security disabled.")
|
147 |
|
148 |
logger.info("Secret loading and configuration check finished.")
|
149 |
-
logger.info(f"Summarizer 1 (Groq): {
|
150 |
-
logger.info(f"Summarizer 2 (Gemini Pro): {
|
151 |
-
logger.info(f"Summarizer 3 (Gemini Flash): {GEMINI_FLASH_MODEL if _gemini_api_enabled else 'DISABLED'}")
|
152 |
-
logger.info(f"Summarizer 4 (OpenRouter): {
|
153 |
logger.info(f"Using Apify Actor (YT Default): {APIFY_ACTOR_ID}")
|
154 |
logger.info(f"Using Apify Actor (Web Scrape Fallback 4): {APIFY_CRAWLER_ACTOR_ID}")
|
155 |
logger.info(f"Using Apify Actor (Web Scrape Fallback 5): {APIFY_TEXT_SCRAPER_ACTOR_ID}")
|
156 |
|
157 |
-
# Flags for scraper key existence
|
158 |
_apify_token_exists = bool(APIFY_API_TOKEN)
|
159 |
_urltotext_key_exists = bool(URLTOTEXT_API_KEY)
|
160 |
_rapidapi_key_exists = bool(RAPIDAPI_KEY)
|
161 |
|
162 |
# --- Configure APIs ---
|
163 |
-
if _gemini_api_enabled:
|
164 |
try: genai.configure(api_key=GEMINI_API_KEY); logger.info("Google GenAI client configured successfully.")
|
165 |
except Exception as e: logger.error(f"Failed to configure Google GenAI client: {e}"); _gemini_api_enabled = False
|
166 |
|
167 |
-
# Groq client is
|
168 |
|
169 |
# --- Retry Decorator ---
|
170 |
@retry( stop=stop_after_attempt(4), wait=wait_exponential(multiplier=1, min=2, max=15), retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)), before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True )
|
@@ -187,14 +184,13 @@ def extract_youtube_id(url):
|
|
187 |
if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id
|
188 |
else: logger.warning(f"Could not extract YT ID from {url}"); return None
|
189 |
|
190 |
-
|
191 |
# --- Content Fetching Functions ---
|
192 |
# (These functions: get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript,
|
193 |
# fetch_url_content_for_scrape, get_website_content, get_website_content_via_api,
|
194 |
# get_website_content_via_scrapers_proxy, get_website_content_via_ai_web_scraper,
|
195 |
# _run_apify_actor_for_web_content, get_website_content_via_apify_crawler,
|
196 |
# get_website_content_via_apify_text_scraper remain UNCHANGED. They are omitted here for brevity
|
197 |
-
# but
|
198 |
# --- START OMITTED CONTENT FETCHING FUNCTIONS ---
|
199 |
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
|
200 |
if not video_id: logger.error("[Supadata] No video_id provided"); return None
|
@@ -228,44 +224,30 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
|
|
228 |
|
229 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
|
230 |
"""Fallback YT 2: Fetches YouTube transcript using default Apify Actor."""
|
231 |
-
global APIFY_ACTOR_ID
|
232 |
if not video_url: logger.error("[Apify YT] No video_url provided"); return None
|
233 |
if not api_token: logger.error("[Apify YT] API token missing."); return None
|
234 |
logger.info(f"[YT Fallback 2] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
|
235 |
-
|
236 |
sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
|
237 |
params = {"token": api_token}
|
238 |
-
payload = {
|
239 |
-
"urls": [video_url],
|
240 |
-
"outputFormat": "singleStringText",
|
241 |
-
"maxRetries": 5,
|
242 |
-
"channelHandleBoolean": False,
|
243 |
-
"channelNameBoolean": False,
|
244 |
-
"datePublishedBoolean": False,
|
245 |
-
"relativeDateTextBoolean": False,
|
246 |
-
}
|
247 |
headers = {"Content-Type": "application/json"}
|
248 |
-
|
249 |
try:
|
250 |
-
async with httpx.AsyncClient(timeout=120.0) as client:
|
251 |
logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}")
|
252 |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
|
253 |
logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}")
|
254 |
-
|
255 |
if response.status_code == 200:
|
256 |
try:
|
257 |
results = response.json()
|
258 |
if isinstance(results, list) and len(results) > 0:
|
259 |
-
item = results[0]
|
260 |
-
content = None
|
261 |
-
# Check common keys for transcript text
|
262 |
if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
|
263 |
elif "text" in item and isinstance(item["text"], str): content = item["text"]
|
264 |
elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"]
|
265 |
-
elif "captions" in item and isinstance(item["captions"], list):
|
266 |
if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
|
267 |
elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])
|
268 |
-
|
269 |
if content and isinstance(content, str): logger.info(f"[Apify YT] Success via REST for {video_url}. Length: {len(content)}"); return content.strip()
|
270 |
else: logger.warning(f"[Apify YT] Dataset item parsed but transcript content empty/invalid format for {video_url}. Item keys: {list(item.keys())}"); return None
|
271 |
else: logger.warning(f"[Apify YT] Actor success but dataset was empty for {video_url}. Response: {results}"); return None
|
@@ -281,11 +263,10 @@ async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[s
|
|
281 |
except Exception as e: logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True); return None
|
282 |
|
283 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
|
284 |
-
global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
|
285 |
if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
|
286 |
logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
|
287 |
transcript_text = None
|
288 |
-
# Method 1: youtube-transcript-api (Primary)
|
289 |
logger.info("[Primary YT] Attempting youtube-transcript-api...")
|
290 |
try:
|
291 |
transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
|
@@ -295,8 +276,6 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
|
|
295 |
except NoTranscriptFound: logger.warning(f"[Primary YT] No transcript found via lib for {video_id}.")
|
296 |
except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
|
297 |
except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None
|
298 |
-
|
299 |
-
# Method 2: Supadata (Fallback 1)
|
300 |
if transcript_text is None:
|
301 |
logger.info("[Fallback YT 1] Trying Supadata API...")
|
302 |
if SUPADATA_API_KEY:
|
@@ -304,33 +283,26 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
|
|
304 |
if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
|
305 |
else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
|
306 |
else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
|
307 |
-
|
308 |
-
# Method 3: Apify (Fallback 2 - Default YT Actor)
|
309 |
if transcript_text is None:
|
310 |
logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...")
|
311 |
-
if _apify_token_exists:
|
312 |
transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
|
313 |
if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text
|
314 |
else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.")
|
315 |
else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
|
316 |
-
|
317 |
-
# Final Result
|
318 |
if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
|
319 |
return transcript_text
|
320 |
|
321 |
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
|
322 |
-
"""Directly fetches URL content using httpx. (Primary Web Method - Fetching part)"""
|
323 |
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
|
324 |
try:
|
325 |
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
|
326 |
logger.debug(f"[Web Scrape Direct] Sending GET request to {url}")
|
327 |
response = await client.get(url)
|
328 |
logger.debug(f"[Web Scrape Direct] Received response {response.status_code} from {url}")
|
329 |
-
response.raise_for_status()
|
330 |
content_type = response.headers.get('content-type', '').lower()
|
331 |
-
if 'html' not in content_type:
|
332 |
-
logger.warning(f"[Web Scrape Direct] Non-HTML content type received from {url}: {content_type}")
|
333 |
-
return None
|
334 |
try: return response.text
|
335 |
except Exception as e: logger.error(f"[Web Scrape Direct] Error decoding response text for {url}: {e}"); return None
|
336 |
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Direct] HTTP error {e.response.status_code} fetching {url}: {e}")
|
@@ -341,19 +313,15 @@ async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[
|
|
341 |
return None
|
342 |
|
343 |
async def get_website_content(url: str) -> Optional[str]:
|
344 |
-
"""Primary method: Fetches HTML directly and parses with BeautifulSoup."""
|
345 |
if not url: logger.error("[Web Scrape Primary] No URL provided"); return None
|
346 |
logger.info(f"[Web Scrape Primary] Attempting direct fetch and parse for: {url}")
|
347 |
html_content = await fetch_url_content_for_scrape(url)
|
348 |
-
if not html_content:
|
349 |
-
logger.warning(f"[Web Scrape Primary] Direct fetch failed for {url}.")
|
350 |
-
return None
|
351 |
try:
|
352 |
def parse_html(content: str) -> Optional[str]:
|
353 |
try:
|
354 |
soup = BeautifulSoup(content, DEFAULT_PARSER)
|
355 |
-
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]):
|
356 |
-
element.extract()
|
357 |
main_content = soup.find('main') or soup.find('article') or soup.find(role='main') or soup.find(id=re.compile(r'content|main|body', re.I)) or soup.find(class_=re.compile(r'content|main|body|article|post', re.I))
|
358 |
target_element = main_content if main_content else soup.body
|
359 |
if not target_element: logger.warning(f"[Web Scrape Primary Parse] Could not find body or main content container for {url}"); return None
|
@@ -368,7 +336,6 @@ async def get_website_content(url: str) -> Optional[str]:
|
|
368 |
except Exception as e: logger.error(f"[Web Scrape Primary] Unexpected error during parsing process for {url}: {e}", exc_info=True); return None
|
369 |
|
370 |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
371 |
-
"""Fallback 1: Fetches website content using urltotext.com API."""
|
372 |
if not url: logger.error("[Web Scrape Fallback 1] No URL"); return None
|
373 |
if not api_key: logger.error("[Web Scrape Fallback 1] urltotext.com API key missing."); return None
|
374 |
logger.info(f"[Web Scrape Fallback 1] Attempting fetch for: {url} using urltotext.com API")
|
@@ -396,7 +363,6 @@ async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
|
396 |
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
|
397 |
|
398 |
async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
|
399 |
-
"""Fallback 2: Fetches website content using Scraper's Proxy Parser via RapidAPI."""
|
400 |
if not url: logger.error("[Web Scrape Fallback 2] No URL provided"); return None
|
401 |
if not api_key: logger.error("[Web Scrape Fallback 2] RapidAPI key missing."); return None
|
402 |
logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy Parser API")
|
@@ -430,7 +396,6 @@ async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Opti
|
|
430 |
except Exception as e: logger.error(f"[Web Scrape Fallback 2] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
|
431 |
|
432 |
async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]:
|
433 |
-
"""Fallback 3: Fetches website content using AI Web Scraper via RapidAPI."""
|
434 |
if not url: logger.error("[Web Scrape Fallback 3] No URL provided"); return None
|
435 |
if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
|
436 |
logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using AI Web Scraper API")
|
@@ -467,7 +432,6 @@ async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Opti
|
|
467 |
except Exception as e: logger.error(f"[Web Scrape Fallback 3] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
|
468 |
|
469 |
async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
|
470 |
-
"""Generic function to run an Apify actor and get text content."""
|
471 |
if not url: logger.error(f"[{actor_name}] No URL provided"); return None
|
472 |
if not api_token: logger.error(f"[{actor_name}] API token missing."); return None
|
473 |
logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
|
@@ -510,16 +474,14 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
510 |
except Exception as e: logger.error(f"[{actor_name}] Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
|
511 |
|
512 |
async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
|
513 |
-
"""Fallback 4: Fetches website content using Apify Website Content Crawler."""
|
514 |
return await _run_apify_actor_for_web_content( url=url, api_token=api_token, actor_id=APIFY_CRAWLER_ACTOR_ID, actor_name="Apify Crawler" )
|
515 |
|
516 |
async def get_website_content_via_apify_text_scraper(url: str, api_token: str) -> Optional[str]:
|
517 |
-
"""Fallback 5: Fetches website content using Apify Text Scraper Free."""
|
518 |
return await _run_apify_actor_for_web_content( url=url, api_token=api_token, actor_id=APIFY_TEXT_SCRAPER_ACTOR_ID, actor_name="Apify Text Scraper" )
|
519 |
# --- END OMITTED CONTENT FETCHING FUNCTIONS ---
|
520 |
|
521 |
|
522 |
-
# --- Summarization Functions (
|
523 |
|
524 |
# --- Prompts (Defined once, used by all models) ---
|
525 |
PROMPT_PARAGRAPH = (
|
@@ -554,38 +516,33 @@ PROMPT_POINTS = (
|
|
554 |
"Here is the text to summarise:"
|
555 |
)
|
556 |
|
557 |
-
# <<<
|
558 |
async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
|
559 |
-
"""Internal function to call Groq API (Primary). Returns (summary, error_message)."""
|
560 |
-
global GROQ_API_KEY,
|
561 |
if not _groq_enabled:
|
562 |
logger.error("[Groq Primary] Called but is disabled.");
|
563 |
-
return None, "Error: Primary AI service (Groq
|
564 |
-
logger.info(f"[Groq Primary] Generating {summary_type} summary using {
|
565 |
|
566 |
prompt = PROMPT_PARAGRAPH if summary_type == "paragraph" else PROMPT_POINTS
|
567 |
|
568 |
-
# Input Length Check
|
569 |
-
|
570 |
-
MAX_INPUT_LENGTH_GROQ = 20000
|
571 |
if len(text) > MAX_INPUT_LENGTH_GROQ:
|
572 |
logger.warning(f"[Groq Primary] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_GROQ}). Truncating.");
|
573 |
text = text[:MAX_INPUT_LENGTH_GROQ] + "... (Content truncated)"
|
574 |
full_prompt = f"{prompt}\n\n{text}"
|
575 |
|
576 |
try:
|
577 |
-
|
578 |
-
|
579 |
-
api_key=GROQ_API_KEY,
|
580 |
-
timeout=httpx.Timeout(120.0, connect=10.0) # 120s read timeout, 10s connect
|
581 |
-
)
|
582 |
-
logger.info(f"[Groq Primary] Sending request to Groq ({GROQ_LLAMA_MODEL})...")
|
583 |
|
584 |
chat_completion = await groq_client.chat.completions.create(
|
585 |
messages=[ { "role": "user", "content": full_prompt } ],
|
586 |
-
model=
|
587 |
-
temperature=0.7, #
|
588 |
-
max_tokens=2048, #
|
589 |
top_p=1,
|
590 |
stream=False,
|
591 |
stop=None,
|
@@ -598,30 +555,27 @@ async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optio
|
|
598 |
logger.info(f"[Groq Primary] Success generating summary. Finish Reason: {finish_reason}. Output len: {len(summary)}");
|
599 |
return summary.strip(), None
|
600 |
else:
|
601 |
-
# Handle cases where response structure is unexpected or content is empty
|
602 |
logger.warning(f"[Groq Primary] Groq response structure unexpected or content empty. Response: {chat_completion.model_dump_json(indent=2)}")
|
603 |
finish_reason = chat_completion.choices[0].finish_reason if chat_completion.choices else 'N/A'
|
604 |
-
return None, f"Sorry, the primary AI model ({
|
605 |
|
606 |
except GroqError as ge:
|
607 |
-
|
608 |
-
|
609 |
-
error_msg = f"
|
610 |
-
|
611 |
-
elif ge.status_code == 429: error_msg = f"Sorry, primary AI model ({GROQ_LLAMA_MODEL}) is busy (Rate Limit). Try again."
|
612 |
return None, error_msg
|
613 |
except httpx.TimeoutException as te:
|
614 |
logger.error(f"[Groq Primary] Timeout during Groq API call: {te}")
|
615 |
-
return None, f"Sorry, the primary AI service ({
|
616 |
except httpx.RequestError as re:
|
617 |
logger.error(f"[Groq Primary] Network error during Groq API call: {re}")
|
618 |
-
return None, f"Sorry, couldn't connect to the primary AI service ({
|
619 |
except Exception as e:
|
620 |
logger.error(f"[Groq Primary] Unexpected error during Groq API call: {e}", exc_info=True);
|
621 |
-
return None, f"Sorry, an unexpected error occurred while using the primary AI service ({
|
622 |
|
623 |
|
624 |
-
# <<< CHANGE: Modified function to accept model_name >>>
|
625 |
async def _call_gemini(text: str, summary_type: str, model_name: str) -> Tuple[Optional[str], Optional[str]]:
|
626 |
"""Internal function to call Gemini API. Returns (summary, error_message)."""
|
627 |
global _gemini_api_enabled
|
@@ -631,22 +585,15 @@ async def _call_gemini(text: str, summary_type: str, model_name: str) -> Tuple[O
|
|
631 |
logger.info(f"[Gemini {model_name}] Generating {summary_type} summary using {model_name}. Input length: {len(text)}")
|
632 |
|
633 |
prompt = PROMPT_PARAGRAPH if summary_type == "paragraph" else PROMPT_POINTS
|
634 |
-
|
635 |
-
# Input Length Check (Gemini 1.5 has large context, but let's keep a practical limit)
|
636 |
-
MAX_INPUT_LENGTH_GEMINI = 900000 # Keep previous limit, seems reasonable
|
637 |
if len(text) > MAX_INPUT_LENGTH_GEMINI:
|
638 |
logger.warning(f"[Gemini {model_name}] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating.");
|
639 |
text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)"
|
640 |
full_prompt = f"{prompt}\n\n{text}"
|
641 |
-
|
642 |
-
# Safety Settings (Block None)
|
643 |
safety_settings = { HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, }
|
644 |
-
# <<< CHANGE: Removed specific check for CIVIC_INTEGRITY, rely on hasattr instead >>>
|
645 |
-
# Handle potential new categories gracefully
|
646 |
for category_name in dir(HarmCategory):
|
647 |
if category_name.startswith('HARM_CATEGORY_') and getattr(HarmCategory, category_name) not in safety_settings:
|
648 |
safety_settings[getattr(HarmCategory, category_name)] = HarmBlockThreshold.BLOCK_NONE
|
649 |
-
|
650 |
logger.debug(f"[Gemini {model_name}] Using safety settings: { {k.name: v.name for k, v in safety_settings.items()} }")
|
651 |
|
652 |
try:
|
@@ -655,89 +602,77 @@ async def _call_gemini(text: str, summary_type: str, model_name: str) -> Tuple[O
|
|
655 |
logger.info(f"[Gemini {model_name}] Sending request to Gemini ({model_name})...")
|
656 |
request_options = {"timeout": 120}
|
657 |
response = await model.generate_content_async(
|
658 |
-
full_prompt,
|
659 |
-
|
660 |
-
safety_settings=safety_settings,
|
661 |
-
request_options=request_options
|
662 |
-
)
|
663 |
logger.info(f"[Gemini {model_name}] Received response from Gemini.")
|
664 |
-
|
665 |
-
# Check for immediate blocking reasons
|
666 |
if response.prompt_feedback and response.prompt_feedback.block_reason:
|
667 |
block_reason_str = getattr(response.prompt_feedback.block_reason, 'name', str(response.prompt_feedback.block_reason))
|
668 |
logger.warning(f"[Gemini {model_name}] Request blocked by API. Reason: {block_reason_str}");
|
669 |
return None, f"Sorry, the AI model ({model_name}) blocked the request (Reason: {block_reason_str})."
|
670 |
-
|
671 |
-
# Check candidate-level blocking and extract text safely
|
672 |
-
summary = None
|
673 |
-
finish_reason_str = 'UNKNOWN'
|
674 |
if response.candidates:
|
675 |
candidate = response.candidates[0]
|
676 |
finish_reason_name = getattr(candidate.finish_reason, 'name', None)
|
677 |
finish_reason_str = finish_reason_name or 'N/A'
|
678 |
-
|
679 |
if finish_reason_name == 'SAFETY':
|
680 |
safety_ratings_str = ", ".join([f"{rating.category.name}: {rating.probability.name}" for rating in candidate.safety_ratings])
|
681 |
logger.warning(f"[Gemini {model_name}] Candidate blocked due to SAFETY. Finish Reason: {finish_reason_str}. Ratings: [{safety_ratings_str}]")
|
682 |
return None, f"Sorry, the AI model ({model_name}) blocked the response due to safety filters ({finish_reason_str})."
|
683 |
elif finish_reason_name not in ['STOP', 'MAX_TOKENS', None]:
|
684 |
logger.warning(f"[Gemini {model_name}] Candidate finished with non-standard reason: {finish_reason_str}")
|
685 |
-
|
686 |
-
# Safely access content text
|
687 |
if candidate.content and candidate.content.parts:
|
688 |
summary = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
|
689 |
-
|
690 |
-
# Fallback check via response.text
|
691 |
if summary is None:
|
692 |
try: summary = response.text
|
693 |
-
except ValueError as e: logger.warning(f"[Gemini {model_name}] Error accessing response.text (likely blocked
|
694 |
-
|
695 |
if summary:
|
696 |
logger.info(f"[Gemini {model_name}] Success generating summary. Finish Reason: {finish_reason_str}. Output len: {len(summary)}");
|
697 |
return summary.strip(), None
|
698 |
else:
|
699 |
logger.warning(f"[Gemini {model_name}] Gemini returned empty summary or content was blocked. Final Finish Reason: {finish_reason_str}");
|
700 |
return None, f"Sorry, the AI model ({model_name}) did not provide a summary (Finish Reason: {finish_reason_str})."
|
701 |
-
|
702 |
except AttributeError as ae:
|
703 |
logger.error(f"[Gemini {model_name}] AttributeError during Gemini response processing: {ae}. SDK might be incompatible or response structure unexpected.", exc_info=True);
|
704 |
return None, f"Sorry, there was an issue processing the response from the AI service ({model_name})."
|
705 |
except Exception as e:
|
706 |
logger.error(f"[Gemini {model_name}] Unexpected error during Gemini API call: {e}", exc_info=True);
|
707 |
-
# Check for specific Gemini API errors if possible (e.g., AuthenticationFailed, RateLimitExceeded)
|
708 |
-
# This might require inspecting the error details or type.
|
709 |
error_msg = f"Sorry, an unexpected error occurred while using the AI service ({model_name})."
|
710 |
-
# Example: if "API key not valid" in str(e): error_msg = "Error: AI service (Gemini) API key is invalid."
|
711 |
return None, error_msg
|
712 |
|
713 |
-
|
714 |
-
# <<< CHANGE: Function remains the same, but is now the last fallback >>>
|
715 |
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
|
716 |
-
"""Internal function to call OpenRouter API (Final Fallback). Returns (summary, error_message)."""
|
717 |
-
global OPENROUTER_API_KEY,
|
718 |
if not _openrouter_fallback_enabled:
|
719 |
logger.error("[OpenRouter Fallback] Called but is disabled.");
|
720 |
return None, "Error: Final fallback AI service (OpenRouter) not configured/available."
|
721 |
-
logger.info(f"[OpenRouter Fallback] Generating {summary_type} summary using {
|
722 |
|
723 |
prompt = PROMPT_PARAGRAPH if summary_type == "paragraph" else PROMPT_POINTS
|
724 |
-
|
725 |
-
# Input Length Check (Adjust if DeepSeek model limit is known, 100k is generous)
|
726 |
-
MAX_INPUT_LENGTH_OR = 100000
|
727 |
if len(text) > MAX_INPUT_LENGTH_OR:
|
728 |
-
logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}) for {
|
729 |
text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
|
730 |
full_prompt = f"{prompt}\n\n{text}"
|
731 |
|
732 |
-
|
733 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
734 |
openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
|
735 |
-
api_timeouts = httpx.Timeout(connect=10.0, read=60.0, write=10.0, pool=60.0)
|
736 |
response = None
|
737 |
|
738 |
try:
|
739 |
async with httpx.AsyncClient(timeout=api_timeouts) as client:
|
740 |
-
logger.info(f"[OpenRouter Fallback] Sending request to OpenRouter ({
|
741 |
response = await client.post(openrouter_api_endpoint, headers=headers, json=payload)
|
742 |
logger.info(f"[OpenRouter Fallback] Received response from OpenRouter. Status code: {response.status_code}")
|
743 |
|
@@ -749,116 +684,112 @@ async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str],
|
|
749 |
if message and isinstance(message, dict):
|
750 |
summary = message.get("content")
|
751 |
if summary: logger.info(f"[OpenRouter Fallback] Success via OpenRouter. Finish: {finish_reason}. Output len: {len(summary)}"); return summary.strip(), None
|
752 |
-
else: logger.warning(f"[OpenRouter Fallback] OpenRouter success but content empty. Finish: {finish_reason}. Resp: {data}"); return None, f"Sorry, the fallback AI model ({
|
753 |
else: logger.error(f"[OpenRouter Fallback] Unexpected message structure: {message}. Finish: {finish_reason}. Full: {data}"); return None, "Sorry, could not parse fallback AI response (message format)."
|
754 |
else:
|
755 |
error_details = data.get("error", {}); logger.error(f"[OpenRouter Fallback] Unexpected choices structure or error in response: {data.get('choices')}. Error: {error_details}. Full: {data}");
|
756 |
return None, f"Sorry, could not parse fallback AI response (choices structure or error: {error_details.get('message', 'Unknown')})."
|
757 |
except json.JSONDecodeError: logger.error(f"[OpenRouter Fallback] Failed JSON decode OpenRouter. Status:{response.status_code}. Resp:{response.text[:500]}"); return None, "Sorry, failed to understand fallback AI response."
|
758 |
except Exception as e: logger.error(f"[OpenRouter Fallback] Error processing OpenRouter success response: {e}", exc_info=True); return None, "Sorry, error processing fallback AI response."
|
759 |
-
elif response.status_code == 401: logger.error("[OpenRouter Fallback] API key invalid (401)."); return None, "Error: Fallback AI model configuration key is invalid."
|
760 |
-
elif response.status_code == 402: logger.error("[OpenRouter Fallback] Payment Required/Quota Exceeded (402)."); return None, f"Sorry, fallback AI service ({
|
761 |
-
elif response.status_code == 429: logger.warning("[OpenRouter Fallback] Rate Limit Exceeded (429)."); return None, f"Sorry, fallback AI model ({
|
762 |
-
elif response.status_code == 500: logger.error(f"[OpenRouter Fallback] Internal Server Error (500). Resp:{response.text[:500]}"); return None, f"Sorry, fallback AI service ({
|
763 |
else:
|
764 |
error_info = "";
|
765 |
try: error_info = response.json().get("error", {}).get("message", "")
|
766 |
except Exception: pass
|
767 |
logger.error(f"[OpenRouter Fallback] Unexpected status {response.status_code}. Error: '{error_info}' Resp:{response.text[:500]}");
|
768 |
-
return None, f"Sorry, fallback AI service ({
|
769 |
|
770 |
-
except httpx.TimeoutException as e: logger.error(f"[OpenRouter Fallback] Timeout error ({type(e)}) connecting/reading from OpenRouter API: {e}"); return None, f"Sorry, the fallback AI service ({
|
771 |
except httpx.RequestError as e: logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}"); return None, "Sorry, there was an error connecting to the fallback AI model service."
|
772 |
except Exception as e: logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter call: {e}", exc_info=True); return None, "Sorry, an unexpected error occurred while using the fallback AI service."
|
773 |
|
774 |
|
775 |
-
# <<< CHANGE: Rewritten function for new model hierarchy >>>
|
776 |
async def generate_summary(text: str, summary_type: str) -> str:
|
777 |
"""
|
778 |
-
Generates summary using the
|
779 |
-
1. Groq (Llama 4
|
780 |
-
2. Gemini (2.5 Pro)
|
781 |
3. Gemini (2.0 Flash)
|
782 |
-
4. OpenRouter (DeepSeek)
|
783 |
Returns the summary text or a comprehensive error message.
|
784 |
"""
|
785 |
global _groq_enabled, _gemini_api_enabled, _openrouter_fallback_enabled
|
786 |
-
global
|
787 |
|
788 |
-
logger.info("[Summary Generation] Starting process with
|
789 |
summary: Optional[str] = None
|
790 |
-
errors: Dict[str, Optional[str]] = {
|
791 |
-
"
|
792 |
-
"
|
793 |
"GeminiFlash": None,
|
794 |
-
"
|
795 |
}
|
796 |
|
797 |
-
# --- Attempt 1: Groq (
|
798 |
if _groq_enabled:
|
799 |
-
logger.info(f"[Summary Generation] Attempting 1: Groq ({
|
800 |
-
summary, errors["
|
801 |
if summary:
|
802 |
-
logger.info(f"[Summary Generation] Success with Groq ({
|
803 |
return summary
|
804 |
else:
|
805 |
-
logger.warning(f"[Summary Generation] Groq failed. Error: {errors['
|
806 |
else:
|
807 |
-
logger.warning("[Summary Generation] Groq is disabled or unavailable. Skipping.")
|
808 |
-
errors["
|
809 |
|
810 |
-
# --- Attempt 2: Gemini 2.5 Pro ---
|
811 |
if _gemini_api_enabled:
|
812 |
-
logger.info(f"[Summary Generation] Attempting 2: Gemini ({
|
813 |
-
summary, errors["
|
814 |
if summary:
|
815 |
-
logger.info(f"[Summary Generation] Success with Gemini ({
|
816 |
return summary
|
817 |
else:
|
818 |
-
logger.warning(f"[Summary Generation] Gemini Pro failed. Error: {errors['
|
819 |
else:
|
820 |
-
logger.warning("[Summary Generation] Gemini API is disabled or unavailable. Skipping Gemini Pro & Flash.")
|
821 |
-
errors["
|
822 |
-
errors["GeminiFlash"] = "Service disabled/unavailable."
|
823 |
|
824 |
# --- Attempt 3: Gemini 2.0 Flash ---
|
825 |
-
if _gemini_api_enabled and errors["GeminiFlash"] is None:
|
826 |
logger.info(f"[Summary Generation] Attempting 3: Gemini ({GEMINI_FLASH_MODEL})")
|
827 |
summary, errors["GeminiFlash"] = await _call_gemini(text, summary_type, GEMINI_FLASH_MODEL)
|
828 |
if summary:
|
829 |
logger.info(f"[Summary Generation] Success with Gemini ({GEMINI_FLASH_MODEL}).")
|
830 |
return summary
|
831 |
else:
|
832 |
-
logger.warning(f"[Summary Generation] Gemini Flash failed. Error: {errors['GeminiFlash']}. Proceeding to OpenRouter.")
|
833 |
-
elif errors["GeminiFlash"] is None:
|
834 |
-
logger.warning("[Summary Generation] Skipping Gemini Flash (API was disabled).")
|
835 |
errors["GeminiFlash"] = "Service disabled/unavailable."
|
836 |
|
837 |
-
|
838 |
-
# --- Attempt 4: OpenRouter (Final Fallback) ---
|
839 |
if _openrouter_fallback_enabled:
|
840 |
-
logger.info(f"[Summary Generation] Attempting 4: OpenRouter ({
|
841 |
-
summary, errors["
|
842 |
if summary:
|
843 |
-
logger.info(f"[Summary Generation] Success with OpenRouter ({
|
844 |
return summary
|
845 |
else:
|
846 |
-
logger.error(f"[Summary Generation] OpenRouter (Final Fallback) also failed. Error: {errors['
|
847 |
else:
|
848 |
-
logger.error("[Summary Generation] OpenRouter fallback is disabled or unavailable. Cannot proceed.")
|
849 |
-
errors["
|
850 |
|
851 |
# --- All Attempts Failed ---
|
852 |
logger.error("[Summary Generation] All summarization models failed.")
|
853 |
-
# Construct a final error message
|
854 |
error_details = "\n".join([f"- {model}: {err}" for model, err in errors.items() if err])
|
855 |
return f"Sorry, I couldn't generate a summary after trying all available AI models.\nDetails:\n{error_details}"
|
856 |
|
857 |
|
858 |
# --- Main Processing Logic ---
|
859 |
-
# (process_summary_task remains UNCHANGED in its core logic
|
860 |
-
# but
|
861 |
-
# Omitted here for brevity, but should be included in the final file.)
|
862 |
# --- START OMITTED process_summary_task ---
|
863 |
async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
|
864 |
"""Handles the entire process: fetching content (with ALL fallbacks) and summarizing."""
|
@@ -897,12 +828,12 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
|
|
897 |
else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
|
898 |
if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
|
899 |
else:
|
900 |
-
global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN, _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists
|
901 |
|
902 |
logger.info(f"[Task {task_id}] Trying Web Scrape Method 1 (Direct Fetch + BS4)..."); content = await get_website_content(url)
|
903 |
if not content:
|
904 |
logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 2 (urltotext.com)...")
|
905 |
-
if _urltotext_key_exists: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing'); content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
|
906 |
else: logger.warning("[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.")
|
907 |
if not content:
|
908 |
logger.warning(f"[Task {task_id}] Method 2 failed. Trying Method 3 (Scraper's Proxy)...")
|
@@ -932,8 +863,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
|
|
932 |
except Exception as edit_e: logger.warning(f"[Task {task_id}] Failed to edit status message before summary: {edit_e}")
|
933 |
|
934 |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
935 |
-
|
936 |
-
final_summary = await generate_summary(content, summary_type)
|
937 |
|
938 |
if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
|
939 |
user_feedback_message = final_summary
|
@@ -978,8 +908,7 @@ async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit:
|
|
978 |
|
979 |
# --- Telegram Handlers ---
|
980 |
# (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler
|
981 |
-
# remain UNCHANGED
|
982 |
-
# to check the new availability flags. Omitted here for brevity, but include in final file.)
|
983 |
# --- START OMITTED TELEGRAM HANDLERS ---
|
984 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
985 |
user = update.effective_user; mention = user.mention_html()
|
@@ -996,7 +925,7 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
|
|
996 |
"2. I'll ask how you want it summarised (paragraph or points).\n"
|
997 |
"3. Click the button for your choice.\n"
|
998 |
"4. Wait while I fetch the content and generate the summary!\n\n"
|
999 |
-
"βοΈ I try multiple methods to get content, especially for tricky websites or YouTube videos without standard transcripts. I then use a sequence of AI models (Llama, Gemini Pro, Gemini Flash, DeepSeek) to summarise.\n\n" #
|
1000 |
"**Commands:**\n"
|
1001 |
"`/start` - Display the welcome message\n"
|
1002 |
"`/help` - Show this help message" )
|
@@ -1054,7 +983,7 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
|
|
1054 |
|
1055 |
context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}")
|
1056 |
|
1057 |
-
#
|
1058 |
global TELEGRAM_TOKEN, _groq_enabled, _gemini_api_enabled, _openrouter_fallback_enabled
|
1059 |
if not TELEGRAM_TOKEN:
|
1060 |
logger.critical("TELEGRAM_TOKEN missing in callback!")
|
@@ -1066,16 +995,10 @@ async def handle_summary_type_callback(update: Update, context: ContextTypes.DEF
|
|
1066 |
try: await query.edit_message_text(text="β AI configuration error: No summarization models available.")
|
1067 |
except Exception: pass
|
1068 |
return
|
1069 |
-
# Log warnings if
|
1070 |
-
if not _groq_enabled: logger.warning("Primary AI (Groq) is unavailable
|
1071 |
-
if not _gemini_api_enabled: logger.warning("Gemini API is unavailable
|
1072 |
-
if not _openrouter_fallback_enabled
|
1073 |
-
# This case should already be caught above, but as a safeguard
|
1074 |
-
logger.critical("No models available at all!")
|
1075 |
-
try: await query.edit_message_text(text="β AI configuration error: No summarization models available.")
|
1076 |
-
except Exception: pass
|
1077 |
-
return
|
1078 |
-
|
1079 |
|
1080 |
logger.info(f"Scheduling background task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit}")
|
1081 |
asyncio.create_task(
|
@@ -1097,10 +1020,10 @@ async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> N
|
|
1097 |
|
1098 |
|
1099 |
# --- Application Setup & Web Framework ---
|
1100 |
-
# (setup_bot_config, lifespan,
|
1101 |
-
# remain
|
1102 |
# Omitted here for brevity, include in final file.)
|
1103 |
-
# --- START OMITTED APP SETUP/WEB FRAMEWORK ---
|
1104 |
async def setup_bot_config() -> Application:
|
1105 |
logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
|
1106 |
if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
|
@@ -1170,10 +1093,35 @@ async def lifespan(app: Starlette):
|
|
1170 |
else: logger.info("PTB application was not fully initialized or failed during startup. No shutdown actions needed.")
|
1171 |
logger.info("ASGI Lifespan: Shutdown complete.")
|
1172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1173 |
async def health_check(request: Request) -> PlainTextResponse:
|
1174 |
"""Simple health check endpoint."""
|
1175 |
-
|
1176 |
-
global GROQ_LLAMA_MODEL, GEMINI_PRO_MODEL, GEMINI_FLASH_MODEL, OPENROUTER_MODEL, APIFY_ACTOR_ID
|
1177 |
global _groq_enabled, _gemini_api_enabled, _openrouter_fallback_enabled
|
1178 |
global _apify_token_exists, _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY
|
1179 |
|
@@ -1191,15 +1139,15 @@ async def health_check(request: Request) -> PlainTextResponse:
|
|
1191 |
bot_status = f"Error checking status: {type(e).__name__}"; logger.warning(f"Health check: Error getting bot info: {e}")
|
1192 |
else: bot_status = "Not Initialized"; bot_username = "N/A"
|
1193 |
|
1194 |
-
# <<<
|
1195 |
return PlainTextResponse(
|
1196 |
f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
|
1197 |
f"---\n"
|
1198 |
-
f"Summarizer Priority:\n"
|
1199 |
-
f"1. Groq API: {
|
1200 |
-
f"2. Gemini API
|
1201 |
-
f"3. Gemini API
|
1202 |
-
f"4. OpenRouter API: {
|
1203 |
f"---\n"
|
1204 |
f"Content Fetching Status:\n"
|
1205 |
f"YT Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
|
@@ -1210,31 +1158,6 @@ async def health_check(request: Request) -> PlainTextResponse:
|
|
1210 |
f"Web Scrape 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled'}"
|
1211 |
)
|
1212 |
|
1213 |
-
async def telegram_webhook(request: Request) -> Response:
|
1214 |
-
"""Handles incoming updates from Telegram."""
|
1215 |
-
global WEBHOOK_SECRET
|
1216 |
-
if not ptb_app: logger.error("Webhook received but PTB application not initialized."); return PlainTextResponse('Bot not initialized', status_code=503)
|
1217 |
-
if not ptb_app.running: logger.warning("Webhook received but PTB application not running."); return PlainTextResponse('Bot not running, cannot process update', status_code=503)
|
1218 |
-
if WEBHOOK_SECRET:
|
1219 |
-
token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
|
1220 |
-
if token_header != WEBHOOK_SECRET: logger.warning(f"Webhook received with invalid secret token. Header: '{token_header}'"); return Response(content="Invalid secret token", status_code=403)
|
1221 |
-
try:
|
1222 |
-
update_data = await request.json(); update = Update.de_json(data=update_data, bot=ptb_app.bot)
|
1223 |
-
logger.debug(f"Processing update_id: {update.update_id} via webhook"); await ptb_app.process_update(update)
|
1224 |
-
return Response(status_code=200)
|
1225 |
-
except json.JSONDecodeError: logger.error("Webhook received invalid JSON."); return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
|
1226 |
-
except Exception as e: logger.error(f"Error processing webhook update: {e}", exc_info=True); return Response(status_code=200)
|
1227 |
-
|
1228 |
-
# --- Starlette App Definition ---
|
1229 |
-
app = Starlette(
|
1230 |
-
debug=False,
|
1231 |
-
lifespan=lifespan,
|
1232 |
-
routes=[ Route("/", endpoint=health_check, methods=["GET"]), Route("/webhook", endpoint=telegram_webhook, methods=["POST"]), ]
|
1233 |
-
)
|
1234 |
-
logger.info("Starlette ASGI application created with health check and webhook routes.")
|
1235 |
-
# --- END OMITTED APP SETUP/WEB FRAMEWORK ---
|
1236 |
-
|
1237 |
-
|
1238 |
# --- Development Server (if run directly) ---
|
1239 |
if __name__ == '__main__':
|
1240 |
import uvicorn
|
|
|
1 |
+
# main.py (Updated for Specific April 2025 Models: Llama 4 Scout & DeepSeek V3 Free)
|
2 |
import os
|
3 |
import re
|
4 |
import logging
|
|
|
7 |
import html
|
8 |
import contextlib
|
9 |
import traceback
|
10 |
+
import urllib.parse
|
11 |
from typing import Optional, Dict, Any, Tuple
|
12 |
|
13 |
# --- Frameworks ---
|
|
|
45 |
try:
|
46 |
import google.generativeai as genai
|
47 |
from google.generativeai.types import HarmCategory, HarmBlockThreshold
|
48 |
+
_gemini_sdk_available = True
|
49 |
except ImportError:
|
50 |
genai = None
|
51 |
HarmCategory = None
|
52 |
HarmBlockThreshold = None
|
53 |
_gemini_sdk_available = False
|
|
|
54 |
|
55 |
+
# --- Groq SDK ---
|
56 |
try:
|
57 |
from groq import Groq, GroqError
|
58 |
_groq_sdk_available = True
|
|
|
71 |
logging.getLogger('uvicorn').setLevel(logging.INFO)
|
72 |
logging.getLogger('starlette').setLevel(logging.INFO)
|
73 |
if _gemini_sdk_available: logging.getLogger("google.ai.generativelanguage").setLevel(logging.WARNING)
|
74 |
+
if _groq_sdk_available: logging.getLogger("groq").setLevel(logging.INFO)
|
75 |
logger = logging.getLogger(__name__)
|
76 |
logger.info(f"Logging configured. Using BS4 parser: {DEFAULT_PARSER}")
|
77 |
if not _gemini_sdk_available: logger.warning("google-generativeai library not found. Gemini functionality disabled.")
|
78 |
+
if not _groq_sdk_available: logger.warning("groq library not found. Groq functionality disabled.")
|
79 |
|
80 |
|
81 |
# --- Global variable for PTB app ---
|
|
|
90 |
return value
|
91 |
|
92 |
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
|
93 |
+
GROQ_API_KEY = get_secret('GROQ_API_KEY') # For Llama 4
|
94 |
+
GEMINI_API_KEY = get_secret('GEMINI_API_KEY') # For Gemini 2.5 Pro and 2.0 Flash
|
95 |
+
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY') # For DeepSeek
|
96 |
+
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY')
|
97 |
+
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY')
|
98 |
+
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
|
99 |
+
RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY')
|
100 |
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
|
101 |
|
102 |
+
# <<< CHANGE: Using EXACT Model Identifiers from User Docs >>>
|
103 |
# Model Priority:
|
104 |
+
# 1. Groq Llama 4 Scout
|
105 |
+
# 2. Gemini 2.5 Pro Exp
|
106 |
# 3. Gemini 2.0 Flash
|
107 |
+
# 4. OpenRouter DeepSeek V3 Free
|
108 |
+
GROQ_LLAMA4_MODEL = os.environ.get("GROQ_LLAMA4_MODEL", "meta-llama/llama-4-scout-17b-16e-instruct") # <<< Specific Llama 4 model
|
109 |
+
GEMINI_PRO_EXP_MODEL = os.environ.get("GEMINI_PRO_EXP_MODEL", "gemini-2.5-pro-exp-03-25")
|
110 |
+
GEMINI_FLASH_MODEL = os.environ.get("GEMINI_FLASH_MODEL", "gemini-2.0-flash-001")
|
111 |
+
OPENROUTER_DEEPSEEK_MODEL = os.environ.get("OPENROUTER_DEEPSEEK_MODEL", "deepseek/deepseek-chat-v3-0324:free") # <<< Specific DeepSeek model
|
112 |
|
113 |
+
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts") # YT Default
|
114 |
APIFY_CRAWLER_ACTOR_ID = "apify/website-content-crawler" # Scrape Fallback 4
|
115 |
APIFY_TEXT_SCRAPER_ACTOR_ID = "karamelo/text-scraper-free" # Scrape Fallback 5
|
116 |
|
|
|
119 |
|
120 |
# Summarizer Availability Checks
|
121 |
_groq_enabled = _groq_sdk_available and bool(GROQ_API_KEY)
|
122 |
+
_gemini_api_enabled = _gemini_sdk_available and bool(GEMINI_API_KEY)
|
123 |
_openrouter_fallback_enabled = bool(OPENROUTER_API_KEY)
|
124 |
|
125 |
if not _groq_enabled:
|
126 |
+
if not _groq_sdk_available: logger.error("β ERROR: groq library missing. Groq (Llama 4) disabled.")
|
127 |
+
elif not GROQ_API_KEY: logger.error("β ERROR: GROQ_API_KEY not found. Primary summarization (Groq Llama 4) will fail.")
|
128 |
if not _gemini_api_enabled:
|
129 |
if not _gemini_sdk_available: logger.warning("β οΈ WARNING: google-generativeai library missing. Gemini disabled.")
|
130 |
+
elif not GEMINI_API_KEY: logger.warning("β οΈ WARNING: GEMINI_API_KEY not found. Gemini summarization fallbacks (2.5 Pro / 2.0 Flash) will fail.")
|
131 |
if not _openrouter_fallback_enabled: logger.warning("β οΈ WARNING: OPENROUTER_API_KEY not found. Final fallback summarization (DeepSeek) will fail.")
|
132 |
|
133 |
if not _groq_enabled and not _gemini_api_enabled and not _openrouter_fallback_enabled:
|
134 |
logger.critical("β FATAL: No summarization models are configured or enabled. Bot cannot function.")
|
|
|
|
|
135 |
elif not _groq_enabled:
|
136 |
+
logger.warning("β οΈ Primary summarizer (Groq Llama 4) is disabled. Will start with Gemini 2.5 Pro.")
|
137 |
|
138 |
# Scraper Availability Checks (Warnings only)
|
139 |
if not RAPIDAPI_KEY: logger.warning("β οΈ WARNING: RAPIDAPI_KEY not found. RapidAPI scraping fallbacks (2 & 3) will be unavailable.")
|
|
|
143 |
if not WEBHOOK_SECRET: logger.info("Optional secret 'WEBHOOK_SECRET' not found. Webhook security disabled.")
|
144 |
|
145 |
logger.info("Secret loading and configuration check finished.")
|
146 |
+
logger.info(f"Summarizer 1 (Groq): {GROQ_LLAMA4_MODEL if _groq_enabled else 'DISABLED'}")
|
147 |
+
logger.info(f"Summarizer 2 (Gemini Pro Exp): {GEMINI_PRO_EXP_MODEL if _gemini_api_enabled else 'DISABLED'}")
|
148 |
+
logger.info(f"Summarizer 3 (Gemini Flash): {GEMINI_FLASH_MODEL if _gemini_api_enabled else 'DISABLED'}")
|
149 |
+
logger.info(f"Summarizer 4 (OpenRouter): {OPENROUTER_DEEPSEEK_MODEL if _openrouter_fallback_enabled else 'DISABLED'}")
|
150 |
logger.info(f"Using Apify Actor (YT Default): {APIFY_ACTOR_ID}")
|
151 |
logger.info(f"Using Apify Actor (Web Scrape Fallback 4): {APIFY_CRAWLER_ACTOR_ID}")
|
152 |
logger.info(f"Using Apify Actor (Web Scrape Fallback 5): {APIFY_TEXT_SCRAPER_ACTOR_ID}")
|
153 |
|
154 |
+
# Flags for scraper key existence
|
155 |
_apify_token_exists = bool(APIFY_API_TOKEN)
|
156 |
_urltotext_key_exists = bool(URLTOTEXT_API_KEY)
|
157 |
_rapidapi_key_exists = bool(RAPIDAPI_KEY)
|
158 |
|
159 |
# --- Configure APIs ---
|
160 |
+
if _gemini_api_enabled:
|
161 |
try: genai.configure(api_key=GEMINI_API_KEY); logger.info("Google GenAI client configured successfully.")
|
162 |
except Exception as e: logger.error(f"Failed to configure Google GenAI client: {e}"); _gemini_api_enabled = False
|
163 |
|
164 |
+
# Groq client is initialized per-request in the _call_groq function
|
165 |
|
166 |
# --- Retry Decorator ---
|
167 |
@retry( stop=stop_after_attempt(4), wait=wait_exponential(multiplier=1, min=2, max=15), retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)), before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True )
|
|
|
184 |
if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id
|
185 |
else: logger.warning(f"Could not extract YT ID from {url}"); return None
|
186 |
|
|
|
187 |
# --- Content Fetching Functions ---
|
188 |
# (These functions: get_transcript_via_supadata, get_transcript_via_apify, get_youtube_transcript,
|
189 |
# fetch_url_content_for_scrape, get_website_content, get_website_content_via_api,
|
190 |
# get_website_content_via_scrapers_proxy, get_website_content_via_ai_web_scraper,
|
191 |
# _run_apify_actor_for_web_content, get_website_content_via_apify_crawler,
|
192 |
# get_website_content_via_apify_text_scraper remain UNCHANGED. They are omitted here for brevity
|
193 |
+
# but MUST be included in the final main.py file)
|
194 |
# --- START OMITTED CONTENT FETCHING FUNCTIONS ---
|
195 |
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
|
196 |
if not video_id: logger.error("[Supadata] No video_id provided"); return None
|
|
|
224 |
|
225 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
|
226 |
"""Fallback YT 2: Fetches YouTube transcript using default Apify Actor."""
|
227 |
+
global APIFY_ACTOR_ID
|
228 |
if not video_url: logger.error("[Apify YT] No video_url provided"); return None
|
229 |
if not api_token: logger.error("[Apify YT] API token missing."); return None
|
230 |
logger.info(f"[YT Fallback 2] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
|
|
|
231 |
sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
|
232 |
params = {"token": api_token}
|
233 |
+
payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5, "channelHandleBoolean": False, "channelNameBoolean": False, "datePublishedBoolean": False, "relativeDateTextBoolean": False, }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
headers = {"Content-Type": "application/json"}
|
|
|
235 |
try:
|
236 |
+
async with httpx.AsyncClient(timeout=120.0) as client:
|
237 |
logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}")
|
238 |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
|
239 |
logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}")
|
|
|
240 |
if response.status_code == 200:
|
241 |
try:
|
242 |
results = response.json()
|
243 |
if isinstance(results, list) and len(results) > 0:
|
244 |
+
item = results[0]; content = None
|
|
|
|
|
245 |
if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
|
246 |
elif "text" in item and isinstance(item["text"], str): content = item["text"]
|
247 |
elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"]
|
248 |
+
elif "captions" in item and isinstance(item["captions"], list):
|
249 |
if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
|
250 |
elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])
|
|
|
251 |
if content and isinstance(content, str): logger.info(f"[Apify YT] Success via REST for {video_url}. Length: {len(content)}"); return content.strip()
|
252 |
else: logger.warning(f"[Apify YT] Dataset item parsed but transcript content empty/invalid format for {video_url}. Item keys: {list(item.keys())}"); return None
|
253 |
else: logger.warning(f"[Apify YT] Actor success but dataset was empty for {video_url}. Response: {results}"); return None
|
|
|
263 |
except Exception as e: logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True); return None
|
264 |
|
265 |
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
|
266 |
+
global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
|
267 |
if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
|
268 |
logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
|
269 |
transcript_text = None
|
|
|
270 |
logger.info("[Primary YT] Attempting youtube-transcript-api...")
|
271 |
try:
|
272 |
transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
|
|
|
276 |
except NoTranscriptFound: logger.warning(f"[Primary YT] No transcript found via lib for {video_id}.")
|
277 |
except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
|
278 |
except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None
|
|
|
|
|
279 |
if transcript_text is None:
|
280 |
logger.info("[Fallback YT 1] Trying Supadata API...")
|
281 |
if SUPADATA_API_KEY:
|
|
|
283 |
if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
|
284 |
else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
|
285 |
else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
|
|
|
|
|
286 |
if transcript_text is None:
|
287 |
logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...")
|
288 |
+
if _apify_token_exists:
|
289 |
transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
|
290 |
if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text
|
291 |
else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.")
|
292 |
else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
|
|
|
|
|
293 |
if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
|
294 |
return transcript_text
|
295 |
|
296 |
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
|
|
|
297 |
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
|
298 |
try:
|
299 |
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
|
300 |
logger.debug(f"[Web Scrape Direct] Sending GET request to {url}")
|
301 |
response = await client.get(url)
|
302 |
logger.debug(f"[Web Scrape Direct] Received response {response.status_code} from {url}")
|
303 |
+
response.raise_for_status()
|
304 |
content_type = response.headers.get('content-type', '').lower()
|
305 |
+
if 'html' not in content_type: logger.warning(f"[Web Scrape Direct] Non-HTML content type received from {url}: {content_type}"); return None
|
|
|
|
|
306 |
try: return response.text
|
307 |
except Exception as e: logger.error(f"[Web Scrape Direct] Error decoding response text for {url}: {e}"); return None
|
308 |
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Direct] HTTP error {e.response.status_code} fetching {url}: {e}")
|
|
|
313 |
return None
|
314 |
|
315 |
async def get_website_content(url: str) -> Optional[str]:
|
|
|
316 |
if not url: logger.error("[Web Scrape Primary] No URL provided"); return None
|
317 |
logger.info(f"[Web Scrape Primary] Attempting direct fetch and parse for: {url}")
|
318 |
html_content = await fetch_url_content_for_scrape(url)
|
319 |
+
if not html_content: logger.warning(f"[Web Scrape Primary] Direct fetch failed for {url}."); return None
|
|
|
|
|
320 |
try:
|
321 |
def parse_html(content: str) -> Optional[str]:
|
322 |
try:
|
323 |
soup = BeautifulSoup(content, DEFAULT_PARSER)
|
324 |
+
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area"]): element.extract()
|
|
|
325 |
main_content = soup.find('main') or soup.find('article') or soup.find(role='main') or soup.find(id=re.compile(r'content|main|body', re.I)) or soup.find(class_=re.compile(r'content|main|body|article|post', re.I))
|
326 |
target_element = main_content if main_content else soup.body
|
327 |
if not target_element: logger.warning(f"[Web Scrape Primary Parse] Could not find body or main content container for {url}"); return None
|
|
|
336 |
except Exception as e: logger.error(f"[Web Scrape Primary] Unexpected error during parsing process for {url}: {e}", exc_info=True); return None
|
337 |
|
338 |
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
|
|
339 |
if not url: logger.error("[Web Scrape Fallback 1] No URL"); return None
|
340 |
if not api_key: logger.error("[Web Scrape Fallback 1] urltotext.com API key missing."); return None
|
341 |
logger.info(f"[Web Scrape Fallback 1] Attempting fetch for: {url} using urltotext.com API")
|
|
|
363 |
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
|
364 |
|
365 |
async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
|
|
|
366 |
if not url: logger.error("[Web Scrape Fallback 2] No URL provided"); return None
|
367 |
if not api_key: logger.error("[Web Scrape Fallback 2] RapidAPI key missing."); return None
|
368 |
logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy Parser API")
|
|
|
396 |
except Exception as e: logger.error(f"[Web Scrape Fallback 2] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
|
397 |
|
398 |
async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]:
|
|
|
399 |
if not url: logger.error("[Web Scrape Fallback 3] No URL provided"); return None
|
400 |
if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
|
401 |
logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using AI Web Scraper API")
|
|
|
432 |
except Exception as e: logger.error(f"[Web Scrape Fallback 3] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
|
433 |
|
434 |
async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
|
|
|
435 |
if not url: logger.error(f"[{actor_name}] No URL provided"); return None
|
436 |
if not api_token: logger.error(f"[{actor_name}] API token missing."); return None
|
437 |
logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
|
|
|
474 |
except Exception as e: logger.error(f"[{actor_name}] Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
|
475 |
|
476 |
async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
    """Web-scrape fallback 5: run the generic Apify crawler actor for *url*.

    Thin wrapper around the shared Apify runner; returns the extracted text
    or None on failure (the runner does its own logging).
    """
    return await _run_apify_actor_for_web_content(
        url=url,
        api_token=api_token,
        actor_id=APIFY_CRAWLER_ACTOR_ID,
        actor_name="Apify Crawler",
    )
|
478 |
|
479 |
async def get_website_content_via_apify_text_scraper(url: str, api_token: str) -> Optional[str]:
    """Web-scrape fallback 6: run the Apify text-scraper actor for *url*.

    Thin wrapper around the shared Apify runner; returns the extracted text
    or None on failure (the runner does its own logging).
    """
    return await _run_apify_actor_for_web_content(
        url=url,
        api_token=api_token,
        actor_id=APIFY_TEXT_SCRAPER_ACTOR_ID,
        actor_name="Apify Text Scraper",
    )
|
481 |
# --- END OMITTED CONTENT FETCHING FUNCTIONS ---
|
482 |
|
483 |
|
484 |
+
# --- Summarization Functions (Using Specific April 2025 Models) ---
|
485 |
|
486 |
# --- Prompts (Defined once, used by all models) ---
|
487 |
PROMPT_PARAGRAPH = (
|
|
|
516 |
"Here is the text to summarise:"
|
517 |
)
|
518 |
|
519 |
+
# <<< Uses the specific GROQ_LLAMA4_MODEL constant >>>
|
520 |
async def _call_groq(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
|
521 |
+
"""Internal function to call Groq API (Primary - Llama 4 Scout). Returns (summary, error_message)."""
|
522 |
+
global GROQ_API_KEY, GROQ_LLAMA4_MODEL, _groq_enabled
|
523 |
if not _groq_enabled:
|
524 |
logger.error("[Groq Primary] Called but is disabled.");
|
525 |
+
return None, f"Error: Primary AI service (Groq {GROQ_LLAMA4_MODEL}) not configured/available."
|
526 |
+
logger.info(f"[Groq Primary] Generating {summary_type} summary using {GROQ_LLAMA4_MODEL}. Input length: {len(text)}")
|
527 |
|
528 |
prompt = PROMPT_PARAGRAPH if summary_type == "paragraph" else PROMPT_POINTS
|
529 |
|
530 |
+
# Input Length Check for Llama 4 Scout (16k context? Be conservative)
|
531 |
+
MAX_INPUT_LENGTH_GROQ = 40000 # ~13k tokens
|
|
|
532 |
if len(text) > MAX_INPUT_LENGTH_GROQ:
|
533 |
logger.warning(f"[Groq Primary] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_GROQ}). Truncating.");
|
534 |
text = text[:MAX_INPUT_LENGTH_GROQ] + "... (Content truncated)"
|
535 |
full_prompt = f"{prompt}\n\n{text}"
|
536 |
|
537 |
try:
|
538 |
+
groq_client = Groq( api_key=GROQ_API_KEY, timeout=httpx.Timeout(120.0, connect=10.0) )
|
539 |
+
logger.info(f"[Groq Primary] Sending request to Groq ({GROQ_LLAMA4_MODEL})...")
|
|
|
|
|
|
|
|
|
540 |
|
541 |
chat_completion = await groq_client.chat.completions.create(
|
542 |
messages=[ { "role": "user", "content": full_prompt } ],
|
543 |
+
model=GROQ_LLAMA4_MODEL, # <<< Use specific Llama 4 model name
|
544 |
+
temperature=0.7, # <<< Groq default is 1, adjust if needed
|
545 |
+
max_tokens=2048, # <<< Groq default is 1024, adjust if needed for longer summaries
|
546 |
top_p=1,
|
547 |
stream=False,
|
548 |
stop=None,
|
|
|
555 |
logger.info(f"[Groq Primary] Success generating summary. Finish Reason: {finish_reason}. Output len: {len(summary)}");
|
556 |
return summary.strip(), None
|
557 |
else:
|
|
|
558 |
logger.warning(f"[Groq Primary] Groq response structure unexpected or content empty. Response: {chat_completion.model_dump_json(indent=2)}")
|
559 |
finish_reason = chat_completion.choices[0].finish_reason if chat_completion.choices else 'N/A'
|
560 |
+
return None, f"Sorry, the primary AI model ({GROQ_LLAMA4_MODEL}) provided an empty or invalid response (Finish Reason: {finish_reason})."
|
561 |
|
562 |
except GroqError as ge:
|
563 |
+
logger.error(f"[Groq Primary] Groq API error: {ge.status_code} - {ge.message}", exc_info=False)
|
564 |
+
error_msg = f"Sorry, the primary AI service ({GROQ_LLAMA4_MODEL}) failed. API Error: {ge.status_code}."
|
565 |
+
if ge.status_code == 401: error_msg = f"Error: Primary AI service (Groq {GROQ_LLAMA4_MODEL}) API key is invalid."
|
566 |
+
elif ge.status_code == 429: error_msg = f"Sorry, primary AI model ({GROQ_LLAMA4_MODEL}) is busy (Rate Limit). Try again."
|
|
|
567 |
return None, error_msg
|
568 |
except httpx.TimeoutException as te:
|
569 |
logger.error(f"[Groq Primary] Timeout during Groq API call: {te}")
|
570 |
+
return None, f"Sorry, the primary AI service ({GROQ_LLAMA4_MODEL}) timed out."
|
571 |
except httpx.RequestError as re:
|
572 |
logger.error(f"[Groq Primary] Network error during Groq API call: {re}")
|
573 |
+
return None, f"Sorry, couldn't connect to the primary AI service ({GROQ_LLAMA4_MODEL})."
|
574 |
except Exception as e:
|
575 |
logger.error(f"[Groq Primary] Unexpected error during Groq API call: {e}", exc_info=True);
|
576 |
+
return None, f"Sorry, an unexpected error occurred while using the primary AI service ({GROQ_LLAMA4_MODEL})."
|
577 |
|
578 |
|
|
|
579 |
async def _call_gemini(text: str, summary_type: str, model_name: str) -> Tuple[Optional[str], Optional[str]]:
|
580 |
"""Internal function to call Gemini API. Returns (summary, error_message)."""
|
581 |
global _gemini_api_enabled
|
|
|
585 |
logger.info(f"[Gemini {model_name}] Generating {summary_type} summary using {model_name}. Input length: {len(text)}")
|
586 |
|
587 |
prompt = PROMPT_PARAGRAPH if summary_type == "paragraph" else PROMPT_POINTS
|
588 |
+
MAX_INPUT_LENGTH_GEMINI = 900000
|
|
|
|
|
589 |
if len(text) > MAX_INPUT_LENGTH_GEMINI:
|
590 |
logger.warning(f"[Gemini {model_name}] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating.");
|
591 |
text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)"
|
592 |
full_prompt = f"{prompt}\n\n{text}"
|
|
|
|
|
593 |
safety_settings = { HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, }
|
|
|
|
|
594 |
for category_name in dir(HarmCategory):
|
595 |
if category_name.startswith('HARM_CATEGORY_') and getattr(HarmCategory, category_name) not in safety_settings:
|
596 |
safety_settings[getattr(HarmCategory, category_name)] = HarmBlockThreshold.BLOCK_NONE
|
|
|
597 |
logger.debug(f"[Gemini {model_name}] Using safety settings: { {k.name: v.name for k, v in safety_settings.items()} }")
|
598 |
|
599 |
try:
|
|
|
602 |
logger.info(f"[Gemini {model_name}] Sending request to Gemini ({model_name})...")
|
603 |
request_options = {"timeout": 120}
|
604 |
response = await model.generate_content_async(
|
605 |
+
full_prompt, generation_config=genai.types.GenerationConfig(),
|
606 |
+
safety_settings=safety_settings, request_options=request_options )
|
|
|
|
|
|
|
607 |
logger.info(f"[Gemini {model_name}] Received response from Gemini.")
|
|
|
|
|
608 |
if response.prompt_feedback and response.prompt_feedback.block_reason:
|
609 |
block_reason_str = getattr(response.prompt_feedback.block_reason, 'name', str(response.prompt_feedback.block_reason))
|
610 |
logger.warning(f"[Gemini {model_name}] Request blocked by API. Reason: {block_reason_str}");
|
611 |
return None, f"Sorry, the AI model ({model_name}) blocked the request (Reason: {block_reason_str})."
|
612 |
+
summary = None; finish_reason_str = 'UNKNOWN'
|
|
|
|
|
|
|
613 |
if response.candidates:
|
614 |
candidate = response.candidates[0]
|
615 |
finish_reason_name = getattr(candidate.finish_reason, 'name', None)
|
616 |
finish_reason_str = finish_reason_name or 'N/A'
|
|
|
617 |
if finish_reason_name == 'SAFETY':
|
618 |
safety_ratings_str = ", ".join([f"{rating.category.name}: {rating.probability.name}" for rating in candidate.safety_ratings])
|
619 |
logger.warning(f"[Gemini {model_name}] Candidate blocked due to SAFETY. Finish Reason: {finish_reason_str}. Ratings: [{safety_ratings_str}]")
|
620 |
return None, f"Sorry, the AI model ({model_name}) blocked the response due to safety filters ({finish_reason_str})."
|
621 |
elif finish_reason_name not in ['STOP', 'MAX_TOKENS', None]:
|
622 |
logger.warning(f"[Gemini {model_name}] Candidate finished with non-standard reason: {finish_reason_str}")
|
|
|
|
|
623 |
if candidate.content and candidate.content.parts:
|
624 |
summary = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
|
|
|
|
|
625 |
if summary is None:
|
626 |
try: summary = response.text
|
627 |
+
except ValueError as e: logger.warning(f"[Gemini {model_name}] Error accessing response.text (likely blocked): {e}"); summary = None
|
|
|
628 |
if summary:
|
629 |
logger.info(f"[Gemini {model_name}] Success generating summary. Finish Reason: {finish_reason_str}. Output len: {len(summary)}");
|
630 |
return summary.strip(), None
|
631 |
else:
|
632 |
logger.warning(f"[Gemini {model_name}] Gemini returned empty summary or content was blocked. Final Finish Reason: {finish_reason_str}");
|
633 |
return None, f"Sorry, the AI model ({model_name}) did not provide a summary (Finish Reason: {finish_reason_str})."
|
|
|
634 |
except AttributeError as ae:
|
635 |
logger.error(f"[Gemini {model_name}] AttributeError during Gemini response processing: {ae}. SDK might be incompatible or response structure unexpected.", exc_info=True);
|
636 |
return None, f"Sorry, there was an issue processing the response from the AI service ({model_name})."
|
637 |
except Exception as e:
|
638 |
logger.error(f"[Gemini {model_name}] Unexpected error during Gemini API call: {e}", exc_info=True);
|
|
|
|
|
639 |
error_msg = f"Sorry, an unexpected error occurred while using the AI service ({model_name})."
|
|
|
640 |
return None, error_msg
|
641 |
|
642 |
+
# <<< Uses the specific OPENROUTER_DEEPSEEK_MODEL constant >>>
|
|
|
643 |
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
|
644 |
+
"""Internal function to call OpenRouter API (Final Fallback - DeepSeek V3 Free). Returns (summary, error_message)."""
|
645 |
+
global OPENROUTER_API_KEY, OPENROUTER_DEEPSEEK_MODEL, _openrouter_fallback_enabled
|
646 |
if not _openrouter_fallback_enabled:
|
647 |
logger.error("[OpenRouter Fallback] Called but is disabled.");
|
648 |
return None, "Error: Final fallback AI service (OpenRouter) not configured/available."
|
649 |
+
logger.info(f"[OpenRouter Fallback] Generating {summary_type} summary using {OPENROUTER_DEEPSEEK_MODEL}. Input length: {len(text)}")
|
650 |
|
651 |
prompt = PROMPT_PARAGRAPH if summary_type == "paragraph" else PROMPT_POINTS
|
652 |
+
MAX_INPUT_LENGTH_OR = 100000 # DeepSeek V3 has 131k context, 100k chars is safe
|
|
|
|
|
653 |
if len(text) > MAX_INPUT_LENGTH_OR:
|
654 |
+
logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}) for {OPENROUTER_DEEPSEEK_MODEL}. Truncating.");
|
655 |
text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
|
656 |
full_prompt = f"{prompt}\n\n{text}"
|
657 |
|
658 |
+
# Use the direct httpx call as before, ensuring the correct model name is in the payload
|
659 |
+
headers = {
|
660 |
+
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
|
661 |
+
"Content-Type": "application/json",
|
662 |
+
"HTTP-Referer": os.environ.get("YOUR_SITE_URL", "https://github.com/your-repo"), # Optional header
|
663 |
+
"X-Title": os.environ.get("YOUR_SITE_NAME", "TelegramSummariserBot") # Optional header
|
664 |
+
}
|
665 |
+
payload = {
|
666 |
+
"model": OPENROUTER_DEEPSEEK_MODEL, # <<< Use specific DeepSeek model name
|
667 |
+
"messages": [{"role": "user", "content": full_prompt}],
|
668 |
+
}
|
669 |
openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
|
670 |
+
api_timeouts = httpx.Timeout(connect=10.0, read=60.0, write=10.0, pool=60.0)
|
671 |
response = None
|
672 |
|
673 |
try:
|
674 |
async with httpx.AsyncClient(timeout=api_timeouts) as client:
|
675 |
+
logger.info(f"[OpenRouter Fallback] Sending request to OpenRouter ({OPENROUTER_DEEPSEEK_MODEL}) with read timeout {api_timeouts.read}s...")
|
676 |
response = await client.post(openrouter_api_endpoint, headers=headers, json=payload)
|
677 |
logger.info(f"[OpenRouter Fallback] Received response from OpenRouter. Status code: {response.status_code}")
|
678 |
|
|
|
684 |
if message and isinstance(message, dict):
|
685 |
summary = message.get("content")
|
686 |
if summary: logger.info(f"[OpenRouter Fallback] Success via OpenRouter. Finish: {finish_reason}. Output len: {len(summary)}"); return summary.strip(), None
|
687 |
+
else: logger.warning(f"[OpenRouter Fallback] OpenRouter success but content empty. Finish: {finish_reason}. Resp: {data}"); return None, f"Sorry, the fallback AI model ({OPENROUTER_DEEPSEEK_MODEL}) returned an empty summary (Finish: {finish_reason})."
|
688 |
else: logger.error(f"[OpenRouter Fallback] Unexpected message structure: {message}. Finish: {finish_reason}. Full: {data}"); return None, "Sorry, could not parse fallback AI response (message format)."
|
689 |
else:
|
690 |
error_details = data.get("error", {}); logger.error(f"[OpenRouter Fallback] Unexpected choices structure or error in response: {data.get('choices')}. Error: {error_details}. Full: {data}");
|
691 |
return None, f"Sorry, could not parse fallback AI response (choices structure or error: {error_details.get('message', 'Unknown')})."
|
692 |
except json.JSONDecodeError: logger.error(f"[OpenRouter Fallback] Failed JSON decode OpenRouter. Status:{response.status_code}. Resp:{response.text[:500]}"); return None, "Sorry, failed to understand fallback AI response."
|
693 |
except Exception as e: logger.error(f"[OpenRouter Fallback] Error processing OpenRouter success response: {e}", exc_info=True); return None, "Sorry, error processing fallback AI response."
|
694 |
+
elif response.status_code == 401: logger.error("[OpenRouter Fallback] API key invalid (401)."); return None, f"Error: Fallback AI model ({OPENROUTER_DEEPSEEK_MODEL}) configuration key is invalid."
|
695 |
+
elif response.status_code == 402: logger.error("[OpenRouter Fallback] Payment Required/Quota Exceeded (402)."); return None, f"Sorry, fallback AI service ({OPENROUTER_DEEPSEEK_MODEL}) quota/limit issue."
|
696 |
+
elif response.status_code == 429: logger.warning("[OpenRouter Fallback] Rate Limit Exceeded (429)."); return None, f"Sorry, fallback AI model ({OPENROUTER_DEEPSEEK_MODEL}) is busy. Try again."
|
697 |
+
elif response.status_code == 500: logger.error(f"[OpenRouter Fallback] Internal Server Error (500). Resp:{response.text[:500]}"); return None, f"Sorry, fallback AI service ({OPENROUTER_DEEPSEEK_MODEL}) had an internal error."
|
698 |
else:
|
699 |
error_info = "";
|
700 |
try: error_info = response.json().get("error", {}).get("message", "")
|
701 |
except Exception: pass
|
702 |
logger.error(f"[OpenRouter Fallback] Unexpected status {response.status_code}. Error: '{error_info}' Resp:{response.text[:500]}");
|
703 |
+
return None, f"Sorry, fallback AI service ({OPENROUTER_DEEPSEEK_MODEL}) returned unexpected status ({response.status_code})."
|
704 |
|
705 |
+
except httpx.TimeoutException as e: logger.error(f"[OpenRouter Fallback] Timeout error ({type(e)}) connecting/reading from OpenRouter API: {e}"); return None, f"Sorry, the fallback AI service ({OPENROUTER_DEEPSEEK_MODEL}) timed out."
|
706 |
except httpx.RequestError as e: logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}"); return None, "Sorry, there was an error connecting to the fallback AI model service."
|
707 |
except Exception as e: logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter call: {e}", exc_info=True); return None, "Sorry, an unexpected error occurred while using the fallback AI service."
|
708 |
|
709 |
|
|
|
710 |
async def generate_summary(text: str, summary_type: str) -> str:
    """Summarise *text* by walking the April 2025 model ladder.

    Order of attempts:
      1. Groq (Llama 4 Scout)
      2. Gemini (2.5 Pro Exp)
      3. Gemini (2.0 Flash)
      4. OpenRouter (DeepSeek V3 Free)

    Returns the first successful summary; if every backend fails or is
    disabled, returns a combined, user-facing error report instead.
    """
    global _groq_enabled, _gemini_api_enabled, _openrouter_fallback_enabled
    global GROQ_LLAMA4_MODEL, GEMINI_PRO_EXP_MODEL, GEMINI_FLASH_MODEL, OPENROUTER_DEEPSEEK_MODEL

    logger.info("[Summary Generation] Starting process with specific April 2025 model hierarchy.")
    summary: Optional[str] = None
    # One slot per backend; a slot stays None until that backend is attempted
    # (or is marked disabled without an attempt).
    errors: Dict[str, Optional[str]] = dict.fromkeys(
        ("Llama4Scout", "GeminiProExp", "GeminiFlash", "DeepSeekV3")
    )

    # ---- Attempt 1: Groq (Llama 4 Scout) --------------------------------
    if not _groq_enabled:
        logger.warning("[Summary Generation] Groq (Llama 4 Scout) is disabled or unavailable. Skipping.")
        errors["Llama4Scout"] = "Service disabled/unavailable."
    else:
        logger.info(f"[Summary Generation] Attempting 1: Groq ({GROQ_LLAMA4_MODEL})")
        summary, errors["Llama4Scout"] = await _call_groq(text, summary_type)
        if summary:
            logger.info(f"[Summary Generation] Success with Groq ({GROQ_LLAMA4_MODEL}).")
            return summary
        logger.warning(f"[Summary Generation] Groq Llama 4 Scout failed. Error: {errors['Llama4Scout']}. Proceeding to Gemini 2.5 Pro Exp.")

    # ---- Attempt 2: Gemini 2.5 Pro Exp ----------------------------------
    if not _gemini_api_enabled:
        # Gemini being down knocks out attempts 2 AND 3 in one go.
        logger.warning("[Summary Generation] Gemini API is disabled or unavailable. Skipping Gemini 2.5 Pro Exp & 2.0 Flash.")
        errors["GeminiProExp"] = "Service disabled/unavailable."
        errors["GeminiFlash"] = "Service disabled/unavailable."
    else:
        logger.info(f"[Summary Generation] Attempting 2: Gemini ({GEMINI_PRO_EXP_MODEL})")
        summary, errors["GeminiProExp"] = await _call_gemini(text, summary_type, GEMINI_PRO_EXP_MODEL)
        if summary:
            logger.info(f"[Summary Generation] Success with Gemini ({GEMINI_PRO_EXP_MODEL}).")
            return summary
        logger.warning(f"[Summary Generation] Gemini 2.5 Pro Exp failed. Error: {errors['GeminiProExp']}. Proceeding to Gemini 2.0 Flash.")

    # ---- Attempt 3: Gemini 2.0 Flash (only if Gemini API is usable) -----
    if _gemini_api_enabled and errors["GeminiFlash"] is None:
        logger.info(f"[Summary Generation] Attempting 3: Gemini ({GEMINI_FLASH_MODEL})")
        summary, errors["GeminiFlash"] = await _call_gemini(text, summary_type, GEMINI_FLASH_MODEL)
        if summary:
            logger.info(f"[Summary Generation] Success with Gemini ({GEMINI_FLASH_MODEL}).")
            return summary
        logger.warning(f"[Summary Generation] Gemini 2.0 Flash failed. Error: {errors['GeminiFlash']}. Proceeding to OpenRouter DeepSeek V3.")
    elif errors["GeminiFlash"] is None:
        logger.warning("[Summary Generation] Skipping Gemini 2.0 Flash (API was disabled).")
        errors["GeminiFlash"] = "Service disabled/unavailable."

    # ---- Attempt 4: OpenRouter DeepSeek V3 (final fallback) -------------
    if not _openrouter_fallback_enabled:
        logger.error("[Summary Generation] OpenRouter fallback (DeepSeek V3) is disabled or unavailable. Cannot proceed.")
        errors["DeepSeekV3"] = "Service disabled/unavailable."
    else:
        logger.info(f"[Summary Generation] Attempting 4: OpenRouter ({OPENROUTER_DEEPSEEK_MODEL})")
        summary, errors["DeepSeekV3"] = await _call_openrouter(text, summary_type)
        if summary:
            logger.info(f"[Summary Generation] Success with OpenRouter ({OPENROUTER_DEEPSEEK_MODEL}).")
            return summary
        logger.error(f"[Summary Generation] OpenRouter DeepSeek V3 (Final Fallback) also failed. Error: {errors['DeepSeekV3']}")

    # ---- Every backend failed or was unavailable ------------------------
    logger.error("[Summary Generation] All summarization models failed.")
    error_details = "\n".join(f"- {model}: {err}" for model, err in errors.items() if err)
    return f"Sorry, I couldn't generate a summary after trying all available AI models.\nDetails:\n{error_details}"
|
788 |
|
789 |
|
790 |
# --- Main Processing Logic ---
|
791 |
+
# (process_summary_task remains UNCHANGED in its core logic, it correctly calls the updated generate_summary.
|
792 |
+
# Omitted here for brevity, but MUST be included in the final file.)
|
|
|
793 |
# --- START OMITTED process_summary_task ---
|
794 |
async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
|
795 |
"""Handles the entire process: fetching content (with ALL fallbacks) and summarizing."""
|
|
|
828 |
else: user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
|
829 |
if not content and not user_feedback_message: user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
|
830 |
else:
|
831 |
+
global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN, _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists
|
832 |
|
833 |
logger.info(f"[Task {task_id}] Trying Web Scrape Method 1 (Direct Fetch + BS4)..."); content = await get_website_content(url)
|
834 |
if not content:
|
835 |
logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 2 (urltotext.com)...")
|
836 |
+
if _urltotext_key_exists: await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing'); content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
|
837 |
else: logger.warning("[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.")
|
838 |
if not content:
|
839 |
logger.warning(f"[Task {task_id}] Method 2 failed. Trying Method 3 (Scraper's Proxy)...")
|
|
|
863 |
except Exception as edit_e: logger.warning(f"[Task {task_id}] Failed to edit status message before summary: {edit_e}")
|
864 |
|
865 |
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
|
866 |
+
final_summary = await generate_summary(content, summary_type) # Calls the updated function
|
|
|
867 |
|
868 |
if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
|
869 |
user_feedback_message = final_summary
|
|
|
908 |
|
909 |
# --- Telegram Handlers ---
|
910 |
# (start, help_command, handle_potential_url, handle_summary_type_callback, error_handler
|
911 |
+
# remain UNCHANGED. Omitted here for brevity, but include in final file.)
|
|
|
912 |
# --- START OMITTED TELEGRAM HANDLERS ---
|
913 |
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
|
914 |
user = update.effective_user; mention = user.mention_html()
|
|
|
925 |
"2. I'll ask how you want it summarised (paragraph or points).\n"
|
926 |
"3. Click the button for your choice.\n"
|
927 |
"4. Wait while I fetch the content and generate the summary!\n\n"
|
928 |
+
"βοΈ I try multiple methods to get content, especially for tricky websites or YouTube videos without standard transcripts. I then use a sequence of AI models (Llama 4 Scout, Gemini 2.5 Pro, Gemini 2.0 Flash, DeepSeek V3) to summarise.\n\n" # Updated help text
|
929 |
"**Commands:**\n"
|
930 |
"`/start` - Display the welcome message\n"
|
931 |
"`/help` - Show this help message" )
|
|
|
983 |
|
984 |
context.user_data.pop('url_to_summarize', None); context.user_data.pop('original_message_id', None); logger.debug(f"Cleared URL context for user {user.id}")
|
985 |
|
986 |
+
# Check essential configurations - requires at least ONE summarizer to be enabled
|
987 |
global TELEGRAM_TOKEN, _groq_enabled, _gemini_api_enabled, _openrouter_fallback_enabled
|
988 |
if not TELEGRAM_TOKEN:
|
989 |
logger.critical("TELEGRAM_TOKEN missing in callback!")
|
|
|
995 |
try: await query.edit_message_text(text="β AI configuration error: No summarization models available.")
|
996 |
except Exception: pass
|
997 |
return
|
998 |
+
# Log warnings if specific models/APIs are unavailable but don't stop the process if fallbacks exist
|
999 |
+
if not _groq_enabled: logger.warning("Primary AI (Groq Llama 4 Scout) is unavailable.")
|
1000 |
+
if not _gemini_api_enabled: logger.warning("Gemini API is unavailable (skipping 2.5 Pro Exp & 2.0 Flash).")
|
1001 |
+
if not _openrouter_fallback_enabled: logger.warning("Final Fallback AI (OpenRouter DeepSeek V3) is unavailable.")
|
|
|
|
|
|
|
|
|
|
|
|
|
1002 |
|
1003 |
logger.info(f"Scheduling background task for user {user.id}, chat {query.message.chat_id}, msg {message_id_to_edit}")
|
1004 |
asyncio.create_task(
|
|
|
1020 |
|
1021 |
|
1022 |
# --- Application Setup & Web Framework ---
|
1023 |
+
# (setup_bot_config, lifespan, telegram_webhook, app definition
|
1024 |
+
# remain UNCHANGED. health_check is modified below.
|
1025 |
# Omitted here for brevity, include in final file.)
|
1026 |
+
# --- START OMITTED APP SETUP/WEB FRAMEWORK (excluding health_check) ---
|
1027 |
async def setup_bot_config() -> Application:
|
1028 |
logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
|
1029 |
if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
|
|
|
1093 |
else: logger.info("PTB application was not fully initialized or failed during startup. No shutdown actions needed.")
|
1094 |
logger.info("ASGI Lifespan: Shutdown complete.")
|
1095 |
|
1096 |
+
async def telegram_webhook(request: Request) -> Response:
    """Handles incoming updates from Telegram.

    Validates bot state and the optional webhook secret, then hands the
    decoded update to the PTB application. Processing failures are
    acknowledged with 200 so Telegram does not endlessly retry the same
    broken update; only configuration problems get a non-2xx status.
    """
    global WEBHOOK_SECRET

    # Guard clauses: the bot must be built and running before we can dispatch.
    if not ptb_app:
        logger.error("Webhook received but PTB application not initialized.")
        return PlainTextResponse('Bot not initialized', status_code=503)
    if not ptb_app.running:
        logger.warning("Webhook received but PTB application not running.")
        return PlainTextResponse('Bot not running, cannot process update', status_code=503)

    if WEBHOOK_SECRET:
        # Telegram echoes the configured secret in this header; reject mismatches.
        token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
        if token_header != WEBHOOK_SECRET:
            logger.warning(f"Webhook received with invalid secret token. Header: '{token_header}'")
            return Response(content="Invalid secret token", status_code=403)

    try:
        update_data = await request.json()
        update = Update.de_json(data=update_data, bot=ptb_app.bot)
        logger.debug(f"Processing update_id: {update.update_id} via webhook")
        await ptb_app.process_update(update)
        return Response(status_code=200)
    except json.JSONDecodeError:
        logger.error("Webhook received invalid JSON.")
        return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
    except Exception as e:
        # Deliberate 200: tell Telegram the update was consumed even though
        # handling crashed, otherwise it re-sends the same update forever.
        logger.error(f"Error processing webhook update: {e}", exc_info=True)
        return Response(status_code=200)
|
1110 |
+
|
1111 |
+
# --- Starlette App Definition ---
# BUG FIX: `health_check` is defined further down this module, but the routes
# list is evaluated here at import time — referencing `health_check` directly
# raised NameError on startup. The thin async wrapper below defers the name
# lookup until the first request, by which point `health_check` exists.
async def _health_check_route(request: Request) -> PlainTextResponse:
    """Late-binding dispatcher for the health check endpoint (defined below)."""
    return await health_check(request)

app = Starlette(
    debug=False,
    lifespan=lifespan,
    routes=[
        Route("/", endpoint=_health_check_route, methods=["GET"]),
        Route("/webhook", endpoint=telegram_webhook, methods=["POST"]),
    ],
)
logger.info("Starlette ASGI application created with health check and webhook routes.")
# --- END OMITTED APP SETUP/WEB FRAMEWORK ---
|
1120 |
+
|
1121 |
+
# <<< CHANGE: Updated health check response for specific models >>>
|
1122 |
async def health_check(request: Request) -> PlainTextResponse:
|
1123 |
"""Simple health check endpoint."""
|
1124 |
+
global GROQ_LLAMA4_MODEL, GEMINI_PRO_EXP_MODEL, GEMINI_FLASH_MODEL, OPENROUTER_DEEPSEEK_MODEL, APIFY_ACTOR_ID
|
|
|
1125 |
global _groq_enabled, _gemini_api_enabled, _openrouter_fallback_enabled
|
1126 |
global _apify_token_exists, _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY
|
1127 |
|
|
|
1139 |
bot_status = f"Error checking status: {type(e).__name__}"; logger.warning(f"Health check: Error getting bot info: {e}")
|
1140 |
else: bot_status = "Not Initialized"; bot_username = "N/A"
|
1141 |
|
1142 |
+
# <<< Update response string with specific model names >>>
|
1143 |
return PlainTextResponse(
|
1144 |
f"TG Bot Summariser - Status: {bot_status} ({bot_username})\n"
|
1145 |
f"---\n"
|
1146 |
+
f"Summarizer Priority (April 2025 - Specific):\n"
|
1147 |
+
f"1. Groq API: {GROQ_LLAMA4_MODEL if _groq_enabled else 'DISABLED'}\n"
|
1148 |
+
f"2. Gemini API: {GEMINI_PRO_EXP_MODEL if _gemini_api_enabled else 'DISABLED'}\n"
|
1149 |
+
f"3. Gemini API: {GEMINI_FLASH_MODEL if _gemini_api_enabled else 'DISABLED'}\n"
|
1150 |
+
f"4. OpenRouter API: {OPENROUTER_DEEPSEEK_MODEL if _openrouter_fallback_enabled else 'DISABLED'}\n"
|
1151 |
f"---\n"
|
1152 |
f"Content Fetching Status:\n"
|
1153 |
f"YT Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled'}\n"
|
|
|
1158 |
f"Web Scrape 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled'}"
|
1159 |
)
|
1160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1161 |
# --- Development Server (if run directly) ---
|
1162 |
if __name__ == '__main__':
|
1163 |
import uvicorn
|