fmab777 committed (verified)
Commit 275be65 · Parent(s): 1fb5a74

Update main.py

Files changed (1):
  1. main.py +37 -11
main.py CHANGED
@@ -361,30 +361,56 @@ async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
     except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
 
 async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
-    """Fallback 2: Fetches website content using Scraper's Proxy Parser via RapidAPI."""
+    """Fallback 2: Fetches website content using Scraper's Proxy Standard endpoint via RapidAPI (Updated)."""
     if not url: logger.error("[Web Scrape Fallback 2] No URL provided"); return None
     if not api_key: logger.error("[Web Scrape Fallback 2] RapidAPI key missing."); return None
-    logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy Parser API")
+    logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy Standard API (POST)")
     api_host = "scrapers-proxy2.p.rapidapi.com"
+    # --- Updated Endpoint Construction ---
+    # Uses POST method and /standard path
+    # Query parameters: url and content_type=application/json
     encoded_url = urllib.parse.quote(url, safe='')
-    api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true"
-    headers = { "x-rapidapi-host": api_host, "x-rapidapi-key": api_key, "accept-encoding": "gzip" }
+    content_type_param = urllib.parse.quote('application/json', safe='')
+    api_endpoint = f"https://{api_host}/standard?url={encoded_url}&content_type={content_type_param}"
+
+    # --- Updated Headers ---
+    headers = {
+        "Content-Type": "application/json",  # Added as per cURL example
+        "x-rapidapi-host": api_host,
+        "x-rapidapi-key": api_key,
+        # 'accept-encoding': 'gzip' removed, httpx handles this automatically
+    }
+    # --- Define Payload (empty JSON object as per cURL example, might not be strictly needed but safer) ---
+    payload = {}  # Use {} instead of the example string for a generic POST
+
     try:
         async with httpx.AsyncClient(timeout=40.0) as client:
-            logger.debug(f"[Web Scrape Fallback 2] Sending GET request to {api_host} for {url}")
-            response = await client.get(api_endpoint, headers=headers)
+            logger.debug(f"[Web Scrape Fallback 2] Sending POST request to {api_host}/standard for {url}")  # Changed to POST
+            # --- Use client.post with json payload ---
+            response = await client.post(api_endpoint, headers=headers, json=payload)
             logger.debug(f"[Web Scrape Fallback 2] Received status {response.status_code} from {api_host} for {url}")
+
+            # --- Response Handling (Kept similar parsing, adjust if /standard format differs) ---
             if response.status_code == 200:
                 try:
                     data = response.json()
-                    content = data.get("content"); title = data.get("title")
+                    # Assuming /standard endpoint might still have 'content' and 'title' or similar structure
+                    content = data.get("content") or data.get("text")  # Added .get("text") as a potential alternative
+                    title = data.get("title")
                     extracted_text = "";
                     if title and isinstance(title, str): extracted_text += title.strip() + ". "
                     if content and isinstance(content, str): extracted_text += content.strip()
-                    if extracted_text and len(extracted_text) > 30: logger.info(f"[Web Scrape Fallback 2] Success via Scraper's Proxy Parser API for {url}. Len: {len(extracted_text)}"); return extracted_text
-                    else: logger.warning(f"[Web Scrape Fallback 2] Scraper's Proxy API success but content/title seems empty or too short for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}"); return None
-                except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 2] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
-                except Exception as e: logger.error(f"[Web Scrape Fallback 2] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True); return None
+
+                    if extracted_text and len(extracted_text) > 30:
+                        logger.info(f"[Web Scrape Fallback 2] Success via Scraper's Proxy Standard API for {url}. Len: {len(extracted_text)}"); return extracted_text
+                    else:
+                        # Log if parsing failed even on 200
+                        keys_info = f"Keys: {list(data.keys())}" if isinstance(data, dict) else "Non-dict response"
+                        logger.warning(f"[Web Scrape Fallback 2] Scraper's Proxy Standard API success (200) but content/title seems empty or too short for {url}. {keys_info}. Length: {len(extracted_text)}"); return None
+
+                except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 2] Failed JSON decode Scraper's Proxy Standard API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
+                except Exception as e: logger.error(f"[Web Scrape Fallback 2] Error processing Scraper's Proxy Standard API success response for {url}: {e}", exc_info=True); return None
+            # --- Error Handling (remains the same) ---
             elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 2] Auth error (401) with {api_host}. Check RapidAPI key."); return None
             elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 2] Forbidden (403) from {api_host}. Check API subscription/limits."); return None
            elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 2] Rate Limit (429) from {api_host}."); return None
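
For reviewers who want to exercise the updated fallback, a minimal driver sketch follows; the `main` import path and the `RAPIDAPI_KEY` environment variable name are assumptions for illustration, not part of this commit.

import asyncio
import os

from main import get_website_content_via_scrapers_proxy  # assumed module path

async def demo() -> None:
    # Assumed env var; any string holding a valid RapidAPI key works here.
    api_key = os.environ["RAPIDAPI_KEY"]
    text = await get_website_content_via_scrapers_proxy("https://example.com", api_key)
    if text:
        print(f"Fallback 2 returned {len(text)} chars: {text[:100]}...")
    else:
        print("Fallback 2 returned None; the caller should try the next fallback.")

if __name__ == "__main__":
    asyncio.run(demo())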
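
The new request shape can also be checked offline with httpx's MockTransport, without a RapidAPI key. The handler below is a sketch: its 200-response payload ({"title": ..., "content": ...}) is an assumption about what /standard returns, chosen to match the parsing in the commit.

import asyncio
import urllib.parse

import httpx

def handler(request: httpx.Request) -> httpx.Response:
    # Verify the request matches the updated call: POST to /standard with a url query param.
    assert request.method == "POST"
    assert request.url.path == "/standard"
    assert request.url.params["url"] == "https://example.com"
    # Assumed payload shape; the real /standard response may differ.
    return httpx.Response(200, json={"title": "Example", "content": "Hello from the mock."})

async def check() -> None:
    api_host = "scrapers-proxy2.p.rapidapi.com"
    encoded_url = urllib.parse.quote("https://example.com", safe='')
    content_type_param = urllib.parse.quote('application/json', safe='')
    endpoint = f"https://{api_host}/standard?url={encoded_url}&content_type={content_type_param}"
    async with httpx.AsyncClient(transport=httpx.MockTransport(handler)) as client:
        response = await client.post(endpoint, headers={"Content-Type": "application/json"}, json={})
        assert response.json()["content"] == "Hello from the mock."
    print("POST /standard request shape OK")

asyncio.run(check())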