Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -361,30 +361,56 @@ async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
|
|
361 |
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
|
362 |
|
363 |
async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
|
364 |
-
"""Fallback 2: Fetches website content using Scraper's Proxy
|
365 |
if not url: logger.error("[Web Scrape Fallback 2] No URL provided"); return None
|
366 |
if not api_key: logger.error("[Web Scrape Fallback 2] RapidAPI key missing."); return None
|
367 |
-
logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy
|
368 |
api_host = "scrapers-proxy2.p.rapidapi.com"
|
|
|
|
|
|
|
369 |
encoded_url = urllib.parse.quote(url, safe='')
|
370 |
-
|
371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
372 |
try:
|
373 |
async with httpx.AsyncClient(timeout=40.0) as client:
|
374 |
-
logger.debug(f"[Web Scrape Fallback 2] Sending
|
375 |
-
|
|
|
376 |
logger.debug(f"[Web Scrape Fallback 2] Received status {response.status_code} from {api_host} for {url}")
|
|
|
|
|
377 |
if response.status_code == 200:
|
378 |
try:
|
379 |
data = response.json()
|
380 |
-
|
|
|
|
|
381 |
extracted_text = "";
|
382 |
if title and isinstance(title, str): extracted_text += title.strip() + ". "
|
383 |
if content and isinstance(content, str): extracted_text += content.strip()
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
388 |
elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 2] Auth error (401) with {api_host}. Check RapidAPI key."); return None
|
389 |
elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 2] Forbidden (403) from {api_host}. Check API subscription/limits."); return None
|
390 |
elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 2] Rate Limit (429) from {api_host}."); return None
|
|
|
361 |
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
|
362 |
|
363 |
async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
|
364 |
+
"""Fallback 2: Fetches website content using Scraper's Proxy Standard endpoint via RapidAPI (Updated)."""
|
365 |
if not url: logger.error("[Web Scrape Fallback 2] No URL provided"); return None
|
366 |
if not api_key: logger.error("[Web Scrape Fallback 2] RapidAPI key missing."); return None
|
367 |
+
logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy Standard API (POST)")
|
368 |
api_host = "scrapers-proxy2.p.rapidapi.com"
|
369 |
+
# --- Updated Endpoint Construction ---
|
370 |
+
# Uses POST method and /standard path
|
371 |
+
# Query parameters: url and content_type=application/json
|
372 |
encoded_url = urllib.parse.quote(url, safe='')
|
373 |
+
content_type_param = urllib.parse.quote('application/json', safe='')
|
374 |
+
api_endpoint = f"https://{api_host}/standard?url={encoded_url}&content_type={content_type_param}"
|
375 |
+
|
376 |
+
# --- Updated Headers ---
|
377 |
+
headers = {
|
378 |
+
"Content-Type": "application/json", # Added as per cURL example
|
379 |
+
"x-rapidapi-host": api_host,
|
380 |
+
"x-rapidapi-key": api_key,
|
381 |
+
# 'accept-encoding': 'gzip' removed, httpx handles this automatically
|
382 |
+
}
|
383 |
+
# --- Define Payload (empty JSON object as per cURL example, might not be strictly needed but safer) ---
|
384 |
+
payload = {} # Use {} instead of the example string for a generic POST
|
385 |
+
|
386 |
try:
|
387 |
async with httpx.AsyncClient(timeout=40.0) as client:
|
388 |
+
logger.debug(f"[Web Scrape Fallback 2] Sending POST request to {api_host}/standard for {url}") # Changed to POST
|
389 |
+
# --- Use client.post with json payload ---
|
390 |
+
response = await client.post(api_endpoint, headers=headers, json=payload)
|
391 |
logger.debug(f"[Web Scrape Fallback 2] Received status {response.status_code} from {api_host} for {url}")
|
392 |
+
|
393 |
+
# --- Response Handling (Kept similar parsing, adjust if /standard format differs) ---
|
394 |
if response.status_code == 200:
|
395 |
try:
|
396 |
data = response.json()
|
397 |
+
# Assuming /standard endpoint might still have 'content' and 'title' or similar structure
|
398 |
+
content = data.get("content") or data.get("text") # Added .get("text") as a potential alternative
|
399 |
+
title = data.get("title")
|
400 |
extracted_text = "";
|
401 |
if title and isinstance(title, str): extracted_text += title.strip() + ". "
|
402 |
if content and isinstance(content, str): extracted_text += content.strip()
|
403 |
+
|
404 |
+
if extracted_text and len(extracted_text) > 30:
|
405 |
+
logger.info(f"[Web Scrape Fallback 2] Success via Scraper's Proxy Standard API for {url}. Len: {len(extracted_text)}"); return extracted_text
|
406 |
+
else:
|
407 |
+
# Log if parsing failed even on 200
|
408 |
+
keys_info = f"Keys: {list(data.keys())}" if isinstance(data, dict) else "Non-dict response"
|
409 |
+
logger.warning(f"[Web Scrape Fallback 2] Scraper's Proxy Standard API success (200) but content/title seems empty or too short for {url}. {keys_info}. Length: {len(extracted_text)}"); return None
|
410 |
+
|
411 |
+
except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 2] Failed JSON decode Scraper's Proxy Standard API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
|
412 |
+
except Exception as e: logger.error(f"[Web Scrape Fallback 2] Error processing Scraper's Proxy Standard API success response for {url}: {e}", exc_info=True); return None
|
413 |
+
# --- Error Handling (remains the same) ---
|
414 |
elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 2] Auth error (401) with {api_host}. Check RapidAPI key."); return None
|
415 |
elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 2] Forbidden (403) from {api_host}. Check API subscription/limits."); return None
|
416 |
elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 2] Rate Limit (429) from {api_host}."); return None
|