fmab777 committed (verified)
Commit 275be65 · Parent(s): 1fb5a74

Update main.py

Files changed (1):
  1. main.py +37 -11
main.py CHANGED
@@ -361,30 +361,56 @@ async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
     except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
 
 async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
-    """Fallback 2: Fetches website content using Scraper's Proxy Parser via RapidAPI."""
+    """Fallback 2: Fetches website content using Scraper's Proxy Standard endpoint via RapidAPI (Updated)."""
     if not url: logger.error("[Web Scrape Fallback 2] No URL provided"); return None
     if not api_key: logger.error("[Web Scrape Fallback 2] RapidAPI key missing."); return None
-    logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy Parser API")
+    logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using Scraper's Proxy Standard API (POST)")
     api_host = "scrapers-proxy2.p.rapidapi.com"
+    # --- Updated Endpoint Construction ---
+    # Uses POST method and /standard path
+    # Query parameters: url and content_type=application/json
     encoded_url = urllib.parse.quote(url, safe='')
-    api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true"
-    headers = { "x-rapidapi-host": api_host, "x-rapidapi-key": api_key, "accept-encoding": "gzip" }
+    content_type_param = urllib.parse.quote('application/json', safe='')
+    api_endpoint = f"https://{api_host}/standard?url={encoded_url}&content_type={content_type_param}"
+
+    # --- Updated Headers ---
+    headers = {
+        "Content-Type": "application/json",  # Added as per cURL example
+        "x-rapidapi-host": api_host,
+        "x-rapidapi-key": api_key,
+        # 'accept-encoding': 'gzip' removed, httpx handles this automatically
+    }
+    # --- Define Payload (empty JSON object as per cURL example, might not be strictly needed but safer) ---
+    payload = {}  # Use {} instead of the example string for a generic POST
+
     try:
         async with httpx.AsyncClient(timeout=40.0) as client:
-            logger.debug(f"[Web Scrape Fallback 2] Sending GET request to {api_host} for {url}")
-            response = await client.get(api_endpoint, headers=headers)
+            logger.debug(f"[Web Scrape Fallback 2] Sending POST request to {api_host}/standard for {url}")  # Changed to POST
+            # --- Use client.post with json payload ---
+            response = await client.post(api_endpoint, headers=headers, json=payload)
             logger.debug(f"[Web Scrape Fallback 2] Received status {response.status_code} from {api_host} for {url}")
+
+            # --- Response Handling (Kept similar parsing, adjust if /standard format differs) ---
             if response.status_code == 200:
                 try:
                     data = response.json()
-                    content = data.get("content"); title = data.get("title")
+                    # Assuming /standard endpoint might still have 'content' and 'title' or similar structure
+                    content = data.get("content") or data.get("text")  # Added .get("text") as a potential alternative
+                    title = data.get("title")
                     extracted_text = "";
                     if title and isinstance(title, str): extracted_text += title.strip() + ". "
                     if content and isinstance(content, str): extracted_text += content.strip()
-                    if extracted_text and len(extracted_text) > 30: logger.info(f"[Web Scrape Fallback 2] Success via Scraper's Proxy Parser API for {url}. Len: {len(extracted_text)}"); return extracted_text
-                    else: logger.warning(f"[Web Scrape Fallback 2] Scraper's Proxy API success but content/title seems empty or too short for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}"); return None
-                except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 2] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
-                except Exception as e: logger.error(f"[Web Scrape Fallback 2] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True); return None
+
+                    if extracted_text and len(extracted_text) > 30:
+                        logger.info(f"[Web Scrape Fallback 2] Success via Scraper's Proxy Standard API for {url}. Len: {len(extracted_text)}"); return extracted_text
+                    else:
+                        # Log if parsing failed even on 200
+                        keys_info = f"Keys: {list(data.keys())}" if isinstance(data, dict) else "Non-dict response"
+                        logger.warning(f"[Web Scrape Fallback 2] Scraper's Proxy Standard API success (200) but content/title seems empty or too short for {url}. {keys_info}. Length: {len(extracted_text)}"); return None
+
+                except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 2] Failed JSON decode Scraper's Proxy Standard API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
+                except Exception as e: logger.error(f"[Web Scrape Fallback 2] Error processing Scraper's Proxy Standard API success response for {url}: {e}", exc_info=True); return None
+            # --- Error Handling (remains the same) ---
             elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 2] Auth error (401) with {api_host}. Check RapidAPI key."); return None
             elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 2] Forbidden (403) from {api_host}. Check API subscription/limits."); return None
            elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 2] Rate Limit (429) from {api_host}."); return None
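
For reviewers who want to exercise the updated fallback, a minimal driver sketch follows; the `main` import path and the `RAPIDAPI_KEY` environment variable name are assumptions for illustration, not part of this commit.

import asyncio
import os

from main import get_website_content_via_scrapers_proxy  # assumed module path

async def demo() -> None:
    # Assumed env var; any string holding a valid RapidAPI key works here.
    api_key = os.environ["RAPIDAPI_KEY"]
    text = await get_website_content_via_scrapers_proxy("https://example.com", api_key)
    if text:
        print(f"Fallback 2 returned {len(text)} chars: {text[:100]}...")
    else:
        print("Fallback 2 returned None; the caller should try the next fallback.")

if __name__ == "__main__":
    asyncio.run(demo())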
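
The new request shape can also be checked offline with httpx's MockTransport, without a RapidAPI key. The handler below is a sketch: its 200-response payload ({"title": ..., "content": ...}) is an assumption about what /standard returns, chosen to match the parsing in the commit.

import asyncio
import urllib.parse

import httpx

def handler(request: httpx.Request) -> httpx.Response:
    # Verify the request matches the updated call: POST to /standard with a url query param.
    assert request.method == "POST"
    assert request.url.path == "/standard"
    assert request.url.params["url"] == "https://example.com"
    # Assumed payload shape; the real /standard response may differ.
    return httpx.Response(200, json={"title": "Example", "content": "Hello from the mock."})

async def check() -> None:
    api_host = "scrapers-proxy2.p.rapidapi.com"
    encoded_url = urllib.parse.quote("https://example.com", safe='')
    content_type_param = urllib.parse.quote('application/json', safe='')
    endpoint = f"https://{api_host}/standard?url={encoded_url}&content_type={content_type_param}"
    async with httpx.AsyncClient(transport=httpx.MockTransport(handler)) as client:
        response = await client.post(endpoint, headers={"Content-Type": "application/json"}, json={})
        assert response.json()["content"] == "Hello from the mock."
    print("POST /standard request shape OK")

asyncio.run(check())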