Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -446,9 +446,17 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
446 |
logger.debug(f"[{actor_name}] POST Request to {sync_items_endpoint} for {url}")
|
447 |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
|
448 |
logger.debug(f"[{actor_name}] Received status code {response.status_code} for {url}")
|
449 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
450 |
try:
|
451 |
results = response.json(); content = None
|
|
|
452 |
if isinstance(results, list) and len(results) > 0:
|
453 |
item = results[0]
|
454 |
if "text" in item and isinstance(item["text"], str): content = item["text"]
|
@@ -458,17 +466,33 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
458 |
logger.warning(f"[{actor_name}] No 'text' or 'markdown' found, attempting to parse 'html' from result.")
|
459 |
soup = BeautifulSoup(item["html"], DEFAULT_PARSER)
|
460 |
content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
|
461 |
-
|
|
|
|
|
|
|
|
|
462 |
else:
|
463 |
content_len = len(content) if content and isinstance(content, str) else 0
|
464 |
-
logger.warning(f"[{actor_name}] Dataset item parsed but text content empty/short/invalid format for {url}. Item keys: {list(item.keys())}. Length: {content_len}")
|
465 |
-
|
466 |
-
|
467 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
468 |
elif response.status_code == 400: logger.error(f"[{actor_name}] Bad Request (400) for {url}. Check run_input. Resp:{response.text[:200]}"); return None
|
469 |
elif response.status_code == 401: logger.error(f"[{actor_name}] Auth error (401). Check token."); return None
|
470 |
elif response.status_code == 404: logger.error(f"[{actor_name}] Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
|
471 |
-
else:
|
|
|
|
|
472 |
except httpx.TimeoutException as e: logger.error(f"[{actor_name}] Timeout during API interaction for {url}: {e}"); return None
|
473 |
except httpx.HTTPStatusError as e: logger.error(f"[{actor_name}] HTTP Status Error during API interaction for {url}: {e}"); return None
|
474 |
except httpx.RequestError as e: logger.error(f"[{actor_name}] Request error during API interaction for {url}: {e}"); return None
|
|
|
446 |
logger.debug(f"[{actor_name}] POST Request to {sync_items_endpoint} for {url}")
|
447 |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
|
448 |
logger.debug(f"[{actor_name}] Received status code {response.status_code} for {url}")
|
449 |
+
logger.debug(f"[{actor_name}] Received status code {response.status_code} for {url}")
|
450 |
+
|
451 |
+
# <<< MODIFIED check for success: Accept 200 OR 201 >>>
|
452 |
+
if response.status_code in [200, 201]:
|
453 |
+
# Log if it was 201, as it's slightly unexpected for sync but apparently happens
|
454 |
+
if response.status_code == 201:
|
455 |
+
logger.info(f"[{actor_name}] Received status 201 (Created) from run-sync endpoint, processing results anyway.")
|
456 |
+
|
457 |
try:
|
458 |
results = response.json(); content = None
|
459 |
+
# --- The rest of the success processing logic stays the same ---
|
460 |
if isinstance(results, list) and len(results) > 0:
|
461 |
item = results[0]
|
462 |
if "text" in item and isinstance(item["text"], str): content = item["text"]
|
|
|
466 |
logger.warning(f"[{actor_name}] No 'text' or 'markdown' found, attempting to parse 'html' from result.")
|
467 |
soup = BeautifulSoup(item["html"], DEFAULT_PARSER)
|
468 |
content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
|
469 |
+
|
470 |
+
# Check content validity AFTER parsing
|
471 |
+
if content and isinstance(content, str) and len(content) > 30:
|
472 |
+
logger.info(f"[{actor_name}] Success via REST (Status {response.status_code}) for {url}. Length: {len(content)}")
|
473 |
+
return content.strip()
|
474 |
else:
|
475 |
content_len = len(content) if content and isinstance(content, str) else 0
|
476 |
+
logger.warning(f"[{actor_name}] Dataset item parsed (Status {response.status_code}) but text content empty/short/invalid format for {url}. Item keys: {list(item.keys())}. Length: {content_len}")
|
477 |
+
return None # Return None if content is bad, even if API status was 200/201
|
478 |
+
else:
|
479 |
+
# Handle empty dataset list '[]' which might be returned with 200 or 201
|
480 |
+
logger.warning(f"[{actor_name}] Actor call successful (Status {response.status_code}) but dataset was empty for {url}. Response: {results}")
|
481 |
+
return None
|
482 |
+
# --- End of success processing logic ---
|
483 |
+
except json.JSONDecodeError:
|
484 |
+
logger.error(f"[{actor_name}] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}");
|
485 |
+
return None
|
486 |
+
except Exception as e:
|
487 |
+
logger.error(f"[{actor_name}] Error processing success response (Status {response.status_code}) for {url}: {e}", exc_info=True);
|
488 |
+
return None
|
489 |
+
# <<< The existing elif error handling remains the same >>>
|
490 |
elif response.status_code == 400: logger.error(f"[{actor_name}] Bad Request (400) for {url}. Check run_input. Resp:{response.text[:200]}"); return None
|
491 |
elif response.status_code == 401: logger.error(f"[{actor_name}] Auth error (401). Check token."); return None
|
492 |
elif response.status_code == 404: logger.error(f"[{actor_name}] Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
|
493 |
+
else: # Catches any other non-200/201 status
|
494 |
+
logger.error(f"[{actor_name}] Unexpected status {response.status_code} for {url}. Resp:{response.text[:200]}");
|
495 |
+
return None
|
496 |
except httpx.TimeoutException as e: logger.error(f"[{actor_name}] Timeout during API interaction for {url}: {e}"); return None
|
497 |
except httpx.HTTPStatusError as e: logger.error(f"[{actor_name}] HTTP Status Error during API interaction for {url}: {e}"); return None
|
498 |
except httpx.RequestError as e: logger.error(f"[{actor_name}] Request error during API interaction for {url}: {e}"); return None
|