Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -491,47 +491,64 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
491 |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
|
492 |
logger.debug(f"[{actor_name}] Received status code {response.status_code} for {url}")
|
493 |
|
494 |
-
# <<< MODIFIED check for success: Accept 200 OR 201 >>>
|
495 |
if response.status_code in [200, 201]:
|
496 |
-
# Log if it was 201, as it's slightly unexpected for sync but apparently happens
|
497 |
if response.status_code == 201:
|
498 |
logger.info(f"[{actor_name}] Received status 201 (Created) from run-sync endpoint, processing results anyway.")
|
499 |
|
500 |
try:
|
501 |
results = response.json(); content = None
|
502 |
-
#
|
503 |
if isinstance(results, list) and len(results) > 0:
|
504 |
item = results[0]
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
533 |
else:
|
534 |
-
# Handle empty dataset list '[]'
|
535 |
logger.warning(f"[{actor_name}] Actor call successful (Status {response.status_code}) but dataset was empty for {url}. Response: {results}")
|
536 |
return None
|
537 |
# --- End of success processing logic ---
|
|
|
491 |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
|
492 |
logger.debug(f"[{actor_name}] Received status code {response.status_code} for {url}")
|
493 |
|
|
|
494 |
if response.status_code in [200, 201]:
|
|
|
495 |
if response.status_code == 201:
|
496 |
logger.info(f"[{actor_name}] Received status 201 (Created) from run-sync endpoint, processing results anyway.")
|
497 |
|
498 |
try:
|
499 |
results = response.json(); content = None
|
500 |
+
# Check if results is a list and not empty
|
501 |
if isinstance(results, list) and len(results) > 0:
|
502 |
item = results[0]
|
503 |
+
content = None # Reset content
|
504 |
+
|
505 |
+
# Parsing Logic - try extracting text from various possible keys
|
506 |
+
if "text" in item and isinstance(item["text"], str):
|
507 |
+
content = item["text"]
|
508 |
+
elif "content" in item and isinstance(item["content"], str):
|
509 |
+
content = item["content"]
|
510 |
+
elif "markdown" in item and isinstance(item["markdown"], str):
|
511 |
+
content = item["markdown"]
|
512 |
+
elif "captions" in item and isinstance(item["captions"], str):
|
513 |
+
# Specifically check if this is the YT actor by name or ID if needed,
|
514 |
+
# otherwise this assumes 'captions' might contain text for other actors too.
|
515 |
+
is_yt_actor = actor_id == APIFY_ACTOR_ID # Check if it's the specific YT actor
|
516 |
+
log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
|
517 |
+
logger.info(f"{log_prefix} Found text content in 'captions' field.")
|
518 |
+
content = item["captions"]
|
519 |
+
elif "html" in item and isinstance(item["html"], str):
|
520 |
+
is_yt_actor = actor_id == APIFY_ACTOR_ID
|
521 |
+
log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
|
522 |
+
logger.warning(f"{log_prefix} No direct text/markdown/captions found, attempting to parse 'html' from result.")
|
523 |
+
# Run potentially blocking BS4 parsing in a separate thread
|
524 |
+
def parse_html_sync(html_str):
    """Turn an HTML string into a single line of plain text.

    Synchronous helper; the surrounding coroutine hands it to
    ``asyncio.to_thread`` so the parsing does not run on the event loop.

    The text is extracted with BeautifulSoup's ``get_text`` (newline
    separator, stripped), then every non-blank line is stripped and the
    lines are joined with single spaces.

    Returns the joined text, or ``None`` if anything raises while parsing.
    """
    try:
        extracted = BeautifulSoup(html_str, DEFAULT_PARSER).get_text(separator='\n', strip=True)
        # Strip each line once and keep only the ones that are non-empty afterwards.
        return " ".join(
            cleaned
            for raw_line in extracted.splitlines()
            if (cleaned := raw_line.strip())
        )
    except Exception as e:
        # Best-effort: log the failure and let the caller treat it as "no content".
        logger.error(f"{log_prefix} Error parsing HTML in thread: {e}")
        return None
|
531 |
+
content = await asyncio.to_thread(parse_html_sync, item["html"])
|
532 |
+
|
533 |
+
# Check content validity AFTER attempting all parsing methods
|
534 |
+
if content and isinstance(content, str) and len(content) > 30:
|
535 |
+
is_yt_actor = actor_id == APIFY_ACTOR_ID
|
536 |
+
log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
|
537 |
+
# Use 'url' which is the function parameter
|
538 |
+
logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Length: {len(content)}")
|
539 |
+
return content.strip()
|
540 |
+
else:
|
541 |
+
# Content is invalid or parsing failed
|
542 |
+
content_len = len(content) if content and isinstance(content, str) else 0
|
543 |
+
is_yt_actor = actor_id == APIFY_ACTOR_ID
|
544 |
+
log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
|
545 |
+
item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
|
546 |
+
# Use 'url' which is the function parameter
|
547 |
+
logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format for {url}. Item keys: {item_keys_str}. Length: {content_len}")
|
548 |
+
return None
|
549 |
+
# <<< CORRECTED INDENTATION FOR THIS ELSE >>>
|
550 |
else:
|
551 |
+
# Handle empty dataset list '[]'
|
552 |
logger.warning(f"[{actor_name}] Actor call successful (Status {response.status_code}) but dataset was empty for {url}. Response: {results}")
|
553 |
return None
|
554 |
# --- End of success processing logic ---
|