Update main.py

main.py CHANGED
@@ -564,9 +564,9 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
     run_input: Dict[str, Any] = {} # Initialize empty dict
 
     if actor_id == APIFY_ACTOR_ID:
-        # Input specific to the default YT actor (1s7eXiaukVuOr4Ueg)
+        # Input specific to the default YT actor (1s7eXiaukVuOr4Ueg) - Requires LIST
         run_input = {
-            "urls": [url],
+            "urls": [url], # <<< LIST format needed here
             "maxRetries": 5,
             "channelHandleBoolean": False,
             "channelNameBoolean": False,
@@ -574,40 +574,38 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
             "relativeDateTextBoolean": False
         }
         logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
-    elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< ---
-        # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR)
-        # Based on the error message "Field input.urls
+    elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< --- MODIFIED THIS BLOCK --- >>>
+        # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
+        # Based on the error message "Field input.urls must be string"
         run_input = {
-            "urls":
-            # Add other specific parameters for this actor if needed/known
+            "urls": url # <<< STRING format needed here, not list
         }
         logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
     elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
-        # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR)
+        # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
         run_input = {
-            "urls": [url]
+            "urls": [url] # <<< Assume LIST format standard here
         }
         logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
     elif actor_id == APIFY_CRAWLER_ACTOR_ID:
-        # Input specific to Website Content Crawler (aYG0l9s7dbB7j3gbS)
+        # Input specific to Website Content Crawler (aYG0l9s7dbB7j3gbS) - Uses startUrls
         run_input = {
-            "startUrls": [{"url": url}],
+            "startUrls": [{"url": url}], # <<< Different structure entirely
             "maxCrawlPages": 1,
             "crawlerType": "playwright:firefox" # Or adjust as needed
-            # Add other parameters specific to the crawler if necessary
         }
         logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
     else:
         # Fallback default input if actor ID doesn't match known ones
         # Using the simple {"urls": [url]} format seems safest for generic text/content extractors
-        run_input = {"urls": [url]}
+        run_input = {"urls": [url]} # <<< Default to LIST
         logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
 
 
     headers = {"Content-Type": "application/json"}
     try:
         async with httpx.AsyncClient(timeout=120.0) as client:
-            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {run_input}") # Log the input being sent
+            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent (using dumps for clarity)
             response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
             logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
 
@@ -624,7 +622,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                     # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
                     content = None # Reset content
 
-                    # --- REFINED PARSING LOGIC ---
+                    # --- REFINED PARSING LOGIC (Handles output from various actors) ---
                     if "text" in item and isinstance(item["text"], str):
                         logger.info(f"{log_prefix} Found text content in 'text' field.")
                         content = item["text"]
@@ -691,13 +689,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                     return None
                # --- End of success processing logic ---
                except json.JSONDecodeError:
-
-
+                    # Check if the raw text looks like a transcript if JSON fails
+                    raw_text = response.text
+                    if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
+                        logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
+                        return raw_text.strip()
+                    else:
+                        logger.error(f"{log_prefix} Failed JSON decode and no usable raw text found. Status:{response.status_code}. Resp:{raw_text[:200]}");
+                        return None
                except Exception as e:
                    logger.error(f"{log_prefix} Error processing success response (Status {response.status_code}) for {url}: {e}", exc_info=True);
                    return None
            # Error handling for API call itself
-            elif response.status_code == 400:
+            elif response.status_code == 400:
+                # Log the specific error message from the API response if available
+                error_msg = response.text[:200] # Default
+                try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
+                except Exception: pass
+                logger.error(f"{log_prefix} Bad Request (400) for {url}. Check run_input. API Msg: '{error_msg}'"); return None
            elif response.status_code == 401: logger.error(f"{log_prefix} Auth error (401). Check token."); return None
            elif response.status_code == 404: logger.error(f"{log_prefix} Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
            else:
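The substance of this change is that each Apify actor expects a differently shaped `run_input`. As a quick reference, here is a minimal sketch of the four shapes used above, written as a dispatch table rather than the if/elif chain in main.py. The helper name `build_run_input` and the table are illustrative, not code from this repo; the actor IDs and field values are taken from the comments in the diff.

```python
from typing import Any, Callable, Dict

# Hypothetical refactor sketch: map each known actor ID to a builder that
# returns the input shape that actor expects.
INPUT_BUILDERS: Dict[str, Callable[[str], Dict[str, Any]]] = {
    # Default YT actor (1s7eXiaukVuOr4Ueg): list of URLs plus boolean flags.
    "1s7eXiaukVuOr4Ueg": lambda url: {
        "urls": [url],
        "maxRetries": 5,
        "channelHandleBoolean": False,
        "channelNameBoolean": False,
        "relativeDateTextBoolean": False,
    },
    # Structured YT actor (gpjTCWkGZS1lHc9pR): a single URL string, not a list.
    "gpjTCWkGZS1lHc9pR": lambda url: {"urls": url},
    # Text Scraper Free (2gbQiRSpJIIag2FdR): list of URLs.
    "2gbQiRSpJIIag2FdR": lambda url: {"urls": [url]},
    # Website Content Crawler (aYG0l9s7dbB7j3gbS): startUrls objects.
    "aYG0l9s7dbB7j3gbS": lambda url: {
        "startUrls": [{"url": url}],
        "maxCrawlPages": 1,
        "crawlerType": "playwright:firefox",
    },
}


def build_run_input(actor_id: str, url: str) -> Dict[str, Any]:
    """Return the actor-specific input, falling back to {"urls": [url]}."""
    builder = INPUT_BUILDERS.get(actor_id, lambda u: {"urls": [u]})
    return builder(url)


# Example:
# build_run_input("gpjTCWkGZS1lHc9pR", "https://youtu.be/xyz")
#   -> {"urls": "https://youtu.be/xyz"}   (string, not list)
```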
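For context, the request that sends this input (only partially visible in the hunk above) targets what appears to be Apify's synchronous run endpoint and returns the run's dataset items directly. Below is a self-contained sketch of that call, assuming the standard `https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items` URL with the token passed as a query parameter; the function name and the 200/201 check are illustrative, not lifted from main.py.

```python
from typing import Any, Dict, List, Optional

import httpx


async def run_actor_sync(
    actor_id: str, api_token: str, run_input: Dict[str, Any]
) -> Optional[List[Dict[str, Any]]]:
    """Sketch: POST run_input to an Apify actor and return its dataset items, or None."""
    # Assumed endpoint shape; main.py builds this as `sync_items_endpoint`.
    endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
    params = {"token": api_token}
    headers = {"Content-Type": "application/json"}

    async with httpx.AsyncClient(timeout=120.0) as client:
        response = await client.post(endpoint, headers=headers, params=params, json=run_input)

    if response.status_code in (200, 201):
        try:
            # The endpoint returns the dataset items as a JSON array.
            return response.json()
        except ValueError:  # json.JSONDecodeError is a subclass of ValueError
            return None
    return None


# Usage (needs a real token):
#   items = asyncio.run(run_actor_sync("2gbQiRSpJIIag2FdR", token, {"urls": ["https://example.com"]}))
```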
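The parsing hunk only shows its first branch, which prefers a string `text` field on each returned item. A generic sketch of that pattern follows; only `"text"` is confirmed by the diff, and the other candidate keys are hypothetical examples of fields other actors might use.

```python
from typing import Any, Dict, Optional

# Only "text" is confirmed by the diff above; the remaining keys are
# hypothetical examples, not taken from main.py.
CANDIDATE_TEXT_FIELDS = ("text", "transcript", "captions", "content", "markdown")


def extract_text(item: Dict[str, Any]) -> Optional[str]:
    """Return the first non-empty string field found in a dataset item."""
    for key in CANDIDATE_TEXT_FIELDS:
        value = item.get(key)
        if isinstance(value, str) and value.strip():
            return value
    return None


# Example:
# extract_text({"text": "hello world"})    -> "hello world"
# extract_text({"title": "no text here"})  -> None
```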
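Finally, the new 400 handler digs the human-readable message out of Apify's error envelope (this is how the `"Field input.urls must be string"` message that motivated the change surfaces), and the new `JSONDecodeError` branch falls back to the raw body when it still looks like usable text. A standalone sketch of both behaviours, under the assumption that errors arrive as `{"error": {"message": ...}}`; the helper names are illustrative.

```python
from typing import Optional

import httpx


def apify_error_message(response: httpx.Response, limit: int = 200) -> str:
    """Best-effort extraction of the message from an Apify-style error envelope."""
    try:
        # Mirrors the diff: fall back to the raw body if the envelope is missing.
        return response.json().get("error", {}).get("message", response.text[:limit])
    except Exception:
        return response.text[:limit]


def raw_text_fallback(response: httpx.Response, min_length: int = 50) -> Optional[str]:
    """Mirror of the JSONDecodeError branch: accept the raw body if it looks like prose."""
    raw_text = response.text
    if raw_text and len(raw_text) > min_length and " " in raw_text:
        return raw_text.strip()
    return None


# Example with a synthetic response:
#   resp = httpx.Response(400, json={"error": {"message": "Field input.urls must be string"}})
#   apify_error_message(resp)  -> "Field input.urls must be string"
```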