Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -528,20 +528,32 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
528 |
"channelHandleBoolean": False,
|
529 |
"channelNameBoolean": False,
|
530 |
"datePublishedBoolean": False,
|
531 |
-
"relativeDateTextBoolean": False
|
|
|
|
|
|
|
|
|
|
|
|
|
532 |
}
|
533 |
-
logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
|
534 |
-
elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
|
535 |
# Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
|
536 |
-
# Based on the error message "Field input.urls must be string"
|
537 |
run_input = {
|
538 |
-
"urls": url # <<< STRING format needed here, not list
|
|
|
|
|
|
|
|
|
|
|
|
|
539 |
}
|
540 |
-
logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
|
541 |
elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
|
542 |
# Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
|
543 |
run_input = {
|
544 |
"urls": [url] # <<< Assume LIST format standard here
|
|
|
545 |
}
|
546 |
logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
|
547 |
elif actor_id == APIFY_CRAWLER_ACTOR_ID:
|
@@ -549,24 +561,30 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
549 |
run_input = {
|
550 |
"startUrls": [{"url": url}], # <<< Different structure entirely
|
551 |
"maxCrawlPages": 1,
|
552 |
-
"crawlerType": "playwright:firefox" # Or adjust as needed
|
|
|
|
|
|
|
|
|
|
|
|
|
553 |
}
|
554 |
logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
|
555 |
else:
|
556 |
# Fallback default input if actor ID doesn't match known ones
|
557 |
-
# Using the simple {"urls": [url]} format seems safest for generic text/content extractors
|
558 |
run_input = {"urls": [url]} # <<< Default to LIST
|
559 |
logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
|
560 |
|
561 |
|
562 |
headers = {"Content-Type": "application/json"}
|
563 |
try:
|
564 |
-
|
565 |
-
|
|
|
566 |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
|
567 |
logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
|
568 |
|
569 |
-
# --- Start of response processing
|
570 |
if response.status_code in [200, 201]:
|
571 |
if response.status_code == 201:
|
572 |
logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
|
@@ -575,8 +593,6 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
575 |
results = response.json(); content = None
|
576 |
if isinstance(results, list) and len(results) > 0:
|
577 |
item = results[0]
|
578 |
-
# Optional: Re-enable for deep debugging if needed
|
579 |
-
# logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
|
580 |
content = None # Reset content
|
581 |
|
582 |
# --- REFINED PARSING LOGIC (Handles output from various actors) ---
|
@@ -590,24 +606,20 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
590 |
logger.info(f"{log_prefix} Found text content in 'markdown' field.")
|
591 |
content = item["markdown"]
|
592 |
elif "captions" in item and isinstance(item["captions"], str):
|
593 |
-
# This case might still happen if the actor *sometimes* returns string
|
594 |
logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
|
595 |
content = item["captions"]
|
596 |
-
# --- MODIFIED LIST HANDLING FOR CAPTIONS
|
597 |
elif "captions" in item and isinstance(item["captions"], list):
|
598 |
-
logger.info(f"{log_prefix} Found 'captions' field as a list. Processing
|
599 |
transcript_parts = []
|
600 |
if not item["captions"]: # Handle empty list case
|
601 |
logger.warning(f"{log_prefix} 'captions' field is an empty list.")
|
602 |
else:
|
603 |
-
# Check the type of the *first* element to decide parsing strategy
|
604 |
first_element = item["captions"][0]
|
605 |
if isinstance(first_element, str):
|
606 |
-
# Assume list of strings (Example 1 in docs)
|
607 |
logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
|
608 |
transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
|
609 |
elif isinstance(first_element, dict) and "text" in first_element:
|
610 |
-
# Assume list of dictionaries (Example 2 in docs)
|
611 |
logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
|
612 |
transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
|
613 |
else:
|
@@ -635,18 +647,15 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
635 |
logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
|
636 |
return content.strip()
|
637 |
else:
|
638 |
-
# Log failure after trying all parsing methods
|
639 |
content_len = len(content) if content and isinstance(content, str) else 0
|
640 |
item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
|
641 |
logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
|
642 |
return None # Return None if no valid content found
|
643 |
else:
|
644 |
-
# Handle empty dataset list '[]' or non-list response
|
645 |
logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
|
646 |
return None
|
647 |
# --- End of success processing logic ---
|
648 |
except json.JSONDecodeError:
|
649 |
-
# Check if the raw text looks like a transcript if JSON fails
|
650 |
raw_text = response.text
|
651 |
if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
|
652 |
logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
|
@@ -659,7 +668,6 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
659 |
return None
|
660 |
# Error handling for API call itself
|
661 |
elif response.status_code == 400:
|
662 |
-
# Log the specific error message from the API response if available
|
663 |
error_msg = response.text[:200] # Default
|
664 |
try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
|
665 |
except Exception: pass
|
@@ -671,7 +679,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
671 |
return None
|
672 |
# Error handling for network/client issues
|
673 |
except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
|
674 |
-
except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None
|
675 |
except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
|
676 |
except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
|
677 |
|
|
|
528 |
"channelHandleBoolean": False,
|
529 |
"channelNameBoolean": False,
|
530 |
"datePublishedBoolean": False,
|
531 |
+
"relativeDateTextBoolean": False,
|
532 |
+
# --- ADDED RESIDENTIAL PROXY CONFIG ---
|
533 |
+
"proxyConfiguration": {
|
534 |
+
"useApifyProxy": True,
|
535 |
+
"apifyProxyGroups": ["RESIDENTIAL"]
|
536 |
+
}
|
537 |
+
# --- END ADDED PROXY CONFIG ---
|
538 |
}
|
539 |
+
logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID}) with Residential Proxy")
|
540 |
+
elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
|
541 |
# Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
|
|
|
542 |
run_input = {
|
543 |
+
"urls": url, # <<< STRING format needed here, not list
|
544 |
+
# --- ADDED RESIDENTIAL PROXY CONFIG ---
|
545 |
+
"proxyConfiguration": {
|
546 |
+
"useApifyProxy": True,
|
547 |
+
"apifyProxyGroups": ["RESIDENTIAL"]
|
548 |
+
}
|
549 |
+
# --- END ADDED PROXY CONFIG ---
|
550 |
}
|
551 |
+
logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID}) with Residential Proxy")
|
552 |
elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
|
553 |
# Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
|
554 |
run_input = {
|
555 |
"urls": [url] # <<< Assume LIST format standard here
|
556 |
+
# Note: Proxy config not added here by default, could be added if needed
|
557 |
}
|
558 |
logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
|
559 |
elif actor_id == APIFY_CRAWLER_ACTOR_ID:
|
|
|
561 |
run_input = {
|
562 |
"startUrls": [{"url": url}], # <<< Different structure entirely
|
563 |
"maxCrawlPages": 1,
|
564 |
+
"crawlerType": "playwright:firefox", # Or adjust as needed
|
565 |
+
# Note: Proxy config not added here by default, but Website Crawler often needs it.
|
566 |
+
# Example if needed:
|
567 |
+
# "proxyConfiguration": {
|
568 |
+
# "useApifyProxy": True,
|
569 |
+
# "apifyProxyGroups": ["RESIDENTIAL"] # Or other groups
|
570 |
+
# }
|
571 |
}
|
572 |
logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
|
573 |
else:
|
574 |
# Fallback default input if actor ID doesn't match known ones
|
|
|
575 |
run_input = {"urls": [url]} # <<< Default to LIST
|
576 |
logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
|
577 |
|
578 |
|
579 |
headers = {"Content-Type": "application/json"}
|
580 |
try:
|
581 |
+
# Increased timeout for potentially longer residential proxy connections/actor runs
|
582 |
+
async with httpx.AsyncClient(timeout=180.0) as client:
|
583 |
+
logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent
|
584 |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
|
585 |
logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
|
586 |
|
587 |
+
# --- Start of response processing ---
|
588 |
if response.status_code in [200, 201]:
|
589 |
if response.status_code == 201:
|
590 |
logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
|
|
|
593 |
results = response.json(); content = None
|
594 |
if isinstance(results, list) and len(results) > 0:
|
595 |
item = results[0]
|
|
|
|
|
596 |
content = None # Reset content
|
597 |
|
598 |
# --- REFINED PARSING LOGIC (Handles output from various actors) ---
|
|
|
606 |
logger.info(f"{log_prefix} Found text content in 'markdown' field.")
|
607 |
content = item["markdown"]
|
608 |
elif "captions" in item and isinstance(item["captions"], str):
|
|
|
609 |
logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
|
610 |
content = item["captions"]
|
611 |
+
# --- MODIFIED LIST HANDLING FOR CAPTIONS ---
|
612 |
elif "captions" in item and isinstance(item["captions"], list):
|
613 |
+
logger.info(f"{log_prefix} Found 'captions' field as a list. Processing...")
|
614 |
transcript_parts = []
|
615 |
if not item["captions"]: # Handle empty list case
|
616 |
logger.warning(f"{log_prefix} 'captions' field is an empty list.")
|
617 |
else:
|
|
|
618 |
first_element = item["captions"][0]
|
619 |
if isinstance(first_element, str):
|
|
|
620 |
logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
|
621 |
transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
|
622 |
elif isinstance(first_element, dict) and "text" in first_element:
|
|
|
623 |
logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
|
624 |
transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
|
625 |
else:
|
|
|
647 |
logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
|
648 |
return content.strip()
|
649 |
else:
|
|
|
650 |
content_len = len(content) if content and isinstance(content, str) else 0
|
651 |
item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
|
652 |
logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
|
653 |
return None # Return None if no valid content found
|
654 |
else:
|
|
|
655 |
logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
|
656 |
return None
|
657 |
# --- End of success processing logic ---
|
658 |
except json.JSONDecodeError:
|
|
|
659 |
raw_text = response.text
|
660 |
if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
|
661 |
logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
|
|
|
668 |
return None
|
669 |
# Error handling for API call itself
|
670 |
elif response.status_code == 400:
|
|
|
671 |
error_msg = response.text[:200] # Default
|
672 |
try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
|
673 |
except Exception: pass
|
|
|
679 |
return None
|
680 |
# Error handling for network/client issues
|
681 |
except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
|
682 |
+
except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None
|
683 |
except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
|
684 |
except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
|
685 |
|