Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -558,36 +558,60 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
558 |
logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
|
559 |
sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
|
560 |
params = {"token": api_token}
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
-
|
576 |
-
|
577 |
-
|
578 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
579 |
elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
|
580 |
-
|
581 |
-
|
582 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
583 |
|
584 |
headers = {"Content-Type": "application/json"}
|
585 |
try:
|
586 |
async with httpx.AsyncClient(timeout=120.0) as client:
|
587 |
-
logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url}")
|
588 |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
|
589 |
logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
|
590 |
|
|
|
591 |
if response.status_code in [200, 201]:
|
592 |
if response.status_code == 201:
|
593 |
logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
|
@@ -662,8 +686,8 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
662 |
logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
|
663 |
return None # Return None if no valid content found
|
664 |
else:
|
665 |
-
# Handle empty dataset list '[]'
|
666 |
-
logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty for {url}. Response: {results}")
|
667 |
return None
|
668 |
# --- End of success processing logic ---
|
669 |
except json.JSONDecodeError:
|
@@ -685,9 +709,6 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
|
|
685 |
except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
|
686 |
except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
|
687 |
|
688 |
-
# --- Ensure YT Transcript function uses the generic one ---
|
689 |
-
# You can simplify the get_transcript_via_apify function now
|
690 |
-
|
691 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
|
692 |
"""Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
|
693 |
global APIFY_ACTOR_ID
|
|
|
558 |
logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
|
559 |
sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
|
560 |
params = {"token": api_token}
|
561 |
+
log_prefix = f"[{actor_name}]" # Use actor_name for logging prefix
|
562 |
+
|
563 |
+
# --- Define input based on actor ID ---
|
564 |
+
run_input: Dict[str, Any] = {} # Initialize empty dict
|
565 |
+
|
566 |
+
if actor_id == APIFY_ACTOR_ID:
|
567 |
+
# Input specific to the default YT actor (1s7eXiaukVuOr4Ueg)
|
568 |
+
run_input = {
|
569 |
+
"urls": [url],
|
570 |
+
"maxRetries": 5,
|
571 |
+
"channelHandleBoolean": False,
|
572 |
+
"channelNameBoolean": False,
|
573 |
+
"datePublishedBoolean": False,
|
574 |
+
"relativeDateTextBoolean": False
|
575 |
+
}
|
576 |
+
logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
|
577 |
+
elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< --- ADDED THIS BLOCK --- >>>
|
578 |
+
# Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR)
|
579 |
+
# Based on the error message "Field input.urls is required"
|
580 |
+
run_input = {
|
581 |
+
"urls": [url]
|
582 |
+
# Add other specific parameters for this actor if needed/known
|
583 |
+
}
|
584 |
+
logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
|
585 |
elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
|
586 |
+
# Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR)
|
587 |
+
run_input = {
|
588 |
+
"urls": [url]
|
589 |
+
}
|
590 |
+
logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
|
591 |
+
elif actor_id == APIFY_CRAWLER_ACTOR_ID:
|
592 |
+
# Input specific to Website Content Crawler (aYG0l9s7dbB7j3gbS)
|
593 |
+
run_input = {
|
594 |
+
"startUrls": [{"url": url}],
|
595 |
+
"maxCrawlPages": 1,
|
596 |
+
"crawlerType": "playwright:firefox" # Or adjust as needed
|
597 |
+
# Add other parameters specific to the crawler if necessary
|
598 |
+
}
|
599 |
+
logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
|
600 |
+
else:
|
601 |
+
# Fallback default input if actor ID doesn't match known ones
|
602 |
+
# Using the simple {"urls": [url]} format seems safest for generic text/content extractors
|
603 |
+
run_input = {"urls": [url]}
|
604 |
+
logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
|
605 |
+
|
606 |
|
607 |
headers = {"Content-Type": "application/json"}
|
608 |
try:
|
609 |
async with httpx.AsyncClient(timeout=120.0) as client:
|
610 |
+
logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {run_input}") # Log the input being sent
|
611 |
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
|
612 |
logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
|
613 |
|
614 |
+
# --- Start of response processing (Remains the same as before) ---
|
615 |
if response.status_code in [200, 201]:
|
616 |
if response.status_code == 201:
|
617 |
logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
|
|
|
686 |
logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
|
687 |
return None # Return None if no valid content found
|
688 |
else:
|
689 |
+
# Handle empty dataset list '[]' or non-list response
|
690 |
+
logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
|
691 |
return None
|
692 |
# --- End of success processing logic ---
|
693 |
except json.JSONDecodeError:
|
|
|
709 |
except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
|
710 |
except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
|
711 |
|
|
|
|
|
|
|
712 |
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
|
713 |
"""Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
|
714 |
global APIFY_ACTOR_ID
|