fmab777 committed on
Commit
1bb77fa
·
verified ·
1 Parent(s): de2d0f3

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +27 -18
main.py CHANGED
@@ -564,9 +564,9 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
564
  run_input: Dict[str, Any] = {} # Initialize empty dict
565
 
566
  if actor_id == APIFY_ACTOR_ID:
567
- # Input specific to the default YT actor (1s7eXiaukVuOr4Ueg)
568
  run_input = {
569
- "urls": [url],
570
  "maxRetries": 5,
571
  "channelHandleBoolean": False,
572
  "channelNameBoolean": False,
@@ -574,40 +574,38 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
574
  "relativeDateTextBoolean": False
575
  }
576
  logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
577
- elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< --- ADDED THIS BLOCK --- >>>
578
- # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR)
579
- # Based on the error message "Field input.urls is required"
580
  run_input = {
581
- "urls": [url]
582
- # Add other specific parameters for this actor if needed/known
583
  }
584
  logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
585
  elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
586
- # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR)
587
  run_input = {
588
- "urls": [url]
589
  }
590
  logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
591
  elif actor_id == APIFY_CRAWLER_ACTOR_ID:
592
- # Input specific to Website Content Crawler (aYG0l9s7dbB7j3gbS)
593
  run_input = {
594
- "startUrls": [{"url": url}],
595
  "maxCrawlPages": 1,
596
  "crawlerType": "playwright:firefox" # Or adjust as needed
597
- # Add other parameters specific to the crawler if necessary
598
  }
599
  logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
600
  else:
601
  # Fallback default input if actor ID doesn't match known ones
602
  # Using the simple {"urls": [url]} format seems safest for generic text/content extractors
603
- run_input = {"urls": [url]}
604
  logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
605
 
606
 
607
  headers = {"Content-Type": "application/json"}
608
  try:
609
  async with httpx.AsyncClient(timeout=120.0) as client:
610
- logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {run_input}") # Log the input being sent
611
  response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
612
  logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
613
 
@@ -624,7 +622,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
624
  # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
625
  content = None # Reset content
626
 
627
- # --- REFINED PARSING LOGIC ---
628
  if "text" in item and isinstance(item["text"], str):
629
  logger.info(f"{log_prefix} Found text content in 'text' field.")
630
  content = item["text"]
@@ -691,13 +689,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
691
  return None
692
  # --- End of success processing logic ---
693
  except json.JSONDecodeError:
694
- logger.error(f"{log_prefix} Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}");
695
- return None
 
 
 
 
 
 
696
  except Exception as e:
697
  logger.error(f"{log_prefix} Error processing success response (Status {response.status_code}) for {url}: {e}", exc_info=True);
698
  return None
699
  # Error handling for API call itself
700
- elif response.status_code == 400: logger.error(f"{log_prefix} Bad Request (400) for {url}. Check run_input. Resp:{response.text[:200]}"); return None
 
 
 
 
 
701
  elif response.status_code == 401: logger.error(f"{log_prefix} Auth error (401). Check token."); return None
702
  elif response.status_code == 404: logger.error(f"{log_prefix} Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
703
  else:
 
564
  run_input: Dict[str, Any] = {} # Initialize empty dict
565
 
566
  if actor_id == APIFY_ACTOR_ID:
567
+ # Input specific to the default YT actor (1s7eXiaukVuOr4Ueg) - Requires LIST
568
  run_input = {
569
+ "urls": [url], # <<< LIST format needed here
570
  "maxRetries": 5,
571
  "channelHandleBoolean": False,
572
  "channelNameBoolean": False,
 
574
  "relativeDateTextBoolean": False
575
  }
576
  logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
577
+ elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< --- MODIFIED THIS BLOCK --- >>>
578
+ # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
579
+ # Based on the error message "Field input.urls must be string"
580
  run_input = {
581
+ "urls": url # <<< STRING format needed here, not list
 
582
  }
583
  logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
584
  elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
585
+ # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
586
  run_input = {
587
+ "urls": [url] # <<< Assume LIST format standard here
588
  }
589
  logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
590
  elif actor_id == APIFY_CRAWLER_ACTOR_ID:
591
+ # Input specific to Website Content Crawler (aYG0l9s7dbB7j3gbS) - Uses startUrls
592
  run_input = {
593
+ "startUrls": [{"url": url}], # <<< Different structure entirely
594
  "maxCrawlPages": 1,
595
  "crawlerType": "playwright:firefox" # Or adjust as needed
 
596
  }
597
  logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
598
  else:
599
  # Fallback default input if actor ID doesn't match known ones
600
  # Using the simple {"urls": [url]} format seems safest for generic text/content extractors
601
+ run_input = {"urls": [url]} # <<< Default to LIST
602
  logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
603
 
604
 
605
  headers = {"Content-Type": "application/json"}
606
  try:
607
  async with httpx.AsyncClient(timeout=120.0) as client:
608
+ logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent (using dumps for clarity)
609
  response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
610
  logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
611
 
 
622
  # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
623
  content = None # Reset content
624
 
625
+ # --- REFINED PARSING LOGIC (Handles output from various actors) ---
626
  if "text" in item and isinstance(item["text"], str):
627
  logger.info(f"{log_prefix} Found text content in 'text' field.")
628
  content = item["text"]
 
689
  return None
690
  # --- End of success processing logic ---
691
  except json.JSONDecodeError:
692
+ # Check if the raw text looks like a transcript if JSON fails
693
+ raw_text = response.text
694
+ if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
695
+ logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
696
+ return raw_text.strip()
697
+ else:
698
+ logger.error(f"{log_prefix} Failed JSON decode and no usable raw text found. Status:{response.status_code}. Resp:{raw_text[:200]}");
699
+ return None
700
  except Exception as e:
701
  logger.error(f"{log_prefix} Error processing success response (Status {response.status_code}) for {url}: {e}", exc_info=True);
702
  return None
703
  # Error handling for API call itself
704
+ elif response.status_code == 400:
705
+ # Log the specific error message from the API response if available
706
+ error_msg = response.text[:200] # Default
707
+ try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
708
+ except Exception: pass
709
+ logger.error(f"{log_prefix} Bad Request (400) for {url}. Check run_input. API Msg: '{error_msg}'"); return None
710
  elif response.status_code == 401: logger.error(f"{log_prefix} Auth error (401). Check token."); return None
711
  elif response.status_code == 404: logger.error(f"{log_prefix} Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
712
  else: