fmab777 committed on
Commit
97fb38c
·
verified ·
1 Parent(s): 4afcd87

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +32 -24
main.py CHANGED
@@ -528,20 +528,32 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
528
  "channelHandleBoolean": False,
529
  "channelNameBoolean": False,
530
  "datePublishedBoolean": False,
531
- "relativeDateTextBoolean": False
 
 
 
 
 
 
532
  }
533
- logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
534
- elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< --- MODIFIED THIS BLOCK --- >>>
535
  # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
536
- # Based on the error message "Field input.urls must be string"
537
  run_input = {
538
- "urls": url # <<< STRING format needed here, not list
 
 
 
 
 
 
539
  }
540
- logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
541
  elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
542
  # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
543
  run_input = {
544
  "urls": [url] # <<< Assume LIST format standard here
 
545
  }
546
  logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
547
  elif actor_id == APIFY_CRAWLER_ACTOR_ID:
@@ -549,24 +561,30 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
549
  run_input = {
550
  "startUrls": [{"url": url}], # <<< Different structure entirely
551
  "maxCrawlPages": 1,
552
- "crawlerType": "playwright:firefox" # Or adjust as needed
 
 
 
 
 
 
553
  }
554
  logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
555
  else:
556
  # Fallback default input if actor ID doesn't match known ones
557
- # Using the simple {"urls": [url]} format seems safest for generic text/content extractors
558
  run_input = {"urls": [url]} # <<< Default to LIST
559
  logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
560
 
561
 
562
  headers = {"Content-Type": "application/json"}
563
  try:
564
- async with httpx.AsyncClient(timeout=120.0) as client:
565
- logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent (using dumps for clarity)
 
566
  response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
567
  logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
568
 
569
- # --- Start of response processing (Remains the same as before) ---
570
  if response.status_code in [200, 201]:
571
  if response.status_code == 201:
572
  logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
@@ -575,8 +593,6 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
575
  results = response.json(); content = None
576
  if isinstance(results, list) and len(results) > 0:
577
  item = results[0]
578
- # Optional: Re-enable for deep debugging if needed
579
- # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
580
  content = None # Reset content
581
 
582
  # --- REFINED PARSING LOGIC (Handles output from various actors) ---
@@ -590,24 +606,20 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
590
  logger.info(f"{log_prefix} Found text content in 'markdown' field.")
591
  content = item["markdown"]
592
  elif "captions" in item and isinstance(item["captions"], str):
593
- # This case might still happen if the actor *sometimes* returns string
594
  logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
595
  content = item["captions"]
596
- # --- MODIFIED LIST HANDLING FOR CAPTIONS (Checks Documentation Examples) ---
597
  elif "captions" in item and isinstance(item["captions"], list):
598
- logger.info(f"{log_prefix} Found 'captions' field as a list. Processing based on documentation examples...")
599
  transcript_parts = []
600
  if not item["captions"]: # Handle empty list case
601
  logger.warning(f"{log_prefix} 'captions' field is an empty list.")
602
  else:
603
- # Check the type of the *first* element to decide parsing strategy
604
  first_element = item["captions"][0]
605
  if isinstance(first_element, str):
606
- # Assume list of strings (Example 1 in docs)
607
  logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
608
  transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
609
  elif isinstance(first_element, dict) and "text" in first_element:
610
- # Assume list of dictionaries (Example 2 in docs)
611
  logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
612
  transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
613
  else:
@@ -635,18 +647,15 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
635
  logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
636
  return content.strip()
637
  else:
638
- # Log failure after trying all parsing methods
639
  content_len = len(content) if content and isinstance(content, str) else 0
640
  item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
641
  logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
642
  return None # Return None if no valid content found
643
  else:
644
- # Handle empty dataset list '[]' or non-list response
645
  logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
646
  return None
647
  # --- End of success processing logic ---
648
  except json.JSONDecodeError:
649
- # Check if the raw text looks like a transcript if JSON fails
650
  raw_text = response.text
651
  if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
652
  logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
@@ -659,7 +668,6 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
659
  return None
660
  # Error handling for API call itself
661
  elif response.status_code == 400:
662
- # Log the specific error message from the API response if available
663
  error_msg = response.text[:200] # Default
664
  try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
665
  except Exception: pass
@@ -671,7 +679,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
671
  return None
672
  # Error handling for network/client issues
673
  except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
674
- except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None # Should be caught by status code checks, but good practice
675
  except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
676
  except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
677
 
 
528
  "channelHandleBoolean": False,
529
  "channelNameBoolean": False,
530
  "datePublishedBoolean": False,
531
+ "relativeDateTextBoolean": False,
532
+ # --- ADDED RESIDENTIAL PROXY CONFIG ---
533
+ "proxyConfiguration": {
534
+ "useApifyProxy": True,
535
+ "apifyProxyGroups": ["RESIDENTIAL"]
536
+ }
537
+ # --- END ADDED PROXY CONFIG ---
538
  }
539
+ logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID}) with Residential Proxy")
540
+ elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
541
  # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
 
542
  run_input = {
543
+ "urls": url, # <<< STRING format needed here, not list
544
+ # --- ADDED RESIDENTIAL PROXY CONFIG ---
545
+ "proxyConfiguration": {
546
+ "useApifyProxy": True,
547
+ "apifyProxyGroups": ["RESIDENTIAL"]
548
+ }
549
+ # --- END ADDED PROXY CONFIG ---
550
  }
551
+ logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID}) with Residential Proxy")
552
  elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
553
  # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
554
  run_input = {
555
  "urls": [url] # <<< Assume LIST format standard here
556
+ # Note: Proxy config not added here by default, could be added if needed
557
  }
558
  logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
559
  elif actor_id == APIFY_CRAWLER_ACTOR_ID:
 
561
  run_input = {
562
  "startUrls": [{"url": url}], # <<< Different structure entirely
563
  "maxCrawlPages": 1,
564
+ "crawlerType": "playwright:firefox", # Or adjust as needed
565
+ # Note: Proxy config not added here by default, but Website Crawler often needs it.
566
+ # Example if needed:
567
+ # "proxyConfiguration": {
568
+ # "useApifyProxy": True,
569
+ # "apifyProxyGroups": ["RESIDENTIAL"] # Or other groups
570
+ # }
571
  }
572
  logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
573
  else:
574
  # Fallback default input if actor ID doesn't match known ones
 
575
  run_input = {"urls": [url]} # <<< Default to LIST
576
  logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
577
 
578
 
579
  headers = {"Content-Type": "application/json"}
580
  try:
581
+ # Increased timeout for potentially longer residential proxy connections/actor runs
582
+ async with httpx.AsyncClient(timeout=180.0) as client:
583
+ logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent
584
  response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
585
  logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
586
 
587
+ # --- Start of response processing ---
588
  if response.status_code in [200, 201]:
589
  if response.status_code == 201:
590
  logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
 
593
  results = response.json(); content = None
594
  if isinstance(results, list) and len(results) > 0:
595
  item = results[0]
 
 
596
  content = None # Reset content
597
 
598
  # --- REFINED PARSING LOGIC (Handles output from various actors) ---
 
606
  logger.info(f"{log_prefix} Found text content in 'markdown' field.")
607
  content = item["markdown"]
608
  elif "captions" in item and isinstance(item["captions"], str):
 
609
  logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
610
  content = item["captions"]
611
+ # --- MODIFIED LIST HANDLING FOR CAPTIONS ---
612
  elif "captions" in item and isinstance(item["captions"], list):
613
+ logger.info(f"{log_prefix} Found 'captions' field as a list. Processing...")
614
  transcript_parts = []
615
  if not item["captions"]: # Handle empty list case
616
  logger.warning(f"{log_prefix} 'captions' field is an empty list.")
617
  else:
 
618
  first_element = item["captions"][0]
619
  if isinstance(first_element, str):
 
620
  logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
621
  transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
622
  elif isinstance(first_element, dict) and "text" in first_element:
 
623
  logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
624
  transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
625
  else:
 
647
  logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
648
  return content.strip()
649
  else:
 
650
  content_len = len(content) if content and isinstance(content, str) else 0
651
  item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
652
  logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
653
  return None # Return None if no valid content found
654
  else:
 
655
  logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
656
  return None
657
  # --- End of success processing logic ---
658
  except json.JSONDecodeError:
 
659
  raw_text = response.text
660
  if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
661
  logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
 
668
  return None
669
  # Error handling for API call itself
670
  elif response.status_code == 400:
 
671
  error_msg = response.text[:200] # Default
672
  try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
673
  except Exception: pass
 
679
  return None
680
  # Error handling for network/client issues
681
  except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
682
+ except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None
683
  except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
684
  except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
685