fmab777 committed on
Commit
de2d0f3
·
verified ·
1 Parent(s): 899ace9

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +48 -27
main.py CHANGED
@@ -558,36 +558,60 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
558
  logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
559
  sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
560
  params = {"token": api_token}
561
-
562
- # --- Define base input, adjust for specific actors ---
563
- run_input = { "startUrls": [{"url": url}], "maxCrawlPages": 1, "crawlerType": "playwright:firefox" }
564
- is_yt_actor = actor_id == APIFY_ACTOR_ID
565
- log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
566
-
567
- if is_yt_actor:
568
- # Use input specific to the default YT actor
569
- # REMOVED outputFormat: "singleStringText" as it seems unreliable based on observed output
570
- run_input = { "urls": [url], # Pass the URL correctly
571
- # "outputFormat": "singleStringText", # <<< REMOVED THIS LINE
572
- "maxRetries": 5, # Keep retries
573
- # Keep other flags as they might affect which data is returned overall
574
- "channelHandleBoolean": False,
575
- "channelNameBoolean": False,
576
- "datePublishedBoolean": False,
577
- "relativeDateTextBoolean": False }
578
- logger.debug(f"{log_prefix} Using YouTube-specific input (default array output expected)")
 
 
 
 
 
 
579
  elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
580
- run_input = { "urls": [url] }
581
- logger.debug(f"{log_prefix} Using simplified input for Text Scraper: {run_input}")
582
- # Add other actor-specific input adjustments here if necessary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
583
 
584
  headers = {"Content-Type": "application/json"}
585
  try:
586
  async with httpx.AsyncClient(timeout=120.0) as client:
587
- logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url}")
588
  response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
589
  logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
590
 
 
591
  if response.status_code in [200, 201]:
592
  if response.status_code == 201:
593
  logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
@@ -662,8 +686,8 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
662
  logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
663
  return None # Return None if no valid content found
664
  else:
665
- # Handle empty dataset list '[]'
666
- logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty for {url}. Response: {results}")
667
  return None
668
  # --- End of success processing logic ---
669
  except json.JSONDecodeError:
@@ -685,9 +709,6 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
685
  except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
686
  except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
687
 
688
- # --- Ensure YT Transcript function uses the generic one ---
689
- # You can simplify the get_transcript_via_apify function now
690
-
691
  async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
692
  """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
693
  global APIFY_ACTOR_ID
 
558
  logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
559
  sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
560
  params = {"token": api_token}
561
+ log_prefix = f"[{actor_name}]" # Use actor_name for logging prefix
562
+
563
+ # --- Define input based on actor ID ---
564
+ run_input: Dict[str, Any] = {} # Initialize empty dict
565
+
566
+ if actor_id == APIFY_ACTOR_ID:
567
+ # Input specific to the default YT actor (1s7eXiaukVuOr4Ueg)
568
+ run_input = {
569
+ "urls": [url],
570
+ "maxRetries": 5,
571
+ "channelHandleBoolean": False,
572
+ "channelNameBoolean": False,
573
+ "datePublishedBoolean": False,
574
+ "relativeDateTextBoolean": False
575
+ }
576
+ logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
577
+ elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< --- ADDED THIS BLOCK --- >>>
578
+ # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR)
579
+ # Based on the error message "Field input.urls is required"
580
+ run_input = {
581
+ "urls": [url]
582
+ # Add other specific parameters for this actor if needed/known
583
+ }
584
+ logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
585
  elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
586
+ # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR)
587
+ run_input = {
588
+ "urls": [url]
589
+ }
590
+ logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
591
+ elif actor_id == APIFY_CRAWLER_ACTOR_ID:
592
+ # Input specific to Website Content Crawler (aYG0l9s7dbB7j3gbS)
593
+ run_input = {
594
+ "startUrls": [{"url": url}],
595
+ "maxCrawlPages": 1,
596
+ "crawlerType": "playwright:firefox" # Or adjust as needed
597
+ # Add other parameters specific to the crawler if necessary
598
+ }
599
+ logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
600
+ else:
601
+ # Fallback default input if actor ID doesn't match known ones
602
+ # Using the simple {"urls": [url]} format seems safest for generic text/content extractors
603
+ run_input = {"urls": [url]}
604
+ logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
605
+
606
 
607
  headers = {"Content-Type": "application/json"}
608
  try:
609
  async with httpx.AsyncClient(timeout=120.0) as client:
610
+ logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {run_input}") # Log the input being sent
611
  response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
612
  logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
613
 
614
+ # --- Start of response processing (Remains the same as before) ---
615
  if response.status_code in [200, 201]:
616
  if response.status_code == 201:
617
  logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
 
686
  logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
687
  return None # Return None if no valid content found
688
  else:
689
+ # Handle empty dataset list '[]' or non-list response
690
+ logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
691
  return None
692
  # --- End of success processing logic ---
693
  except json.JSONDecodeError:
 
709
  except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
710
  except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
711
 
 
 
 
712
  async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
713
  """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
714
  global APIFY_ACTOR_ID