fmab777 committed
Commit 16d7cee · verified · 1 parent: 72136b9

Update main.py

Files changed (1)
  1. main.py +48 -36
main.py CHANGED
@@ -482,21 +482,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
     logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
     sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
     params = {"token": api_token}
-    # Define base input, adjust for specific actors
+
+    # --- Define base input, adjust for specific actors ---
     run_input = { "startUrls": [{"url": url}], "maxCrawlPages": 1, "crawlerType": "playwright:firefox" }
     is_yt_actor = actor_id == APIFY_ACTOR_ID
     log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"

     if is_yt_actor:
-        # Use input specific to the default YT actor if needed
-        run_input = { "urls": [video_url], # <--- Make sure video_url is passed correctly if this function is called directly for YT
-                      "outputFormat": "singleStringText", # Keep trying this format
-                      "maxRetries": 5,
+        # Use input specific to the default YT actor
+        # REMOVED outputFormat: "singleStringText" as it seems unreliable based on observed output
+        run_input = { "urls": [url], # Pass the URL correctly
+                      # "outputFormat": "singleStringText", # <<< REMOVED THIS LINE
+                      "maxRetries": 5, # Keep retries
+                      # Keep other flags as they might affect which data is returned overall
                       "channelHandleBoolean": False,
                       "channelNameBoolean": False,
                       "datePublishedBoolean": False,
                       "relativeDateTextBoolean": False }
-        logger.debug(f"{log_prefix} Using YouTube-specific input: { {k:v for k,v in run_input.items() if k != 'urls'} }") # Don't log URL twice
+        logger.debug(f"{log_prefix} Using YouTube-specific input (default array output expected)")
     elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
         run_input = { "urls": [url] }
         logger.debug(f"{log_prefix} Using simplified input for Text Scraper: {run_input}")
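Editor's note: for context, a minimal sketch of how the run-sync-get-dataset-items endpoint is likely called with the run_input built above. It assumes httpx (consistent with the httpx.RequestError handler later in this function); the helper name run_actor_sync and the timeout value are illustrative, not the repo's exact code.

import httpx

async def run_actor_sync(sync_items_endpoint: str, params: dict, run_input: dict) -> list:
    # POST the actor input; run-sync-get-dataset-items blocks until the run
    # finishes and returns the dataset items as a JSON array.
    async with httpx.AsyncClient(timeout=120.0) as client:  # timeout is an assumption
        response = await client.post(sync_items_endpoint, params=params, json=run_input)
        response.raise_for_status()
        return response.json()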
@@ -517,7 +520,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
         results = response.json(); content = None
         if isinstance(results, list) and len(results) > 0:
             item = results[0]
-            # Optional: Add debug log here again if the next fix doesn't work
+            # Optional: Re-enable for deep debugging if needed
             # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
             content = None # Reset content

@@ -532,34 +535,45 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                 logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                 content = item["markdown"]
             elif "captions" in item and isinstance(item["captions"], str):
-                # This handles if outputFormat=singleStringText actually worked
+                # This case might still happen if the actor *sometimes* returns string
                 logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                 content = item["captions"]
-            # --- ADDED LIST HANDLING FOR CAPTIONS ---
+            # --- MODIFIED LIST HANDLING FOR CAPTIONS (Checks Documentation Examples) ---
             elif "captions" in item and isinstance(item["captions"], list):
-                logger.info(f"{log_prefix} Found 'captions' field as a list. Attempting to extract text.")
+                logger.info(f"{log_prefix} Found 'captions' field as a list. Processing based on documentation examples...")
                 transcript_parts = []
-                for segment in item["captions"]:
-                    if isinstance(segment, dict) and "text" in segment and isinstance(segment["text"], str):
-                        transcript_parts.append(segment["text"])
-                    elif isinstance(segment, str): # Handle if it's sometimes just a list of strings
-                        transcript_parts.append(segment)
+                if not item["captions"]: # Handle empty list case
+                    logger.warning(f"{log_prefix} 'captions' field is an empty list.")
+                else:
+                    # Check the type of the *first* element to decide parsing strategy
+                    first_element = item["captions"][0]
+                    if isinstance(first_element, str):
+                        # Assume list of strings (Example 1 in docs)
+                        logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
+                        transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
+                    elif isinstance(first_element, dict) and "text" in first_element:
+                        # Assume list of dictionaries (Example 2 in docs)
+                        logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
+                        transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
+                    else:
+                        logger.warning(f"{log_prefix} 'captions' list contains unexpected element types (first element type: {type(first_element)}). Cannot parse.")
+
                 if transcript_parts:
                     content = " ".join(transcript_parts).strip()
                     logger.info(f"{log_prefix} Successfully extracted transcript from list in 'captions'. Combined length: {len(content)}")
                 else:
-                    logger.warning(f"{log_prefix} 'captions' field was a list but contained no usable text segments.")
-            # --- END LIST HANDLING ---
+                    logger.warning(f"{log_prefix} Could not extract usable text from 'captions' list structure.")
+            # --- END MODIFIED LIST HANDLING ---
             elif "html" in item and isinstance(item["html"], str):
                 logger.warning(f"{log_prefix} No direct text/markdown/captions found, attempting to parse 'html' from result.")
-                def parse_html_sync(html_str): # Define sync function for threading
+                def parse_html_sync(html_str):
                     try:
                         soup = BeautifulSoup(html_str, DEFAULT_PARSER)
                         return " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                     except Exception as e:
                         logger.error(f"{log_prefix} Error parsing HTML in thread: {e}")
                         return None
-                content = await asyncio.to_thread(parse_html_sync, item["html"]) # Run in thread
+                content = await asyncio.to_thread(parse_html_sync, item["html"])

             # --- FINAL CONTENT CHECK ---
             if content and isinstance(content, str) and len(content) > 30:
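Editor's note: to make the new captions branch concrete, here are two hypothetical dataset items in the shapes the first-element type check distinguishes, plus a standalone mirror of the parsing logic. The join_captions helper and the sample data are invented for illustration; real actor output may carry more fields.

# Invented sample items: the two "captions" shapes handled above.
item_a = {"captions": ["never gonna give", "you up"]}                      # list of strings
item_b = {"captions": [{"text": "never gonna give", "start": 0.0},
                       {"text": "you up", "start": 1.2}]}                  # list of dicts

def join_captions(item: dict) -> str:
    # Mirrors the branch above: inspect the first element to pick a strategy.
    caps = item["captions"]
    if caps and isinstance(caps[0], str):
        parts = [seg for seg in caps if isinstance(seg, str)]
    else:
        parts = [seg.get("text", "") for seg in caps if isinstance(seg, dict) and "text" in seg]
    return " ".join(parts).strip()

assert join_captions(item_a) == join_captions(item_b) == "never gonna give you up"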
@@ -595,23 +609,21 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
     except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
     except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None

-# --- Important Note on Calling This Function ---
-# Make sure that when `get_youtube_transcript` calls `get_transcript_via_apify`,
-# it correctly passes the `video_url`. And if you refactor `get_transcript_via_apify`
-# to use `_run_apify_actor_for_web_content` directly, ensure the correct Apify Actor ID
-# and the `video_url` are passed.
-
-# Example refactor of get_transcript_via_apify (if you choose to do this):
-# async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
-#     """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
-#     global APIFY_ACTOR_ID
-#     # Note: The run_input logic specific to the YT actor is now inside _run_apify_actor_for_web_content
-#     return await _run_apify_actor_for_web_content(
-#         url=video_url, # Pass video_url as the 'url' parameter
-#         api_token=api_token,
-#         actor_id=APIFY_ACTOR_ID,
-#         actor_name="Apify YT" # Use specific name for logging
-#     )
+# --- Ensure YT Transcript function uses the generic one ---
+# You can simplify the get_transcript_via_apify function now
+
+async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
+    """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
+    global APIFY_ACTOR_ID
+    # The specific run_input logic is now handled within _run_apify_actor_for_web_content
+    # when it detects the actor_id matches APIFY_ACTOR_ID
+    logger.debug(f"[get_transcript_via_apify] Calling generic runner for URL: {video_url}")
+    return await _run_apify_actor_for_web_content(
+        url=video_url, # Pass video_url as the 'url' parameter
+        api_token=api_token,
+        actor_id=APIFY_ACTOR_ID,
+        actor_name="Apify YT" # Keep specific name for logging clarity
+    )

 async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
     """Fallback 4: Fetches website content using Apify Website Content Crawler."""
 