fmab777 committed on
Commit
72136b9
·
verified ·
1 Parent(s): 6f50520

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +84 -41
main.py CHANGED
async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
    """Run an Apify actor synchronously and extract text content from its dataset.

    Calls the Apify ``run-sync-get-dataset-items`` REST endpoint for *actor_id*,
    then tries a series of known result keys (``text``, ``content``, ``markdown``,
    ``captions`` as string or list, ``html``) to pull usable text out of the first
    dataset item.

    Args:
        url: Page (or YouTube video) URL to fetch content for.
        api_token: Apify API token, sent as the ``token`` query parameter.
        actor_id: ID of the Apify actor to run; selects actor-specific run input.
        actor_name: Human-readable actor name, used only for log prefixes.

    Returns:
        The extracted text content (stripped) when longer than 30 characters,
        otherwise ``None``. All failures are logged and swallowed — callers treat
        this as a best-effort fallback.
    """
    logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
    sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
    params = {"token": api_token}

    # Define base input, adjusted for specific actors below.
    run_input = {"startUrls": [{"url": url}], "maxCrawlPages": 1, "crawlerType": "playwright:firefox"}
    is_yt_actor = actor_id == APIFY_ACTOR_ID
    log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"

    if is_yt_actor:
        # Input specific to the default YouTube transcript actor.
        # BUG FIX: the original referenced `video_url`, which is not a parameter of
        # this function (NameError at runtime) — the video URL arrives as `url`.
        run_input = {
            "urls": [url],
            "outputFormat": "singleStringText",  # ask the actor for one combined transcript string
            "maxRetries": 5,
            "channelHandleBoolean": False,
            "channelNameBoolean": False,
            "datePublishedBoolean": False,
            "relativeDateTextBoolean": False,
        }
        logger.debug(f"{log_prefix} Using YouTube-specific input: { {k: v for k, v in run_input.items() if k != 'urls'} }")  # Don't log URL twice
    elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
        run_input = {"urls": [url]}
        logger.debug(f"{log_prefix} Using simplified input for Text Scraper: {run_input}")
    # Add other actor-specific input adjustments here if necessary.

    headers = {"Content-Type": "application/json"}
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url}")
            response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
            logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")

            if response.status_code in [200, 201]:
                if response.status_code == 201:
                    logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
                try:
                    results = response.json(); content = None
                    if isinstance(results, list) and len(results) > 0:
                        item = results[0]
                        content = None  # Reset content

                        # --- REFINED PARSING LOGIC: try known keys in priority order ---
                        if "text" in item and isinstance(item["text"], str):
                            logger.info(f"{log_prefix} Found text content in 'text' field.")
                            content = item["text"]
                        elif "content" in item and isinstance(item["content"], str):
                            logger.info(f"{log_prefix} Found text content in 'content' field.")
                            content = item["content"]
                        elif "markdown" in item and isinstance(item["markdown"], str):
                            logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                            content = item["markdown"]
                        elif "captions" in item and isinstance(item["captions"], str):
                            # This handles the case where outputFormat=singleStringText actually worked.
                            logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                            content = item["captions"]
                        elif "captions" in item and isinstance(item["captions"], list):
                            # The actor sometimes returns captions as a list of segments
                            # (dicts with a 'text' key, or plain strings) — join them.
                            logger.info(f"{log_prefix} Found 'captions' field as a list. Attempting to extract text.")
                            transcript_parts = []
                            for segment in item["captions"]:
                                if isinstance(segment, dict) and "text" in segment and isinstance(segment["text"], str):
                                    transcript_parts.append(segment["text"])
                                elif isinstance(segment, str):  # Handle if it's sometimes just a list of strings
                                    transcript_parts.append(segment)
                            if transcript_parts:
                                content = " ".join(transcript_parts).strip()
                                logger.info(f"{log_prefix} Successfully extracted transcript from list in 'captions'. Combined length: {len(content)}")
                            else:
                                logger.warning(f"{log_prefix} 'captions' field was a list but contained no usable text segments.")
                        elif "html" in item and isinstance(item["html"], str):
                            logger.warning(f"{log_prefix} No direct text/markdown/captions found, attempting to parse 'html' from result.")

                            # BeautifulSoup parsing can block; run it in a worker thread.
                            def parse_html_sync(html_str):
                                try:
                                    soup = BeautifulSoup(html_str, DEFAULT_PARSER)
                                    return " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                                except Exception as e:
                                    logger.error(f"{log_prefix} Error parsing HTML in thread: {e}")
                                    return None

                            content = await asyncio.to_thread(parse_html_sync, item["html"])

                        # --- FINAL CONTENT CHECK after all parsing attempts ---
                        if content and isinstance(content, str) and len(content) > 30:
                            logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
                            return content.strip()
                        else:
                            content_len = len(content) if content and isinstance(content, str) else 0
                            item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
                            logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
                            return None
                    else:
                        # Handle empty dataset list '[]'
                        logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty for {url}. Response: {results}")
                        return None
                except json.JSONDecodeError:
                    logger.error(f"{log_prefix} Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}")
                    return None
                except Exception as e:
                    logger.error(f"{log_prefix} Error processing success response (Status {response.status_code}) for {url}: {e}", exc_info=True)
                    return None
            # Error handling for the API call itself
            elif response.status_code == 400:
                logger.error(f"{log_prefix} Bad Request (400) for {url}. Check run_input. Resp:{response.text[:200]}"); return None
            elif response.status_code == 401:
                logger.error(f"{log_prefix} Auth error (401). Check token."); return None
            elif response.status_code == 404:
                logger.error(f"{log_prefix} Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
            else:
                logger.error(f"{log_prefix} Unexpected status {response.status_code} for {url}. Resp:{response.text[:200]}")
                return None
    # Error handling for network/client issues
    except httpx.TimeoutException as e:
        logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
    except httpx.HTTPStatusError as e:
        # Should be caught by the status-code checks above, but kept as a safety net.
        logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None
    except httpx.RequestError as e:
        logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
    except Exception as e:
        logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
 
616
  async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
617
  """Fallback 4: Fetches website content using Apify Website Content Crawler."""