fmab777 committed on
Commit
6f50520
·
verified ·
1 Parent(s): d8c9e4d

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +49 -32
main.py CHANGED
@@ -491,47 +491,64 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
491
  response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
492
  logger.debug(f"[{actor_name}] Received status code {response.status_code} for {url}")
493
 
494
- # <<< MODIFIED check for success: Accept 200 OR 201 >>>
495
  if response.status_code in [200, 201]:
496
- # Log if it was 201, as it's slightly unexpected for sync but apparently happens
497
  if response.status_code == 201:
498
  logger.info(f"[{actor_name}] Received status 201 (Created) from run-sync endpoint, processing results anyway.")
499
 
500
  try:
501
  results = response.json(); content = None
502
- # --- The rest of the success processing logic stays the same ---
503
  if isinstance(results, list) and len(results) > 0:
504
  item = results[0]
505
- content = None # Reset content
506
-
507
- # <<< MODIFIED PARSING LOGIC >>>
508
- if "text" in item and isinstance(item["text"], str):
509
- content = item["text"]
510
- elif "content" in item and isinstance(item["content"], str):
511
- content = item["content"]
512
- elif "markdown" in item and isinstance(item["markdown"], str):
513
- content = item["markdown"]
514
- # --- ADD THIS CHECK ---
515
- elif "captions" in item and isinstance(item["captions"], str):
516
- logger.info("[Apify YT] Found transcript text in 'captions' field.")
517
- content = item["captions"]
518
- # --- END ADDITION ---
519
- elif "html" in item and isinstance(item["html"], str):
520
- logger.warning(f"[Apify YT] No direct text/markdown/captions found, attempting to parse 'html' from result.")
521
- soup = BeautifulSoup(item["html"], DEFAULT_PARSER)
522
- content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
523
-
524
- # Check content validity AFTER parsing
525
- if content and isinstance(content, str) and len(content) > 30:
526
- logger.info(f"[Apify YT] Success via REST (Status {response.status_code}) for {video_url}. Length: {len(content)}")
527
- return content.strip()
528
- else:
529
- content_len = len(content) if content and isinstance(content, str) else 0
530
- # Adjust warning message if needed to show 'captions' was checked
531
- logger.warning(f"[Apify YT] Dataset item parsed (Status {response.status_code}) but final transcript content empty/short/invalid format for {video_url}. Item keys: {list(item.keys())}. Length: {content_len}")
532
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
533
  else:
534
- # Handle empty dataset list '[]' which might be returned with 200 or 201
535
  logger.warning(f"[{actor_name}] Actor call successful (Status {response.status_code}) but dataset was empty for {url}. Response: {results}")
536
  return None
537
  # --- End of success processing logic ---
 
491
  response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
492
  logger.debug(f"[{actor_name}] Received status code {response.status_code} for {url}")
493
 
 
494
  if response.status_code in [200, 201]:
 
495
  if response.status_code == 201:
496
  logger.info(f"[{actor_name}] Received status 201 (Created) from run-sync endpoint, processing results anyway.")
497
 
498
  try:
499
  results = response.json(); content = None
500
+ # Check if results is a list and not empty
501
  if isinstance(results, list) and len(results) > 0:
502
  item = results[0]
503
+ content = None # Reset content
504
+
505
+ # Parsing Logic - try extracting text from various possible keys
506
+ if "text" in item and isinstance(item["text"], str):
507
+ content = item["text"]
508
+ elif "content" in item and isinstance(item["content"], str):
509
+ content = item["content"]
510
+ elif "markdown" in item and isinstance(item["markdown"], str):
511
+ content = item["markdown"]
512
+ elif "captions" in item and isinstance(item["captions"], str):
513
+ # Specifically check if this is the YT actor by name or ID if needed,
514
+ # otherwise this assumes 'captions' might contain text for other actors too.
515
+ is_yt_actor = actor_id == APIFY_ACTOR_ID # Check if it's the specific YT actor
516
+ log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
517
+ logger.info(f"{log_prefix} Found text content in 'captions' field.")
518
+ content = item["captions"]
519
+ elif "html" in item and isinstance(item["html"], str):
520
+ is_yt_actor = actor_id == APIFY_ACTOR_ID
521
+ log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
522
+ logger.warning(f"{log_prefix} No direct text/markdown/captions found, attempting to parse 'html' from result.")
523
+ # Run potentially blocking BS4 parsing in a separate thread
524
+ def parse_html_sync(html_str):
525
+ try:
526
+ soup = BeautifulSoup(html_str, DEFAULT_PARSER)
527
+ return " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
528
+ except Exception as e:
529
+ logger.error(f"{log_prefix} Error parsing HTML in thread: {e}")
530
+ return None
531
+ content = await asyncio.to_thread(parse_html_sync, item["html"])
532
+
533
+ # Check content validity AFTER attempting all parsing methods
534
+ if content and isinstance(content, str) and len(content) > 30:
535
+ is_yt_actor = actor_id == APIFY_ACTOR_ID
536
+ log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
537
+ # Use 'url' which is the function parameter
538
+ logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Length: {len(content)}")
539
+ return content.strip()
540
+ else:
541
+ # Content is invalid or parsing failed
542
+ content_len = len(content) if content and isinstance(content, str) else 0
543
+ is_yt_actor = actor_id == APIFY_ACTOR_ID
544
+ log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
545
+ item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
546
+ # Use 'url' which is the function parameter
547
+ logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format for {url}. Item keys: {item_keys_str}. Length: {content_len}")
548
+ return None
549
+ # <<< CORRECTED INDENTATION FOR THIS ELSE >>>
550
  else:
551
+ # Handle empty dataset list '[]'
552
  logger.warning(f"[{actor_name}] Actor call successful (Status {response.status_code}) but dataset was empty for {url}. Response: {results}")
553
  return None
554
  # --- End of success processing logic ---