Update main.py

main.py (CHANGED)
@@ -482,93 +482,136 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
Before:

     logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
     sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
     params = {"token": api_token}
     run_input = { "startUrls": [{"url": url}], "maxCrawlPages": 1, "crawlerType": "playwright:firefox" }
-
     headers = {"Content-Type": "application/json"}
     try:
         async with httpx.AsyncClient(timeout=120.0) as client:
-            logger.debug(f"
             response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
-            logger.debug(f"

             if response.status_code in [200, 201]:
                 if response.status_code == 201:
-                    logger.info(f"

                 try:
                     results = response.json(); content = None
-                    # Check if results is a list and not empty
                     if isinstance(results, list) and len(results) > 0:
                         item = results[0]
                         content = None # Reset content

-                        #
                         if "text" in item and isinstance(item["text"], str):
                             content = item["text"]
                         elif "content" in item and isinstance(item["content"], str):
                             content = item["content"]
                         elif "markdown" in item and isinstance(item["markdown"], str):
                             content = item["markdown"]
                         elif "captions" in item and isinstance(item["captions"], str):
-                            #
-
-                            is_yt_actor = actor_id == APIFY_ACTOR_ID # Check if it's the specific YT actor
-                            log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
-                            logger.info(f"{log_prefix} Found text content in 'captions' field.")
                             content = item["captions"]
                         elif "html" in item and isinstance(item["html"], str):
-                            is_yt_actor = actor_id == APIFY_ACTOR_ID
-                            log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
                             logger.warning(f"{log_prefix} No direct text/markdown/captions found, attempting to parse 'html' from result.")
-
-                            def parse_html_sync(html_str):
                                 try:
                                     soup = BeautifulSoup(html_str, DEFAULT_PARSER)
                                     return " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                                 except Exception as e:
                                     logger.error(f"{log_prefix} Error parsing HTML in thread: {e}")
                                     return None
-                            content = await asyncio.to_thread(parse_html_sync, item["html"])

-                        #
                         if content and isinstance(content, str) and len(content) > 30:
-
-                            log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
-                            # Use 'url' which is the function parameter
-                            logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Length: {len(content)}")
                             return content.strip()
                         else:
-                            #
                             content_len = len(content) if content and isinstance(content, str) else 0
-                            is_yt_actor = actor_id == APIFY_ACTOR_ID
-                            log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
                             item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
-
-
-                            return None
-                    # <<< CORRECTED INDENTATION FOR THIS ELSE >>>
                     else:
                         # Handle empty dataset list '[]'
-                        logger.warning(f"
                         return None
                     # --- End of success processing logic ---
                 except json.JSONDecodeError:
-                    logger.error(f"
                     return None
                 except Exception as e:
-                    logger.error(f"
                     return None
-            #
-            elif response.status_code == 400: logger.error(f"
-            elif response.status_code == 401: logger.error(f"
-            elif response.status_code == 404: logger.error(f"
-            else:
-                logger.error(f"
                 return None
-
-    except httpx.
-    except httpx.
-    except

 async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
     """Fallback 4: Fetches website content using Apify Website Content Crawler."""
After:

     logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
     sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
     params = {"token": api_token}
+    # Define base input, adjust for specific actors
     run_input = { "startUrls": [{"url": url}], "maxCrawlPages": 1, "crawlerType": "playwright:firefox" }
+    is_yt_actor = actor_id == APIFY_ACTOR_ID
+    log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
+
+    if is_yt_actor:
+        # Use input specific to the default YT actor; the video URL arrives via the 'url' parameter
+        run_input = { "urls": [url],
+                      "outputFormat": "singleStringText", # Keep trying this format
+                      "maxRetries": 5,
+                      "channelHandleBoolean": False,
+                      "channelNameBoolean": False,
+                      "datePublishedBoolean": False,
+                      "relativeDateTextBoolean": False }
+        logger.debug(f"{log_prefix} Using YouTube-specific input: { {k: v for k, v in run_input.items() if k != 'urls'} }") # Don't log URL twice
+    elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
+        run_input = { "urls": [url] }
+        logger.debug(f"{log_prefix} Using simplified input for Text Scraper: {run_input}")
+    # Add other actor-specific input adjustments here if necessary
+
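Judging from the truncated signature in the hunk header and the names used in the body, the helper presumably takes url, api_token, actor_id, and actor_name. A hypothetical call site for the text-scraper fallback, sketched under that assumption (not part of the commit):

    # Hypothetical usage sketch. Assumes the signature
    # _run_apify_actor_for_web_content(url, api_token, actor_id, actor_name).
    async def fetch_article_text(url: str, api_token: str) -> Optional[str]:
        return await _run_apify_actor_for_web_content(
            url=url,
            api_token=api_token,
            actor_id=APIFY_TEXT_SCRAPER_ACTOR_ID,  # constant referenced in this diff
            actor_name="Apify Text Scraper",
        )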
     headers = {"Content-Type": "application/json"}
     try:
         async with httpx.AsyncClient(timeout=120.0) as client:
+            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url}")
             response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
+            logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")

             if response.status_code in [200, 201]:
                 if response.status_code == 201:
+                    logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")

                 try:
                     results = response.json(); content = None
                     if isinstance(results, list) and len(results) > 0:
                         item = results[0]
+                        # Optional: add the debug log here again if the next fix doesn't work
+                        # logger.debug(f"{log_prefix} [DEBUG] Full item received: {json.dumps(item, indent=2)}")
                         content = None # Reset content

+                        # --- REFINED PARSING LOGIC ---
                         if "text" in item and isinstance(item["text"], str):
+                            logger.info(f"{log_prefix} Found text content in 'text' field.")
                             content = item["text"]
                         elif "content" in item and isinstance(item["content"], str):
+                            logger.info(f"{log_prefix} Found text content in 'content' field.")
                             content = item["content"]
                         elif "markdown" in item and isinstance(item["markdown"], str):
+                            logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                             content = item["markdown"]
                         elif "captions" in item and isinstance(item["captions"], str):
+                            # This handles the case where outputFormat=singleStringText actually worked
+                            logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                             content = item["captions"]
+                        # --- ADDED LIST HANDLING FOR CAPTIONS ---
+                        elif "captions" in item and isinstance(item["captions"], list):
+                            logger.info(f"{log_prefix} Found 'captions' field as a list. Attempting to extract text.")
+                            transcript_parts = []
+                            for segment in item["captions"]:
+                                if isinstance(segment, dict) and "text" in segment and isinstance(segment["text"], str):
+                                    transcript_parts.append(segment["text"])
+                                elif isinstance(segment, str): # Handle if it's sometimes just a list of strings
+                                    transcript_parts.append(segment)
+                            if transcript_parts:
+                                content = " ".join(transcript_parts).strip()
+                                logger.info(f"{log_prefix} Successfully extracted transcript from list in 'captions'. Combined length: {len(content)}")
+                            else:
+                                logger.warning(f"{log_prefix} 'captions' field was a list but contained no usable text segments.")
+                        # --- END LIST HANDLING ---
                         elif "html" in item and isinstance(item["html"], str):
                             logger.warning(f"{log_prefix} No direct text/markdown/captions found, attempting to parse 'html' from result.")
+                            def parse_html_sync(html_str): # Define sync function for threading
                                 try:
                                     soup = BeautifulSoup(html_str, DEFAULT_PARSER)
                                     return " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                                 except Exception as e:
                                     logger.error(f"{log_prefix} Error parsing HTML in thread: {e}")
                                     return None
+                            content = await asyncio.to_thread(parse_html_sync, item["html"]) # Run in thread
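The list-handling branch is the substantive change in this hunk. A minimal standalone sketch of the segment shapes it tolerates, assumed from the parsing code above rather than from the actor's documented output:

    # Assumed caption shapes; the real actor output may differ.
    captions = [{"text": "first line"}, "second line", {"start": 1.2, "text": "third line"}]
    parts = []
    for seg in captions:
        if isinstance(seg, dict) and isinstance(seg.get("text"), str):
            parts.append(seg["text"])
        elif isinstance(seg, str):
            parts.append(seg)
    print(" ".join(parts))  # first line second line third line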

+                        # --- FINAL CONTENT CHECK ---
                         if content and isinstance(content, str) and len(content) > 30:
+                            logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
                             return content.strip()
                         else:
+                            # Log failure after trying all parsing methods
                             content_len = len(content) if content and isinstance(content, str) else 0
                             item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
+                            logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
+                            return None # Return None if no valid content found
                     else:
                         # Handle empty dataset list '[]'
+                        logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty for {url}. Response: {results}")
                         return None
                     # --- End of success processing logic ---
                 except json.JSONDecodeError:
+                    logger.error(f"{log_prefix} Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}");
                     return None
                 except Exception as e:
+                    logger.error(f"{log_prefix} Error processing success response (Status {response.status_code}) for {url}: {e}", exc_info=True);
                     return None
+            # Error handling for API call itself
+            elif response.status_code == 400: logger.error(f"{log_prefix} Bad Request (400) for {url}. Check run_input. Resp:{response.text[:200]}"); return None
+            elif response.status_code == 401: logger.error(f"{log_prefix} Auth error (401). Check token."); return None
+            elif response.status_code == 404: logger.error(f"{log_prefix} Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
+            else:
+                logger.error(f"{log_prefix} Unexpected status {response.status_code} for {url}. Resp:{response.text[:200]}");
                 return None
+    # Error handling for network/client issues
+    except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
+    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None # Only raised if raise_for_status() were used; kept as a safety net
+    except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
+    except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
+
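The HTML fallback offloads BeautifulSoup to a worker thread because parsing is synchronous and would otherwise stall the event loop while it runs. A self-contained sketch of the same pattern, with "html.parser" standing in for the file's DEFAULT_PARSER constant:

    import asyncio
    from bs4 import BeautifulSoup

    def extract_text(html: str) -> str:
        # Blocking parse: safe to run via asyncio.to_thread.
        return BeautifulSoup(html, "html.parser").get_text(separator="\n", strip=True)

    async def main() -> None:
        text = await asyncio.to_thread(extract_text, "<p>hello</p><p>world</p>")
        print(text)

    asyncio.run(main())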
+# --- Important Note on Calling This Function ---
+# Make sure that when `get_youtube_transcript` calls `get_transcript_via_apify`,
+# it correctly passes the `video_url`, and if you refactor `get_transcript_via_apify`
+# to use `_run_apify_actor_for_web_content` directly, ensure the correct Apify Actor ID
+# and the `video_url` are passed.
+
+# Example refactor of get_transcript_via_apify (if you choose to do this):
+# async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
+#     """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
+#     global APIFY_ACTOR_ID
+#     # Note: the run_input logic specific to the YT actor now lives inside _run_apify_actor_for_web_content
+#     return await _run_apify_actor_for_web_content(
+#         url=video_url, # Pass video_url as the 'url' parameter
+#         api_token=api_token,
+#         actor_id=APIFY_ACTOR_ID,
+#         actor_name="Apify YT" # Use specific name for logging
+#     )

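If that refactor were adopted, a call site might look like this (sketch only; VIDEO_ID is a placeholder and the surrounding fallback chain is assumed):

    # Hypothetical call into the refactored helper, not part of the commit.
    async def example(api_token: str) -> None:
        transcript = await get_transcript_via_apify(
            video_url="https://www.youtube.com/watch?v=VIDEO_ID",  # placeholder ID
            api_token=api_token,
        )
        if transcript:
            print(f"Transcript length: {len(transcript)}")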
 async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
     """Fallback 4: Fetches website content using Apify Website Content Crawler."""
|