Update main.py
main.py CHANGED
@@ -482,21 +482,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
     logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
     sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
     params = {"token": api_token}
-
+
+    # --- Define base input, adjust for specific actors ---
     run_input = { "startUrls": [{"url": url}], "maxCrawlPages": 1, "crawlerType": "playwright:firefox" }
     is_yt_actor = actor_id == APIFY_ACTOR_ID
     log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
 
     if is_yt_actor:
-        # Use input specific to the default YT actor
-        run_input = { "urls": [url],
-                      "outputFormat": "singleStringText",
-                      "maxRetries": 5,
+        # Use input specific to the default YT actor
+        # REMOVED outputFormat: "singleStringText" as it seems unreliable based on observed output
+        run_input = { "urls": [url], # Pass the URL correctly
+                      # "outputFormat": "singleStringText", # <<< REMOVED THIS LINE
+                      "maxRetries": 5, # Keep retries
+                      # Keep other flags as they might affect which data is returned overall
                       "channelHandleBoolean": False,
                       "channelNameBoolean": False,
                       "datePublishedBoolean": False,
                       "relativeDateTextBoolean": False }
-        logger.debug(f"{log_prefix} Using YouTube-specific input")
+        logger.debug(f"{log_prefix} Using YouTube-specific input (default array output expected)")
     elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
         run_input = { "urls": [url] }
         logger.debug(f"{log_prefix} Using simplified input for Text Scraper: {run_input}")
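
Note: this hunk only changes how run_input is built; the actual call (outside the hunk) POSTs that input to the sync_items_endpoint defined above and treats the response body as a list of dataset items. Below is a minimal self-contained sketch of that call pattern, assuming httpx (which main.py already uses) and an illustrative 120-second timeout; run_actor_sync is a hypothetical name, not a function in main.py.

import httpx
from typing import Any, Optional

async def run_actor_sync(actor_id: str, api_token: str, run_input: dict) -> Optional[list]:
    """Hypothetical sketch: run an Apify actor synchronously and return its dataset items."""
    endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
    async with httpx.AsyncClient(timeout=120.0) as client:  # timeout value is an assumption
        # The actor input goes in the JSON body; the token rides along as a query parameter.
        response = await client.post(endpoint, params={"token": api_token}, json=run_input)
        response.raise_for_status()
        return response.json()  # On success: a list of dataset items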
@@ -517,7 +520,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
         results = response.json(); content = None
         if isinstance(results, list) and len(results) > 0:
             item = results[0]
-            # Optional:
+            # Optional: Re-enable for deep debugging if needed
             # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
             content = None # Reset content
 
@@ -532,34 +535,45 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                 logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                 content = item["markdown"]
             elif "captions" in item and isinstance(item["captions"], str):
-                # This
+                # This case might still happen if the actor *sometimes* returns string
                 logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                 content = item["captions"]
-            # ---
+            # --- MODIFIED LIST HANDLING FOR CAPTIONS (Checks Documentation Examples) ---
             elif "captions" in item and isinstance(item["captions"], list):
-                logger.info(f"{log_prefix} Found 'captions' field as a list.")
+                logger.info(f"{log_prefix} Found 'captions' field as a list. Processing based on documentation examples...")
                 transcript_parts = []
-
-
-
-
-
+                if not item["captions"]: # Handle empty list case
+                    logger.warning(f"{log_prefix} 'captions' field is an empty list.")
+                else:
+                    # Check the type of the *first* element to decide parsing strategy
+                    first_element = item["captions"][0]
+                    if isinstance(first_element, str):
+                        # Assume list of strings (Example 1 in docs)
+                        logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
+                        transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
+                    elif isinstance(first_element, dict) and "text" in first_element:
+                        # Assume list of dictionaries (Example 2 in docs)
+                        logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
+                        transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
+                    else:
+                        logger.warning(f"{log_prefix} 'captions' list contains unexpected element types (first element type: {type(first_element)}). Cannot parse.")
+
                 if transcript_parts:
                     content = " ".join(transcript_parts).strip()
                     logger.info(f"{log_prefix} Successfully extracted transcript from list in 'captions'. Combined length: {len(content)}")
                 else:
-                    logger.warning(f"{log_prefix} Could not extract text from 'captions' list.")
-            # --- END LIST HANDLING ---
+                    logger.warning(f"{log_prefix} Could not extract usable text from 'captions' list structure.")
+            # --- END MODIFIED LIST HANDLING ---
             elif "html" in item and isinstance(item["html"], str):
                 logger.warning(f"{log_prefix} No direct text/markdown/captions found, attempting to parse 'html' from result.")
-                def parse_html_sync(html_str):
+                def parse_html_sync(html_str):
                     try:
                         soup = BeautifulSoup(html_str, DEFAULT_PARSER)
                         return " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                     except Exception as e:
                         logger.error(f"{log_prefix} Error parsing HTML in thread: {e}")
                         return None
-                content = await asyncio.to_thread(parse_html_sync, item["html"])
+                content = await asyncio.to_thread(parse_html_sync, item["html"])
 
             # --- FINAL CONTENT CHECK ---
             if content and isinstance(content, str) and len(content) > 30:
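
Note: the branch added above distinguishes the two captions shapes the actor's documentation shows (a flat list of strings, or a list of {"text": ...} dicts). The same normalization can be exercised in isolation; this is a sketch with made-up sample data, and captions_to_text is a hypothetical helper, not part of main.py.

from typing import Optional

def captions_to_text(captions: list) -> Optional[str]:
    """Hypothetical sketch: flatten a 'captions' list into one transcript string."""
    if not captions:
        return None  # Empty list: nothing to extract
    if isinstance(captions[0], str):
        # Shape 1: list of plain strings
        parts = [seg for seg in captions if isinstance(seg, str)]
    elif isinstance(captions[0], dict):
        # Shape 2: list of dicts carrying a 'text' key
        parts = [seg.get("text", "") for seg in captions if isinstance(seg, dict) and "text" in seg]
    else:
        return None  # Unexpected element type
    text = " ".join(parts).strip()
    return text or None

# The two documented shapes, with made-up segments:
assert captions_to_text(["hello", "world"]) == "hello world"
assert captions_to_text([{"text": "hello"}, {"text": "world"}]) == "hello world"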
@@ -595,23 +609,21 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
     except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
     except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
 
-# ---
-#
-
-
-
-
-#
-#
-
-
-#
-
-
-#
-
-#     actor_name="Apify YT" # Use specific name for logging
-#     )
+# --- Ensure YT Transcript function uses the generic one ---
+# You can simplify the get_transcript_via_apify function now
+
+async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
+    """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
+    global APIFY_ACTOR_ID
+    # The specific run_input logic is now handled within _run_apify_actor_for_web_content
+    # when it detects the actor_id matches APIFY_ACTOR_ID
+    logger.debug(f"[get_transcript_via_apify] Calling generic runner for URL: {video_url}")
+    return await _run_apify_actor_for_web_content(
+        url=video_url, # Pass video_url as the 'url' parameter
+        api_token=api_token,
+        actor_id=APIFY_ACTOR_ID,
+        actor_name="Apify YT" # Keep specific name for logging clarity
+    )
 
 async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
     """Fallback 4: Fetches website content using Apify Website Content Crawler."""
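
Note: with the old commented-out block deleted, get_transcript_via_apify is now a thin wrapper over the generic runner, so a caller only needs a video URL and a token. A hedged usage sketch; the URL and token below are placeholders, and real token handling lives elsewhere in main.py.

import asyncio

async def demo() -> None:
    transcript = await get_transcript_via_apify(
        video_url="https://www.youtube.com/watch?v=<VIDEO_ID>",  # placeholder URL
        api_token="<APIFY_API_TOKEN>",  # placeholder token
    )
    # Print a short preview rather than the full transcript
    print(transcript[:200] if transcript else "No transcript returned.")

asyncio.run(demo())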