Update main.py
main.py
CHANGED
@@ -482,21 +482,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
     logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
     sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
     params = {"token": api_token}
+
+    # --- Define base input, adjust for specific actors ---
     run_input = { "startUrls": [{"url": url}], "maxCrawlPages": 1, "crawlerType": "playwright:firefox" }
     is_yt_actor = actor_id == APIFY_ACTOR_ID
     log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
 
     if is_yt_actor:
+        # Use input specific to the default YT actor
+        # REMOVED outputFormat: "singleStringText" as it seems unreliable based on observed output
+        run_input = { "urls": [url], # Pass the URL correctly
+                      # "outputFormat": "singleStringText", # <<< REMOVED THIS LINE
+                      "maxRetries": 5, # Keep retries
+                      # Keep other flags as they might affect which data is returned overall
                       "channelHandleBoolean": False,
                       "channelNameBoolean": False,
                       "datePublishedBoolean": False,
                       "relativeDateTextBoolean": False }
+        logger.debug(f"{log_prefix} Using YouTube-specific input (default array output expected)")
     elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
         run_input = { "urls": [url] }
         logger.debug(f"{log_prefix} Using simplified input for Text Scraper: {run_input}")
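Note: Apify's run-sync-get-dataset-items endpoint runs the actor synchronously and returns the run's dataset items as a JSON array, which is why the code further down works with results[0]. A minimal sketch of such a call with httpx, assuming a plausible client setup and timeout (the actual request logic sits outside these hunks, so the helper name and values here are illustrative, not main.py's):

    import httpx

    async def run_actor_sync(actor_id: str, api_token: str, run_input: dict) -> list:
        # Same endpoint shape as sync_items_endpoint in the hunk above.
        endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
        async with httpx.AsyncClient(timeout=120.0) as client:  # timeout is an assumption
            # The token travels as a query parameter; the actor input is the JSON body.
            response = await client.post(endpoint, params={"token": api_token}, json=run_input)
            response.raise_for_status()
            return response.json()  # list of dataset items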
@@ -517,7 +520,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
         results = response.json(); content = None
         if isinstance(results, list) and len(results) > 0:
             item = results[0]
-            # Optional:
+            # Optional: Re-enable for deep debugging if needed
             # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
             content = None # Reset content
 
@@ -532,34 +535,45 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                 logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                 content = item["markdown"]
             elif "captions" in item and isinstance(item["captions"], str):
+                # This case might still happen if the actor *sometimes* returns string
                 logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                 content = item["captions"]
+            # --- MODIFIED LIST HANDLING FOR CAPTIONS (Checks Documentation Examples) ---
             elif "captions" in item and isinstance(item["captions"], list):
+                logger.info(f"{log_prefix} Found 'captions' field as a list. Processing based on documentation examples...")
                 transcript_parts = []
+                if not item["captions"]: # Handle empty list case
+                    logger.warning(f"{log_prefix} 'captions' field is an empty list.")
+                else:
+                    # Check the type of the *first* element to decide parsing strategy
+                    first_element = item["captions"][0]
+                    if isinstance(first_element, str):
+                        # Assume list of strings (Example 1 in docs)
+                        logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
+                        transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
+                    elif isinstance(first_element, dict) and "text" in first_element:
+                        # Assume list of dictionaries (Example 2 in docs)
+                        logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
+                        transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
+                    else:
+                        logger.warning(f"{log_prefix} 'captions' list contains unexpected element types (first element type: {type(first_element)}). Cannot parse.")
+
                 if transcript_parts:
                     content = " ".join(transcript_parts).strip()
                     logger.info(f"{log_prefix} Successfully extracted transcript from list in 'captions'. Combined length: {len(content)}")
                 else:
+                    logger.warning(f"{log_prefix} Could not extract usable text from 'captions' list structure.")
-            # --- END LIST HANDLING ---
+            # --- END MODIFIED LIST HANDLING ---
             elif "html" in item and isinstance(item["html"], str):
                 logger.warning(f"{log_prefix} No direct text/markdown/captions found, attempting to parse 'html' from result.")
+                def parse_html_sync(html_str):
                     try:
                         soup = BeautifulSoup(html_str, DEFAULT_PARSER)
                         return " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                     except Exception as e:
                         logger.error(f"{log_prefix} Error parsing HTML in thread: {e}")
                         return None
+                content = await asyncio.to_thread(parse_html_sync, item["html"])
 
             # --- FINAL CONTENT CHECK ---
             if content and isinstance(content, str) and len(content) > 30:
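The two list shapes handled above match the captions examples in the actor documentation: a plain list of strings, and a list of segment dictionaries carrying a "text" key. A standalone sketch of the same first-element type sniffing, using hypothetical sample items rather than real actor output:

    # Hypothetical items; field names other than "captions"/"text" and all values are illustrative.
    item_strings = {"captions": ["first line", "second line"]}
    item_dicts = {"captions": [{"start": "0:00", "text": "first line"},
                               {"start": "0:04", "text": "second line"}]}

    def join_captions(captions: list) -> str:
        # Mirrors the branch above: inspect the first element to pick a strategy.
        if not captions:
            return ""
        if isinstance(captions[0], str):
            parts = [seg for seg in captions if isinstance(seg, str)]
        elif isinstance(captions[0], dict):
            parts = [seg.get("text", "") for seg in captions if isinstance(seg, dict)]
        else:
            parts = []
        return " ".join(parts).strip()

    assert join_captions(item_strings["captions"]) == join_captions(item_dicts["captions"]) == "first line second line"

Sniffing only the first element assumes a homogeneous list; mixed lists simply fall through the per-element isinstance filters, which is the same trade-off the diff makes.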
@@ -595,23 +609,21 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
     except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
     except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
 
+# --- Ensure YT Transcript function uses the generic one ---
+# You can simplify the get_transcript_via_apify function now
+
+async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
+    """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
+    global APIFY_ACTOR_ID
+    # The specific run_input logic is now handled within _run_apify_actor_for_web_content
+    # when it detects the actor_id matches APIFY_ACTOR_ID
+    logger.debug(f"[get_transcript_via_apify] Calling generic runner for URL: {video_url}")
+    return await _run_apify_actor_for_web_content(
+        url=video_url, # Pass video_url as the 'url' parameter
+        api_token=api_token,
+        actor_id=APIFY_ACTOR_ID,
+        actor_name="Apify YT" # Keep specific name for logging clarity
+    )
 
 async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
     """Fallback 4: Fetches website content using Apify Website Content Crawler."""
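With the old commented-out block replaced by this thin wrapper, callers only supply the video URL and an API token; the YouTube-specific run_input is assembled inside the generic runner. A hypothetical usage sketch, assuming it runs in the same module as the function above (the environment variable name and the URL are placeholders, not taken from main.py):

    import asyncio
    import os

    async def demo() -> None:
        token = os.environ["APIFY_API_TOKEN"]  # assumed variable name
        transcript = await get_transcript_via_apify(
            "https://www.youtube.com/watch?v=VIDEO_ID", token)
        if transcript:
            print(f"Transcript length: {len(transcript)}")
        else:
            print("Apify fallback returned no usable transcript.")

    if __name__ == "__main__":
        asyncio.run(demo())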