Update main.py

main.py CHANGED
@@ -482,93 +482,136 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
     logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
     sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
     params = {"token": api_token}
+    # Define base input, adjust for specific actors
     run_input = { "startUrls": [{"url": url}], "maxCrawlPages": 1, "crawlerType": "playwright:firefox" }
+    is_yt_actor = actor_id == APIFY_ACTOR_ID
+    log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
+
+    if is_yt_actor:
+        # Use input specific to the default YT actor if needed
+        run_input = { "urls": [url],  # 'url' carries the video URL when this is called for YT (see note below)
+                      "outputFormat": "singleStringText",  # keep trying this format
+                      "maxRetries": 5,
+                      "channelHandleBoolean": False,
+                      "channelNameBoolean": False,
+                      "datePublishedBoolean": False,
+                      "relativeDateTextBoolean": False }
+        logger.debug(f"{log_prefix} Using YouTube-specific input: { {k: v for k, v in run_input.items() if k != 'urls'} }")  # Don't log URL twice
+    elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
+        run_input = { "urls": [url] }
+        logger.debug(f"{log_prefix} Using simplified input for Text Scraper: {run_input}")
+    # Add other actor-specific input adjustments here if necessary
+
     headers = {"Content-Type": "application/json"}
     try:
         async with httpx.AsyncClient(timeout=120.0) as client:
+            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url}")
             response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
+            logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")

             if response.status_code in [200, 201]:
                 if response.status_code == 201:
+                    logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")

                 try:
                     results = response.json(); content = None
                     if isinstance(results, list) and len(results) > 0:
                         item = results[0]
+                        # Optional: Add debug log here again if the next fix doesn't work
+                        # logger.debug(f"{log_prefix} [DEBUG] Full item received: {json.dumps(item, indent=2)}")
                         content = None # Reset content

+                        # --- REFINED PARSING LOGIC ---
                         if "text" in item and isinstance(item["text"], str):
+                            logger.info(f"{log_prefix} Found text content in 'text' field.")
                             content = item["text"]
                         elif "content" in item and isinstance(item["content"], str):
+                            logger.info(f"{log_prefix} Found text content in 'content' field.")
                             content = item["content"]
                         elif "markdown" in item and isinstance(item["markdown"], str):
+                            logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                             content = item["markdown"]
                         elif "captions" in item and isinstance(item["captions"], str):
+                            # This handles if outputFormat=singleStringText actually worked
+                            logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                             content = item["captions"]
+                        # --- ADDED LIST HANDLING FOR CAPTIONS ---
+                        elif "captions" in item and isinstance(item["captions"], list):
+                            logger.info(f"{log_prefix} Found 'captions' field as a list. Attempting to extract text.")
+                            transcript_parts = []
+                            for segment in item["captions"]:
+                                if isinstance(segment, dict) and "text" in segment and isinstance(segment["text"], str):
+                                    transcript_parts.append(segment["text"])
+                                elif isinstance(segment, str): # Handle if it's sometimes just a list of strings
+                                    transcript_parts.append(segment)
+                            if transcript_parts:
+                                content = " ".join(transcript_parts).strip()
+                                logger.info(f"{log_prefix} Successfully extracted transcript from list in 'captions'. Combined length: {len(content)}")
+                            else:
+                                logger.warning(f"{log_prefix} 'captions' field was a list but contained no usable text segments.")
+                        # --- END LIST HANDLING ---
                         elif "html" in item and isinstance(item["html"], str):
                             logger.warning(f"{log_prefix} No direct text/markdown/captions found, attempting to parse 'html' from result.")
+                            def parse_html_sync(html_str): # Define sync function for threading
                                 try:
                                     soup = BeautifulSoup(html_str, DEFAULT_PARSER)
                                     return " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                                 except Exception as e:
                                     logger.error(f"{log_prefix} Error parsing HTML in thread: {e}")
                                     return None
+                            content = await asyncio.to_thread(parse_html_sync, item["html"]) # Run in thread

+                        # --- FINAL CONTENT CHECK ---
                         if content and isinstance(content, str) and len(content) > 30:
+                            logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
                             return content.strip()
                         else:
+                            # Log failure after trying all parsing methods
                             content_len = len(content) if content and isinstance(content, str) else 0
                             item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
+                            logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
+                            return None # Return None if no valid content found
                     else:
                         # Handle empty dataset list '[]'
+                        logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty for {url}. Response: {results}")
                         return None
                     # --- End of success processing logic ---
                 except json.JSONDecodeError:
+                    logger.error(f"{log_prefix} Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}")
                     return None
                 except Exception as e:
+                    logger.error(f"{log_prefix} Error processing success response (Status {response.status_code}) for {url}: {e}", exc_info=True)
                     return None
+            # Error handling for API call itself
+            elif response.status_code == 400: logger.error(f"{log_prefix} Bad Request (400) for {url}. Check run_input. Resp:{response.text[:200]}"); return None
+            elif response.status_code == 401: logger.error(f"{log_prefix} Auth error (401). Check token."); return None
+            elif response.status_code == 404: logger.error(f"{log_prefix} Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
+            else:
+                logger.error(f"{log_prefix} Unexpected status {response.status_code} for {url}. Resp:{response.text[:200]}")
                 return None
+    # Error handling for network/client issues
+    except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
+    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None # Should be caught by the status checks above, but kept as a safeguard
+    except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
+    except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
+
+# --- Important Note on Calling This Function ---
+# Make sure that when `get_youtube_transcript` calls `get_transcript_via_apify`,
+# it correctly passes the `video_url`. And if you refactor `get_transcript_via_apify`
+# to use `_run_apify_actor_for_web_content` directly, ensure the correct Apify Actor ID
+# and the `video_url` are passed.
+
+# Example refactor of get_transcript_via_apify (if you choose to do this):
+# async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
+#     """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
+#     global APIFY_ACTOR_ID
+#     # Note: The run_input logic specific to the YT actor is now inside _run_apify_actor_for_web_content
+#     return await _run_apify_actor_for_web_content(
+#         url=video_url, # Pass video_url as the 'url' parameter
+#         api_token=api_token,
+#         actor_id=APIFY_ACTOR_ID,
+#         actor_name="Apify YT" # Use specific name for logging
+#     )

 async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
     """Fallback 4: Fetches website content using Apify Website Content Crawler."""
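To sanity-check the refined parsing in isolation, the two `captions` branches can be mirrored as a standalone helper. This is a minimal sketch: `extract_captions` and the sample item shapes below are introduced here for illustration and are not confirmed actor payloads.

from typing import Optional

def extract_captions(item: dict) -> Optional[str]:
    # Mirrors the diff's string branch and the added list-handling branch.
    captions = item.get("captions")
    if isinstance(captions, str):  # outputFormat=singleStringText worked
        return captions.strip() or None
    if isinstance(captions, list):  # segment dicts, or plain strings
        parts = []
        for segment in captions:
            if isinstance(segment, dict) and isinstance(segment.get("text"), str):
                parts.append(segment["text"])
            elif isinstance(segment, str):
                parts.append(segment)
        return " ".join(parts).strip() or None
    return None

# Both hypothetical shapes reduce to the same transcript:
assert extract_captions({"captions": [{"text": "hello"}, {"text": "world"}]}) == "hello world"
assert extract_captions({"captions": ["hello", "world"]}) == "hello world"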
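The delegation pattern sketched in the commented refactor would also fit the crawler fallback that follows. A minimal sketch, assuming the Website Content Crawler's ID is held in a constant named APIFY_CRAWLER_ACTOR_ID (the actual constant name in main.py may differ); the generic helper's default startUrls/maxCrawlPages input already matches this actor:

APIFY_CRAWLER_ACTOR_ID = "apify~website-content-crawler"  # assumed constant and actor ID; verify against main.py

async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
    """Fallback 4: Fetches website content using Apify Website Content Crawler."""
    return await _run_apify_actor_for_web_content(
        url=url,
        api_token=api_token,
        actor_id=APIFY_CRAWLER_ACTOR_ID,
        actor_name="Apify Crawler",  # appears in log prefixes
    )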