import json
import time

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


async def trigger_scraping_channels(channel_urls, num_of_posts, start_date, end_date, order_by, country):
    """
    Trigger scraping for multiple channel URLs with Crawl4AI.

    num_of_posts, start_date, end_date, order_by, and country are accepted for
    interface compatibility but are not applied to the crawl yet.
    """
    browser_config = BrowserConfig(headless=True, verbose=True)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        # Content filtering is configured through the markdown generator so the
        # pruning filter is applied when markdown is produced for each page.
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(threshold=0.5, threshold_type="fixed", min_word_threshold=0)
        ),
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = []
        for url in channel_urls:
            result = await crawler.arun(
                url=url,
                config=run_config,
            )
            results.append(result.markdown)

    return results


async def get_progress(snapshot_id):
    """
    Get the progress of the scraping task.

    Stub implementation: Crawl4AI returns results directly, so the task is
    always reported as ready.
    """
    return {"status": "ready", "snapshot_id": snapshot_id}


async def get_output(snapshot_id, format="json"):
    """
    Get the output of the scraping task.

    Stub implementation that returns static sample data in the expected
    output shape.
    """
    return [{"url": "https://example.com", "shortcode": "abc123", "formatted_transcript": []}]
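

# A minimal usage sketch, assuming this module is run directly. The channel URL
# below is a hypothetical placeholder, and the extra arguments are passed only
# to satisfy the trigger_scraping_channels() signature.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        pages = await trigger_scraping_channels(
            channel_urls=["https://www.youtube.com/@example"],  # hypothetical URL
            num_of_posts=10,
            start_date=None,
            end_date=None,
            order_by=None,
            country=None,
        )
        print(f"Scraped {len(pages)} page(s) of markdown")

    asyncio.run(_demo())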