import json
import time

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter

# Define function for scraping with Crawl4AI
async def trigger_scraping_channels(channel_urls, num_of_posts, start_date, end_date, order_by, country):
    """
    Trigger scraping for multiple channel URLs with Crawl4AI.
    """
    browser_config = BrowserConfig(headless=True, verbose=True)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        markdown_generator=None,  # Optionally, use a Markdown generator if needed
        content_filter=PruningContentFilter(threshold=0.5, threshold_type="fixed", min_word_threshold=0),
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = []
        for url in channel_urls:
            result = await crawler.arun(url=url, config=run_config)
            results.append(result.markdown)
        return results

# Function to get the progress of the scraping task
async def get_progress(snapshot_id):
    """
    Get the progress of the scraping task.
    """
    return {"status": "ready", "snapshot_id": snapshot_id}

# Function to get the output of the scraping task
async def get_output(snapshot_id, format="json"):
    """
    Get the output of the scraping task.
    """
    # Assuming we fetch the output after scraping and convert to JSON
    return [{"url": "https://example.com", "shortcode": "abc123", "formatted_transcript": []}]
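
# --- Usage sketch (not part of the original script) ---
# A minimal example of driving the helpers above end to end. The channel
# URL, snapshot ID, date range, and other arguments are placeholder
# assumptions; swap in real values before running.
import asyncio

async def main():
    channel_urls = ["https://www.youtube.com/@example"]  # hypothetical input
    pages = await trigger_scraping_channels(
        channel_urls,
        num_of_posts=10,
        start_date="2025-01-01",
        end_date="2025-01-31",
        order_by="date",
        country="US",
    )
    for markdown in pages:
        print(markdown[:200])  # preview the first 200 characters of each page

    # The progress/output helpers are stubs here, so this simply exercises
    # their interfaces with a made-up snapshot ID.
    status = await get_progress(snapshot_id="demo-snapshot")
    if status["status"] == "ready":
        records = await get_output(snapshot_id="demo-snapshot", format="json")
        print(records)

if __name__ == "__main__":
    asyncio.run(main())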