Create crawl4ai_scrapper.py
crawl4ai_scrapper.py  ADDED  (+42 -0)
@@ -0,0 +1,42 @@
import json
import time
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter

# Define function for scraping with Crawl4AI
async def trigger_scraping_channels(channel_urls, num_of_posts, start_date, end_date, order_by, country):
    """
    Trigger scraping for multiple channel URLs with Crawl4AI.
    """
    browser_config = BrowserConfig(headless=True, verbose=True)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        markdown_generator=None,  # Optionally, use a Markdown generator if needed
        content_filter=PruningContentFilter(threshold=0.5, threshold_type="fixed", min_word_threshold=0),
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = []
        for url in channel_urls:
            result = await crawler.arun(
                url=url,
                config=run_config
            )
            results.append(result.markdown)

    return results

# Function to get the progress of the scraping task
async def get_progress(snapshot_id):
    """
    Get the progress of the scraping task.
    """
    return {"status": "ready", "snapshot_id": snapshot_id}

# Function to get the output of the scraping task
async def get_output(snapshot_id, format="json"):
    """
    Get the output of the scraping task.
    """
    # Assuming we fetch the output after scraping and convert to JSON
    return [{"url": "https://example.com", "shortcode": "abc123", "formatted_transcript": []}]