import asyncio
import json
import logging
import re
from datetime import datetime
from typing import Dict, List, Optional

from tqdm import tqdm

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class YouTubeScraper:
    # CSS extraction schema consumed by crawl4ai's JsonCssExtractionStrategy.
    # Selectors target the video cards (ytd-rich-item-renderer) on a channel
    # page; YouTube markup changes often, so expect to update these.
    YOUTUBE_SCHEMA = {
        "name": "YouTubeVideoData",
        "baseSelector": "ytd-rich-item-renderer",
        "fields": [
            {"name": "title", "selector": "#video-title", "type": "text"},
            # The link is read from the href attribute of the title anchor.
            {"name": "url", "selector": "#video-title", "type": "attribute", "attribute": "href"},
            {"name": "views", "selector": "#metadata-line span:first-child", "type": "text"},
            {"name": "upload_date", "selector": "#metadata-line span:last-child", "type": "text"},
            {"name": "transcript", "selector": "#segments-container", "type": "text"}
        ]
    }

    def __init__(self):
        self.crawler = AsyncWebCrawler(
            headless=True,
            browser="chromium",
            stealth=True,
            timeout=60
        )

    async def scrape_channel(self, url: str, start_date: str, end_date: str, max_videos: int = 10):
        """Scrape and process YouTube channel content."""
        try:
            logger.info(f"Scraping channel: {url}")
            result = await self.crawler.arun(
                url=url,
                extraction_strategy=JsonCssExtractionStrategy(self.YOUTUBE_SCHEMA),
                wait_for="css:#video-title"  # wait until at least one video title is rendered
            )
            # crawl4ai returns the extracted records as a JSON string in extracted_content.
            raw_data = json.loads(result.extracted_content) if result.extracted_content else []
            return self._process_results(raw_data, start_date, end_date, max_videos)
        except Exception as e:
            logger.error(f"Failed to scrape {url}: {str(e)}")
            return []

    def _process_results(self, raw_data: List[Dict], start_date: str, end_date: str, max_videos: int):
        """Process and filter scraped data."""
        processed = []
        date_format = "%b %d, %Y"
        start = datetime.strptime(start_date, "%Y-%m-%d")
        end = datetime.strptime(end_date, "%Y-%m-%d")

        for item in raw_data[:max_videos]:
            try:
                if not item.get("url"):
                    continue

                upload_date = datetime.strptime(item["upload_date"], date_format)
                if not (start <= upload_date <= end):
                    continue

                processed.append({
                    "id": self._extract_video_id(item["url"]),
                    "title": item.get("title", "Untitled"),
                    "url": f"https://youtube.com{item['url']}",
                    "views": self._parse_views(item.get("views", "0")),
                    "upload_date": upload_date.strftime("%Y-%m-%d"),
                    "transcript": self._process_transcript(item.get("transcript", ""))
                })
            except Exception as e:
                logger.warning(f"Skipping invalid video data: {str(e)}")

        return processed

    @staticmethod
    def _parse_views(views_str: str) -> int:
        """Convert a view-count string (e.g. '12,345 views') to an integer."""
        if not views_str:
            return 0
        # Strip everything but digits; strings like "No views" yield 0 instead of raising.
        digits = re.sub(r"[^\d]", "", views_str.split()[0])
        return int(digits) if digits else 0

    @staticmethod
    def _process_transcript(raw: str) -> List[Dict]:
        """Structure raw transcript text into fixed five-second segments."""
        # Timing is approximated: each line gets a synthetic 5-second window.
        return [{
            "start": i * 5,
            "end": (i + 1) * 5,
            "text": line.strip()
        } for i, line in enumerate(raw.split("\n") if raw else [])]

    @staticmethod
    def _extract_video_id(url: str) -> Optional[str]:
        """Extract the 11-character YouTube video ID from a watch URL."""
        match = re.search(r"v=([a-zA-Z0-9_-]{11})", url)
        return match.group(1) if match else None

async def scrape_multiple_channels(urls: List[str], start_date: str, end_date: str, num_videos: int = 10):
    """Scrape multiple YouTube channels with progress tracking."""
    scraper = YouTubeScraper()
    tasks = [scraper.scrape_channel(url, start_date, end_date, num_videos) for url in urls]

    # Note: as_completed yields results in completion order, not in the order of `urls`.
    with tqdm(total=len(tasks), desc="Processing Channels") as pbar:
        results = []
        for future in asyncio.as_completed(tasks):
            results.append(await future)
            pbar.update(1)

    return results
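
# Minimal usage sketch (not part of the original script): the channel URL and
# date window below are hypothetical placeholders to show how the entry point
# is meant to be called.
if __name__ == "__main__":
    example_channels = [
        "https://www.youtube.com/@SomeChannel/videos",  # hypothetical channel URL
    ]
    data = asyncio.run(
        scrape_multiple_channels(example_channels, "2024-01-01", "2024-12-31", num_videos=5)
    )
    for channel_videos in data:
        logger.info(f"Scraped {len(channel_videos)} videos")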