# Install Playwright browsers and the system libraries Chromium needs.
# Running this at import time is a common workaround on hosted platforms
# (such as Hugging Face Spaces) that have no separate build step.
import subprocess

subprocess.run(["playwright", "install"])
subprocess.run(["playwright", "install-deps"])
subprocess.run(
    "apt-get update && apt-get install -y libnss3 libnspr4 libatk1.0-0 "
    "libatk-bridge2.0-0 libcups2 libatspi2.0-0 libxcomposite1 libxdamage1",
    shell=True,
    check=True,
)

import asyncio
import json
import os
from urllib.parse import quote_plus

import gradio as gr
from playwright.async_api import async_playwright

# Desktop Chrome user agent used for every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)


def build_url(input_str):
    """Return the input unchanged if it is already a URL; otherwise build a DeviantArt search URL."""
    input_str = input_str.strip()
    if input_str.startswith(("http://", "https://")):
        return input_str
    categoria = quote_plus(input_str)
    return f"https://www.deviantart.com/search?q={categoria}"


async def scrape_images(url, max_imgs):
    # Clamp the requested count to the range exposed by the slider.
    max_imgs = max(10, min(max_imgs, 300))

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent=USER_AGENT,
            viewport={"width": 1366, "height": 768},
        )

        # Optionally load session cookies from the COOKIES_JSON environment
        # variable (a JSON array of Playwright-compatible cookie objects).
        cookies_env = os.getenv("COOKIES_JSON")
        if cookies_env:
            try:
                cookies = json.loads(cookies_env)
                for cookie in cookies:
                    # Playwright expects sameSite to be "Strict", "Lax" or "None".
                    cookie["sameSite"] = (
                        "None"
                        if cookie.get("sameSite") is None
                        else cookie["sameSite"].capitalize()
                    )
                await context.add_cookies(cookies)
                print("✅ Cookies loaded from environment variable")
            except Exception as e:
                print(f"⚠️ Error loading cookies from environment variable: {e}")

        page = await context.new_page()
        await page.set_extra_http_headers({
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://www.deviantart.com/",
        })
        await page.goto(url, timeout=60000)
        await page.wait_for_load_state("networkidle")

        collected_data = []
        seen_urls = set()
        scroll_attempts = 0
        max_scroll_attempts = 30

        # Scroll until enough images have been collected or the scroll limit is hit.
        while len(collected_data) < max_imgs and scroll_attempts < max_scroll_attempts:
            # Collect every <img> that has a srcset, keeping its largest source
            # and the alt text, which is used as the caption label.
            new_items = await page.evaluate(
                """() => Array.from(document.querySelectorAll('img[srcset]')).map(img => ({
                    img_url: img.srcset.split(', ').pop().split(' ')[0],
                    user: img.alt || "Unknown"
                }))"""
            )
            for item in new_items:
                if item["img_url"] not in seen_urls:
                    collected_data.append(item)
                    seen_urls.add(item["img_url"])

            if len(collected_data) < max_imgs:
                # Scroll down to trigger lazy loading, then give new images time to appear.
                await page.evaluate(
                    "window.scrollBy({top: window.innerHeight * 1.5, behavior: 'smooth'});"
                )
                await page.wait_for_timeout(3000)
                scroll_attempts += 1
                try:
                    await page.wait_for_selector('img[srcset]:not([data-loaded])', timeout=5000)
                except Exception:
                    pass

        await browser.close()
        return collected_data[:max_imgs]


def run_scraper(user_input, max_imgs):
    url = build_url(user_input)
    print(f"Using URL: {url}")
    return asyncio.run(scrape_images(url, int(max_imgs)))


def interface_fn(user_input, max_imgs):
    results = run_scraper(user_input, max_imgs)
    # The Gallery output accepts (image, caption) tuples.
    return [(item["img_url"], f"User: {item['user']}") for item in results]


demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.Textbox(
            label="DeviantArt URL or category",
            lines=1,
            placeholder="Paste a URL or type a category or username",
        ),
        gr.Slider(minimum=10, maximum=300, step=1, value=30, label="Maximum number of images"),
    ],
    outputs=gr.Gallery(label="Collected images"),
    title="Image Scraper - DeviantArt",
    description="Enter a full URL or just a category/username to search for images.",
)

if __name__ == "__main__":
    demo.launch()
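
# Usage sketch: COOKIES_JSON can carry cookies exported from a logged-in browser
# session so the scraper sees content that requires authentication. The cookie
# name and values below are placeholders, not real DeviantArt cookie fields:
#
#   COOKIES_JSON='[{"name": "example_cookie", "value": "...", "domain": ".deviantart.com", "path": "/"}]' python app.py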