# Install the Playwright browsers and the Debian libraries Chromium needs at startup
# (useful on hosts such as Hugging Face Spaces without a custom Docker image).
import subprocess

subprocess.run(["playwright", "install"])
subprocess.run(["playwright", "install-deps"])
subprocess.run(
    "apt-get update && apt-get install -y libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libatspi2.0-0 libxcomposite1 libxdamage1",
    shell=True,
    check=True,
)
import asyncio
import json
import os
from urllib.parse import quote_plus

import gradio as gr
from playwright.async_api import async_playwright
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)
def build_url(input_str):
    """Return the input unchanged if it is already a URL; otherwise treat it as a search query."""
    input_str = input_str.strip()
    if input_str.startswith("http://") or input_str.startswith("https://"):
        return input_str
    else:
        categoria = quote_plus(input_str)
        return f"https://www.deviantart.com/search?q={categoria}"
async def scrape_images(url, max_imgs):
    # Clamp the requested count to the range exposed by the UI slider.
    max_imgs = max(10, min(max_imgs, 300))
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent=USER_AGENT,
            viewport={"width": 1366, "height": 768},
        )
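        # Sketch of the expected COOKIES_JSON value: an assumption based on Playwright's
        # add_cookies() API, which takes a list of cookie dicts. Adjust to your exported cookies:
        #   [{"name": "auth", "value": "...", "domain": ".deviantart.com",
        #     "path": "/", "sameSite": "Lax"}, ...]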
cookies_env = os.getenv("COOKIES_JSON") | |
if cookies_env: | |
try: | |
cookies = json.loads(cookies_env) | |
for cookie in cookies: | |
cookie["sameSite"] = "None" if cookie.get("sameSite") is None else cookie["sameSite"].capitalize() | |
await context.add_cookies(cookies) | |
print("✅ Cookies cargadas desde variable de entorno") | |
except Exception as e: | |
print(f"⚠️ Error cargando cookies desde variable de entorno: {e}") | |
        page = await context.new_page()
        await page.set_extra_http_headers({
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://www.deviantart.com/",
        })
        await page.goto(url, timeout=60000)
        await page.wait_for_load_state("networkidle")
        collected_data = []
        seen_urls = set()
        scroll_attempts = 0
        max_scroll_attempts = 30

        # Scroll and collect until enough images are gathered or the attempt limit is hit.
        while len(collected_data) < max_imgs and scroll_attempts < max_scroll_attempts:
            # The last srcset candidate is typically the highest-resolution URL.
            new_items = await page.evaluate("""() => Array.from(document.querySelectorAll('img[srcset]')).map(img => ({
                img_url: img.srcset.split(', ').pop().split(' ')[0],
                user: img.alt || "Unknown"
            }))""")
            for item in new_items:
                if item["img_url"] not in seen_urls:
                    collected_data.append(item)
                    seen_urls.add(item["img_url"])
            if len(collected_data) < max_imgs:
                await page.evaluate("""window.scrollBy({top: window.innerHeight * 1.5, behavior: 'smooth'});""")
                await page.wait_for_timeout(3000)
                scroll_attempts += 1
                try:
                    # Give freshly lazy-loaded images a chance to appear before the next pass.
                    await page.wait_for_selector('img[srcset]:not([data-loaded])', timeout=5000)
                except Exception:
                    pass

        await browser.close()
        return collected_data[:max_imgs]
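# Quick local check (a sketch, assuming Playwright's Chromium is already installed):
#   data = asyncio.run(scrape_images("https://www.deviantart.com/search?q=landscape", 20))
#   print(len(data), data[:2])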
def run_scraper(user_input, max_imgs):
    # asyncio.run() creates its own event loop; this works because Gradio calls
    # sync handlers in a worker thread rather than on a running loop.
    url = build_url(user_input)
    print(f"Using URL: {url}")
    return asyncio.run(scrape_images(url, int(max_imgs)))
def interface_fn(user_input, max_imgs):
    results = run_scraper(user_input, max_imgs)
    images = [(item["img_url"], f"User: {item['user']}") for item in results]
    return images
demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.Textbox(label="DeviantArt URL or category", lines=1,
                   placeholder="Paste a URL or type a category or username"),
        gr.Slider(minimum=10, maximum=300, step=1, value=30, label="Maximum number of images")
    ],
    outputs=gr.Gallery(label="Collected images"),
    title="Image Scraper - DeviantArt",
    description="Enter a full URL or just a category/username to search for images."
)
if __name__ == "__main__":
    demo.launch()
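# Note: plain demo.launch() is enough for local use and for Hugging Face Spaces.
# On other remote hosts you may need demo.launch(server_name="0.0.0.0") so the app
# binds to all interfaces (an assumption about your deployment, not part of the source).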