import subprocess

# Install the Playwright browsers and the system libraries they need at startup.
subprocess.run(["playwright", "install"], check=True)
subprocess.run(["playwright", "install-deps"], check=True)
subprocess.run(
    "apt-get update && apt-get install -y libnss3 libnspr4 libatk1.0-0 "
    "libatk-bridge2.0-0 libcups2 libatspi2.0-0 libxcomposite1 libxdamage1",
    shell=True,
    check=True,
)
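# NOTE: running these installs at import time is a common workaround on
# Hugging Face Spaces when system packages cannot be preinstalled via a Dockerfile.
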
import asyncio
import os
import json
import gradio as gr
from playwright.async_api import async_playwright
from urllib.parse import quote_plus

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)

def build_url(input_str):
    """Return the input unchanged if it is already a URL; otherwise build a DeviantArt search URL."""
    input_str = input_str.strip()
    if input_str.startswith(("http://", "https://")):
        return input_str
    categoria = quote_plus(input_str)
    return f"https://www.deviantart.com/search?q={categoria}"

async def scrape_images(url, max_imgs):
    # Clamp the requested count to the same 10-300 range as the UI slider.
    max_imgs = max(10, min(max_imgs, 300))
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent=USER_AGENT,
            viewport={"width": 1366, "height": 768},
        )
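        # COOKIES_JSON is assumed to hold a JSON array of Playwright-style cookie
        # dicts, e.g. (a hypothetical session export; real cookie names vary):
        # [{"name": "auth", "value": "<token>", "domain": ".deviantart.com",
        #   "path": "/", "secure": true, "sameSite": "Lax"}]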
        cookies_env = os.getenv("COOKIES_JSON")
        if cookies_env:
            try:
                cookies = json.loads(cookies_env)
                for cookie in cookies:
                    # Playwright only accepts "Strict", "Lax", or "None" here;
                    # normalize anything else (e.g. "no_restriction") to "None".
                    same_site = str(cookie.get("sameSite") or "None").capitalize()
                    cookie["sameSite"] = same_site if same_site in ("Strict", "Lax", "None") else "None"
                await context.add_cookies(cookies)
                print("✅ Cookies loaded from the environment variable")
            except Exception as e:
                print(f"⚠️ Error loading cookies from the environment variable: {e}")
        page = await context.new_page()
        await page.set_extra_http_headers({
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://www.deviantart.com/",
        })
        await page.goto(url, timeout=60000)
        await page.wait_for_load_state("networkidle")
        collected_data = []
        seen_urls = set()
        scroll_attempts = 0
        max_scroll_attempts = 30
        while len(collected_data) < max_imgs and scroll_attempts < max_scroll_attempts:
            # Collect every <img> that has a srcset, keeping its highest-resolution candidate.
            new_items = await page.evaluate("""() => Array.from(document.querySelectorAll('img[srcset]')).map(img => ({
                img_url: img.srcset.split(', ').pop().split(' ')[0],
                user: img.alt || "Unknown"
            }))""")
            for item in new_items:
                if item["img_url"] not in seen_urls:
                    collected_data.append(item)
                    seen_urls.add(item["img_url"])
            if len(collected_data) < max_imgs:
                # Scroll to trigger lazy loading, then give new thumbnails time to appear.
                await page.evaluate("""window.scrollBy({top: window.innerHeight * 1.5, behavior: 'smooth'});""")
                await page.wait_for_timeout(3000)
                scroll_attempts += 1
                try:
                    await page.wait_for_selector('img[srcset]:not([data-loaded])', timeout=5000)
                except Exception:
                    # No new images appeared within the timeout; keep scrolling anyway.
                    pass
        await browser.close()
        return collected_data[:max_imgs]
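
# Minimal standalone usage sketch (hypothetical query, run outside Gradio):
#   results = asyncio.run(scrape_images(build_url("fantasy art"), 20))
#   for item in results:
#       print(item["user"], item["img_url"])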

def run_scraper(user_input, max_imgs):
    url = build_url(user_input)
    print(f"Using URL: {url}")
    return asyncio.run(scrape_images(url, int(max_imgs)))

def interface_fn(user_input, max_imgs):
    results = run_scraper(user_input, max_imgs)
    # Gradio's Gallery accepts (image, caption) tuples.
    images = [(item["img_url"], f"User: {item['user']}") for item in results]
    return images

demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.Textbox(label="DeviantArt URL or Category", lines=1,
                   placeholder="Paste a URL or type a category or username"),
        gr.Slider(minimum=10, maximum=300, step=1, value=30, label="Maximum number of images")
    ],
    outputs=gr.Gallery(label="Collected images"),
    title="Image Scraper - DeviantArt",
    description="Enter a full URL or just a category/username to search for images."
)

if __name__ == "__main__":
    demo.launch()