Create app.py

app.py (ADDED)
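# Bootstrap: install Playwright's Chromium and the shared libraries headless
# Chromium needs at container start; a Space has no separate build step for this.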
import subprocess

subprocess.run(["playwright", "install"])
subprocess.run(["playwright", "install-deps"])
subprocess.run(
    "apt-get update && apt-get install -y libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libatspi2.0-0 libxcomposite1 libxdamage1",
    shell=True,
    check=True,
)
import asyncio
import os
import json
import gradio as gr
from playwright.async_api import async_playwright
from urllib.parse import quote_plus

# Present a desktop Chrome user agent so the site serves its normal markup.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)

def build_url(input_str):
    # Accept either a full URL or a bare search term; bare terms are turned
    # into a DeviantArt search URL.
    input_str = input_str.strip()
    if input_str.startswith("http://") or input_str.startswith("https://"):
        return input_str
    else:
        categoria = quote_plus(input_str)
        return f"https://www.deviantart.com/search?q={categoria}"

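# Core scraper: drives headless Chromium against the target page and collects
# up to max_imgs image URLs, reading the artist name from each img's alt text.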
async def scrape_images(url, max_imgs):
    # Clamp the requested count to the slider's 10-300 range.
    max_imgs = max(10, min(max_imgs, 300))
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent=USER_AGENT,
            viewport={"width": 1366, "height": 768},
        )

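        # Optional login: COOKIES_JSON may hold a JSON array of Playwright
        # cookie objects, e.g. (hypothetical values):
        #   [{"name": "auth", "value": "...", "domain": ".deviantart.com",
        #     "path": "/", "sameSite": "lax"}]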
        cookies_env = os.getenv("COOKIES_JSON")
        if cookies_env:
            try:
                cookies = json.loads(cookies_env)
                for cookie in cookies:
                    # Playwright only accepts "Strict", "Lax" or "None" here.
                    cookie["sameSite"] = "None" if cookie.get("sameSite") is None else cookie["sameSite"].capitalize()
                await context.add_cookies(cookies)
                print("✅ Cookies loaded from environment variable")
            except Exception as e:
                print(f"⚠️ Error loading cookies from environment variable: {e}")

        page = await context.new_page()
        await page.set_extra_http_headers({
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://www.deviantart.com/",
        })

        await page.goto(url, timeout=60000)
        await page.wait_for_load_state("networkidle")

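        # Infinite scroll: keep harvesting srcset URLs until enough images are
        # collected or the page stops yielding new ones.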
        collected_data = []
        seen_urls = set()
        scroll_attempts = 0
        max_scroll_attempts = 30

        while len(collected_data) < max_imgs and scroll_attempts < max_scroll_attempts:
            # The last srcset candidate is the highest-resolution variant.
            new_items = await page.evaluate("""() => Array.from(document.querySelectorAll('img[srcset]')).map(img => ({
                img_url: img.srcset.split(', ').pop().split(' ')[0],
                user: img.alt || "Unknown"
            }))""")

            for item in new_items:
                if item["img_url"] not in seen_urls:
                    collected_data.append(item)
                    seen_urls.add(item["img_url"])

            if len(collected_data) < max_imgs:
                await page.evaluate("""window.scrollBy({top: window.innerHeight * 1.5, behavior: 'smooth'});""")
                await page.wait_for_timeout(3000)
                scroll_attempts += 1
                try:
                    # Give lazily loaded images a moment to attach.
                    await page.wait_for_selector('img[srcset]:not([data-loaded])', timeout=5000)
                except Exception:
                    pass

        await browser.close()
        return collected_data[:max_imgs]

def run_scraper(user_input, max_imgs):
    url = build_url(user_input)
    print(f"Using URL: {url}")
    return asyncio.run(scrape_images(url, int(max_imgs)))

def interface_fn(user_input, max_imgs):
    results = run_scraper(user_input, max_imgs)
    # gr.Gallery accepts (image, caption) pairs.
    images = [(item["img_url"], f"User: {item['user']}") for item in results]
    return images

demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.Textbox(label="DeviantArt URL or category", lines=1,
                   placeholder="Paste a URL or type a category or username"),
        gr.Slider(minimum=10, maximum=300, step=1, value=30, label="Maximum images")
    ],
    # .style() was removed in newer Gradio releases; columns/height are
    # constructor arguments on Gallery now.
    outputs=gr.Gallery(label="Collected images", columns=3, height="auto"),
    title="Image Scraper - DeviantArt",
    description="Enter a full URL or just a category/username to search for images."
)

if __name__ == "__main__":
    demo.launch()
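
# Assumption: the Space installs the Python packages themselves (gradio,
# playwright) via requirements.txt; the subprocess calls at the top only add
# Chromium and its system libraries.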