Update app.py
app.py CHANGED
@@ -9,16 +9,10 @@ subprocess.run(
 )
 
 import asyncio
-import gradio as gr
-import json
 import os
-import re
+import json
+import gradio as gr
 from playwright.async_api import async_playwright
-from tqdm import tqdm
-from dotenv import load_dotenv
-
-# Load .env if it exists (local only)
-load_dotenv()
 
 USER_AGENT = (
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -26,20 +20,8 @@ USER_AGENT = (
     "Chrome/91.0.4472.124 Safari/537.36"
 )
 
-
-
-cookies_str = os.getenv("COOKIES_JSON", "[]")
-cookies = json.loads(cookies_str)
-for cookie in cookies:
-    cookie["sameSite"] = cookie.get("sameSite", "None").capitalize()
-return cookies
-
-# 🤖 Detect whether the input is a URL or a category
-def es_url(input_str):
-    return re.match(r"^https?://", input_str.strip()) is not None
-
-# 🧩 Main scraping routine
-async def scrape_deviantart(input_value, max_imgs=30, cookies=None):
+async def scrape_images(url, max_imgs):
+    max_imgs = max(10, min(max_imgs, 300))
     async with async_playwright() as p:
         browser = await p.chromium.launch(headless=True)
         context = await browser.new_context(
@@ -47,8 +29,17 @@ async def scrape_deviantart(input_value, max_imgs=30, cookies=None):
             viewport={"width": 1366, "height": 768},
         )
 
-
-
+        # Load cookies from the environment variable if it is set
+        cookies_env = os.getenv("COOKIES_JSON")
+        if cookies_env:
+            try:
+                cookies = json.loads(cookies_env)
+                for cookie in cookies:
+                    cookie["sameSite"] = "None" if cookie.get("sameSite") is None else cookie["sameSite"].capitalize()
+                await context.add_cookies(cookies)
+                print("✅ Cookies cargadas desde variable de entorno")
+            except Exception as e:
+                print(f"⚠️ Error cargando cookies desde variable de entorno: {e}")
 
         page = await context.new_page()
         await page.set_extra_http_headers({
@@ -57,7 +48,6 @@ async def scrape_deviantart(input_value, max_imgs=30, cookies=None):
             "Referer": "https://www.deviantart.com/",
         })
 
-        url = input_value if es_url(input_value) else f"https://www.deviantart.com/search?q={input_value}"
         await page.goto(url, timeout=60000)
         await page.wait_for_load_state("networkidle")
 
@@ -78,25 +68,35 @@ async def scrape_deviantart(input_value, max_imgs=30, cookies=None):
             seen_urls.add(item["img_url"])
 
             if len(collected_data) < max_imgs:
-                await page.evaluate("window.scrollBy({top: window.innerHeight * 1.5, behavior: 'smooth'});")
+                await page.evaluate("""window.scrollBy({top: window.innerHeight * 1.5, behavior: 'smooth'});""")
                 await page.wait_for_timeout(3000)
                 scroll_attempts += 1
+                try:
+                    await page.wait_for_selector('img[srcset]:not([data-loaded])', timeout=5000)
+                except:
+                    pass
 
         await browser.close()
-        return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    return collected_data[:max_imgs]
+
+def run_scraper(url, max_imgs):
+    return asyncio.run(scrape_images(url, int(max_imgs)))
+
+def interface_fn(url, max_imgs):
+    results = run_scraper(url, max_imgs)
+    images = [(item["img_url"], f"Usuario: {item['user']}") for item in results]
+    return images
+
+demo = gr.Interface(
+    fn=interface_fn,
+    inputs=[
+        gr.Textbox(label="URL de la galería DeviantArt", lines=1, value="https://www.deviantart.com/silkedead/gallery/68498591/screenshots-film-and-movie"),
+        gr.Slider(minimum=10, maximum=300, step=1, value=30, label="Máximo de imágenes")
+    ],
+    outputs=gr.Gallery(label="Imágenes recolectadas").style(grid=[3], height="auto"),
+    title="Scraper de Imágenes - DeviantArt",
+    description="Introduce la URL de la galería DeviantArt y la cantidad máxima de imágenes que quieres recolectar."
+)
+
+if __name__ == "__main__":
+    demo.launch()
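The rewritten cookie loader expects `COOKIES_JSON` to hold a JSON array of Playwright cookie objects. As a reference, a minimal sketch of a value it would accept follows; the cookie name and value are placeholders rather than real DeviantArt session cookies, and Playwright's `context.add_cookies` requires each entry to carry `name`, `value`, and either `url` or a `domain`/`path` pair:

import json
import os

# Placeholder cookie: substitute the real session cookie(s) exported from a
# logged-in browser. "sameSite" may be lower-case here, since the loader
# above normalizes it with .capitalize().
os.environ["COOKIES_JSON"] = json.dumps([
    {
        "name": "auth",             # placeholder cookie name
        "value": "<session-token>", # placeholder value
        "domain": ".deviantart.com",
        "path": "/",
        "secure": True,
        "sameSite": "none",
    }
])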
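For a quick local check without the Gradio UI, a small driver script can call `scrape_images` directly. This is only a sketch: it assumes the code above is saved as `app.py` in the same directory and that Chromium has been installed for Playwright (`playwright install chromium`):

import asyncio

from app import scrape_images  # assumes the code above lives in app.py

async def main():
    url = "https://www.deviantart.com/silkedead/gallery/68498591/screenshots-film-and-movie"
    items = await scrape_images(url, max_imgs=10)
    # Each item is a dict carrying at least the "img_url" and "user" keys
    # that interface_fn reads.
    for item in items:
        print(item["user"], item["img_url"])

if __name__ == "__main__":
    asyncio.run(main())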