Emilianohack6950 committed on
Commit
acb9a1b
·
verified ·
1 Parent(s): b068cc7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# --- Runtime environment bootstrap -------------------------------------------
# Install the Playwright browsers plus the system libraries Chromium needs.
# NOTE(review): running installers at import time looks like a Hugging Face
# Spaces workaround; in a normal deployment this belongs in the build step.
import subprocess

# check=True on every install step so a failure aborts startup immediately,
# instead of surfacing later as a confusing "browser not found" error inside
# scrape_images(). (Previously only the apt-get call was checked.)
subprocess.run(["playwright", "install"], check=True)
subprocess.run(["playwright", "install-deps"], check=True)
subprocess.run(
    "apt-get update && apt-get install -y libnss3 libnspr4 libatk1.0-0 "
    "libatk-bridge2.0-0 libcups2 libatspi2.0-0 libxcomposite1 libxdamage1",
    shell=True,  # the shell is needed for the '&&' chaining of the two commands
    check=True,
)

import asyncio
import os
import json
import gradio as gr
from playwright.async_api import async_playwright
from urllib.parse import quote_plus

# Desktop Chrome user agent so DeviantArt serves the regular site layout
# rather than a bot/mobile variant.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)
def build_url(input_str):
    """Turn user input into a DeviantArt URL.

    Input that already looks like a full http(s) URL is returned unchanged
    (after trimming surrounding whitespace); anything else is treated as a
    search term and URL-encoded into a DeviantArt search query.
    """
    text = input_str.strip()
    if text.startswith(("http://", "https://")):
        return text
    return f"https://www.deviantart.com/search?q={quote_plus(text)}"
async def scrape_images(url, max_imgs):
    """Scrape image URLs (and the alt-text "user" label) from a DeviantArt page.

    Opens the page in headless Chromium, optionally injects session cookies
    from the COOKIES_JSON environment variable, then scrolls repeatedly until
    ``max_imgs`` unique images are collected or the scroll budget runs out.

    Returns a list of ``{"img_url": str, "user": str}`` dicts, at most
    ``max_imgs`` long.
    """
    # Clamp to the range the UI slider advertises (10..300).
    max_imgs = max(10, min(max_imgs, 300))
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent=USER_AGENT,
            viewport={"width": 1366, "height": 768},
        )

        # Optional logged-in session: COOKIES_JSON holds a JSON array of
        # cookie dicts (browser-export format — TODO confirm exact schema).
        cookies_env = os.getenv("COOKIES_JSON")
        if cookies_env:
            try:
                cookies = json.loads(cookies_env)
                for cookie in cookies:
                    # Playwright expects sameSite in {"Strict","Lax","None"};
                    # normalize the casing of whatever the export produced.
                    cookie["sameSite"] = "None" if cookie.get("sameSite") is None else cookie["sameSite"].capitalize()
                await context.add_cookies(cookies)
                print("✅ Cookies cargadas desde variable de entorno")
            except Exception as e:
                # Best effort: scraping still works without cookies, just
                # without logged-in content.
                print(f"⚠️ Error cargando cookies desde variable de entorno: {e}")

        page = await context.new_page()
        # Browser-like headers to reduce the chance of bot blocking.
        await page.set_extra_http_headers({
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://www.deviantart.com/",
        })

        await page.goto(url, timeout=60000)
        await page.wait_for_load_state("networkidle")

        collected_data = []
        seen_urls = set()  # dedupe across scroll passes
        scroll_attempts = 0
        max_scroll_attempts = 30  # hard cap so a stalled page cannot loop forever

        while len(collected_data) < max_imgs and scroll_attempts < max_scroll_attempts:
            # The last srcset entry is taken as the highest-resolution candidate.
            new_items = await page.evaluate("""() => Array.from(document.querySelectorAll('img[srcset]')).map(img => ({
                img_url: img.srcset.split(', ').pop().split(' ')[0],
                user: img.alt || "Desconocido"
            }))""")

            for item in new_items:
                if item["img_url"] not in seen_urls:
                    collected_data.append(item)
                    seen_urls.add(item["img_url"])

            if len(collected_data) < max_imgs:
                await page.evaluate("""window.scrollBy({top: window.innerHeight * 1.5, behavior: 'smooth'});""")
                await page.wait_for_timeout(3000)
                scroll_attempts += 1
                try:
                    # Give lazy-loaded images a moment to appear; a timeout
                    # just means nothing new showed up before the next scroll.
                    await page.wait_for_selector('img[srcset]:not([data-loaded])', timeout=5000)
                except Exception:
                    # BUGFIX: was a bare `except:`, which would also swallow
                    # KeyboardInterrupt/SystemExit raised during the wait.
                    pass

        await browser.close()
        return collected_data[:max_imgs]
def run_scraper(user_input, max_imgs):
    """Resolve the user's input to a URL and run the async scraper to completion."""
    target = build_url(user_input)
    print(f"Usando URL: {target}")
    limit = int(max_imgs)  # Gradio sliders may deliver floats
    return asyncio.run(scrape_images(target, limit))
def interface_fn(user_input, max_imgs):
    """Gradio adapter: map scraper result dicts to (image_url, caption) pairs."""
    gallery_items = []
    for entry in run_scraper(user_input, max_imgs):
        gallery_items.append((entry["img_url"], f"Usuario: {entry['user']}"))
    return gallery_items
# Gradio UI definition.
# BUGFIX: Gallery.style(grid=..., height=...) was deprecated in Gradio 3.x and
# removed in 4.x (it raises AttributeError there); the layout settings now go
# on the Gallery constructor as columns=/height= keyword arguments.
demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.Textbox(label="URL o Categoría DeviantArt", lines=1,
                   placeholder="Pega una URL o escribe una categoría o usuario"),
        gr.Slider(minimum=10, maximum=300, step=1, value=30, label="Máximo de imágenes")
    ],
    outputs=gr.Gallery(label="Imágenes recolectadas", columns=3, height="auto"),
    title="Scraper de Imágenes - DeviantArt",
    description="Introduce una URL completa o solo una categoría/usuario para buscar imágenes."
)

if __name__ == "__main__":
    demo.launch()