Emilianohack6950 commited on
Commit
003f3bf
·
verified ·
1 Parent(s): 8d87719

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -43
app.py CHANGED
@@ -9,16 +9,10 @@ subprocess.run(
9
  )
10
 
11
  import asyncio
12
- import gradio as gr
13
- import json
14
  import os
15
- import re
 
16
  from playwright.async_api import async_playwright
17
- from tqdm import tqdm
18
- from dotenv import load_dotenv
19
-
20
- # Cargar .env si existe (solo en local)
21
- load_dotenv()
22
 
23
  USER_AGENT = (
24
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -26,20 +20,8 @@ USER_AGENT = (
26
  "Chrome/91.0.4472.124 Safari/537.36"
27
  )
28
 
29
# 🔐 Read session cookies from a secure environment variable.
def cargar_cookies_desde_env():
    """Load cookies from the COOKIES_JSON environment variable.

    Returns:
        list[dict]: Cookie dicts ready for Playwright's
            ``context.add_cookies()``. Each cookie's ``sameSite`` field is
            normalized to a capitalized value ("Lax", "Strict", "None"),
            defaulting to "None" when the key is absent or explicitly null.
    """
    cookies = json.loads(os.getenv("COOKIES_JSON", "[]"))
    for cookie in cookies:
        # .get() with a default is not enough: a cookie exported with an
        # explicit null sameSite returns None (the default is unused) and
        # None.capitalize() raises AttributeError — coerce falsy to "None".
        same_site = cookie.get("sameSite") or "None"
        cookie["sameSite"] = same_site.capitalize()
    return cookies
36
-
37
# 🤖 Decide whether the user supplied a full URL or a search category.
def es_url(input_str):
    """Return True when *input_str* (after trimming) begins with http:// or https://."""
    return input_str.strip().startswith(("http://", "https://"))
40
-
41
- # 🧩 Scraping principal
42
- async def scrape_deviantart(input_value, max_imgs=30, cookies=None):
43
  async with async_playwright() as p:
44
  browser = await p.chromium.launch(headless=True)
45
  context = await browser.new_context(
@@ -47,8 +29,17 @@ async def scrape_deviantart(input_value, max_imgs=30, cookies=None):
47
  viewport={"width": 1366, "height": 768},
48
  )
49
 
50
- if cookies:
51
- await context.add_cookies(cookies)
 
 
 
 
 
 
 
 
 
52
 
53
  page = await context.new_page()
54
  await page.set_extra_http_headers({
@@ -57,7 +48,6 @@ async def scrape_deviantart(input_value, max_imgs=30, cookies=None):
57
  "Referer": "https://www.deviantart.com/",
58
  })
59
 
60
- url = input_value if es_url(input_value) else f"https://www.deviantart.com/search?q={input_value}"
61
  await page.goto(url, timeout=60000)
62
  await page.wait_for_load_state("networkidle")
63
 
@@ -78,25 +68,35 @@ async def scrape_deviantart(input_value, max_imgs=30, cookies=None):
78
  seen_urls.add(item["img_url"])
79
 
80
  if len(collected_data) < max_imgs:
81
- await page.evaluate("window.scrollBy({top: window.innerHeight * 1.5, behavior: 'smooth'});")
82
  await page.wait_for_timeout(3000)
83
  scroll_attempts += 1
 
 
 
 
84
 
85
  await browser.close()
86
- return [item["img_url"] for item in collected_data[:max_imgs]]
87
-
88
# 🎛️ Synchronous bridge used by Gradio to drive the async scraper.
def interfaz_gradio(input_value, cantidad):
    """Scrape DeviantArt for *input_value*, capped at *cantidad* images."""
    env_cookies = cargar_cookies_desde_env()
    coro = scrape_deviantart(input_value, max_imgs=int(cantidad), cookies=env_cookies)
    return asyncio.run(coro)
92
-
93
# 🎨 Gradio front-end: query box + image-count slider feeding a gallery.
with gr.Blocks() as demo:
    gr.Markdown("## 🎨 Buscar Imágenes en DeviantArt")
    query_box = gr.Textbox(label="🔍 Categoría o URL personalizada")
    count_slider = gr.Slider(5, 100, step=5, value=20, label="📸 Cantidad de imágenes")
    search_btn = gr.Button("Buscar")
    results_gallery = gr.Gallery(label="Resultados")
    # Wire the button: (query, count) -> scraper callback -> gallery.
    search_btn.click(interfaz_gradio, [query_box, count_slider], results_gallery)

demo.launch()
 
 
 
 
 
 
 
9
  )
10
 
11
  import asyncio
 
 
12
  import os
13
+ import json
14
+ import gradio as gr
15
  from playwright.async_api import async_playwright
 
 
 
 
 
16
 
17
  USER_AGENT = (
18
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
 
20
  "Chrome/91.0.4472.124 Safari/537.36"
21
  )
22
 
23
+ async def scrape_images(url, max_imgs):
24
+ max_imgs = max(10, min(max_imgs, 300))
 
 
 
 
 
 
 
 
 
 
 
 
25
  async with async_playwright() as p:
26
  browser = await p.chromium.launch(headless=True)
27
  context = await browser.new_context(
 
29
  viewport={"width": 1366, "height": 768},
30
  )
31
 
32
+ # Cargar cookies desde variable de entorno si está definida
33
+ cookies_env = os.getenv("COOKIES_JSON")
34
+ if cookies_env:
35
+ try:
36
+ cookies = json.loads(cookies_env)
37
+ for cookie in cookies:
38
+ cookie["sameSite"] = "None" if cookie.get("sameSite") is None else cookie["sameSite"].capitalize()
39
+ await context.add_cookies(cookies)
40
+ print("✅ Cookies cargadas desde variable de entorno")
41
+ except Exception as e:
42
+ print(f"⚠️ Error cargando cookies desde variable de entorno: {e}")
43
 
44
  page = await context.new_page()
45
  await page.set_extra_http_headers({
 
48
  "Referer": "https://www.deviantart.com/",
49
  })
50
 
 
51
  await page.goto(url, timeout=60000)
52
  await page.wait_for_load_state("networkidle")
53
 
 
68
  seen_urls.add(item["img_url"])
69
 
70
  if len(collected_data) < max_imgs:
71
+ await page.evaluate("""window.scrollBy({top: window.innerHeight * 1.5, behavior: 'smooth'});""")
72
  await page.wait_for_timeout(3000)
73
  scroll_attempts += 1
74
+ try:
75
+ await page.wait_for_selector('img[srcset]:not([data-loaded])', timeout=5000)
76
+ except:
77
+ pass
78
 
79
  await browser.close()
80
+ return collected_data[:max_imgs]
81
+
82
def run_scraper(url, max_imgs):
    """Synchronous wrapper: drive the async scraper to completion and return its result."""
    image_cap = int(max_imgs)
    return asyncio.run(scrape_images(url, image_cap))
84
+
85
def interface_fn(url, max_imgs):
    """Gradio callback: scrape *url* and return (image_url, caption) pairs for the gallery."""
    gallery_items = []
    for item in run_scraper(url, max_imgs):
        caption = f"Usuario: {item['user']}"
        gallery_items.append((item["img_url"], caption))
    return gallery_items
89
+
90
# 🎨 Gradio UI: gallery-URL textbox + image-cap slider -> scraped image gallery.
demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.Textbox(label="URL de la galería DeviantArt", lines=1, value="https://www.deviantart.com/silkedead/gallery/68498591/screenshots-film-and-movie"),
        gr.Slider(minimum=10, maximum=300, step=1, value=30, label="Máximo de imágenes")
    ],
    # Gallery.style() was deprecated in Gradio 3.41 and removed in 4.x
    # (calling it raises AttributeError); layout options are now plain
    # constructor arguments: grid=[3] -> columns=3.
    outputs=gr.Gallery(label="Imágenes recolectadas", columns=3, height="auto"),
    title="Scraper de Imágenes - DeviantArt",
    description="Introduce la URL de la galería DeviantArt y la cantidad máxima de imágenes que quieres recolectar."
)

if __name__ == "__main__":
    demo.launch()