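"""Gradio app that scrapes Pinterest image URLs for a search term using Playwright.

Pinterest session cookies are read from the COOKIES_JSON environment variable
(configured as a secret).
"""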
import subprocess

# Install Playwright browsers and the system libraries Chromium needs at startup.
subprocess.run(["playwright", "install"])
subprocess.run(["playwright", "install-deps"])
subprocess.run(
    "apt-get update && apt-get install -y libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libatspi2.0-0 libxcomposite1 libxdamage1",
    shell=True,
    check=True,
)
import os
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import json
import nest_asyncio
import gradio as gr
# Patch asyncio so asyncio.run() can be called even when an event loop is already running.
nest_asyncio.apply()


async def scrapear_pinterest_async(categoria: str, cantidad: int = 50):
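    """Scroll a Pinterest search for `categoria` and collect up to `cantidad` high-resolution image URLs."""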
    urls = set()
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()

        # Read the Pinterest session cookies from an environment variable (secret);
        # context.add_cookies() expects a list of cookie dicts.
        cookies_str = os.getenv("COOKIES_JSON")
        if not cookies_str:
            raise ValueError("The COOKIES_JSON environment variable is not set")
        cookies = json.loads(cookies_str)
        for cookie in cookies:
            # Normalize sameSite values that Playwright does not accept.
            if 'sameSite' in cookie:
                if cookie['sameSite'] not in ['Strict', 'Lax', 'None']:
                    cookie['sameSite'] = 'Lax'
        await context.add_cookies(cookies)

        page = await context.new_page()
        search_url = f"https://mx.pinterest.com/search/pins/?q={categoria.replace(' ', '%20')}"
        await page.goto(search_url, timeout=60000)
        await asyncio.sleep(3)

        # Stop after a few consecutive scrolls that yield no new images.
        max_intentos_sin_cambios = 3
        intentos_sin_cambios = 0
        cantidad_anterior = 0

        while len(urls) < cantidad and intentos_sin_cambios < max_intentos_sin_cambios:
            await page.mouse.wheel(0, 5000)
            await asyncio.sleep(2)

            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')
            images = soup.find_all("img", {"srcset": True})
            for img in images:
                srcset = img.get("srcset")
                # The last srcset candidate is the largest rendition of the pin.
                candidates = [u.strip().split(" ")[0] for u in srcset.split(",")]
                if candidates:
                    url_grande = candidates[-1]
                    if "736x" in url_grande or "originals" in url_grande:
                        if url_grande not in urls:
                            urls.add(url_grande)

            print(f"Images found: {len(urls)}")
            if len(urls) >= cantidad:
                break

            # Check whether this scroll produced any new images.
            if len(urls) == cantidad_anterior:
                intentos_sin_cambios += 1
                print(f"No new images found. Attempt {intentos_sin_cambios}/{max_intentos_sin_cambios}")
            else:
                intentos_sin_cambios = 0
            cantidad_anterior = len(urls)

        await browser.close()
    return list(urls)


def scrapear_pinterest(categoria: str, cantidad: int):
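    """Synchronous wrapper used by Gradio: run the async scraper and return the URLs as JSON."""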
    urls = asyncio.run(scrapear_pinterest_async(categoria, cantidad))
    if len(urls) < cantidad:
        print(f"⚠️ Only {len(urls)} of the {cantidad} requested images were found.")
    resultados = [{"img_url": url} for url in urls]
    return json.dumps(resultados, indent=2, ensure_ascii=False)

# Gradio UI: a search box and a slider wired to the scraper.
iface = gr.Interface(
    fn=scrapear_pinterest,
    inputs=[
        gr.Textbox(label="Category to search", value="Escaramuza"),
        gr.Slider(minimum=1, maximum=500, step=1, label="Number of images", value=30),
    ],
    outputs=gr.JSON(label="Image URLs"),
    title="Pinterest Scraper with Playwright and Gradio",
    description="Enter the category and the number of images to fetch from Pinterest. Returns a list of URLs in JSON format."
)
if __name__ == "__main__":
    iface.launch()