import subprocess

# One-time environment setup: download the Playwright browsers and the
# system libraries Chromium needs. check=True makes a failed install abort
# the script here instead of failing later at browser launch.
subprocess.run(["playwright", "install"], check=True)
subprocess.run(["playwright", "install-deps"], check=True)
subprocess.run(
    "apt-get update && apt-get install -y libnss3 libnspr4 libatk1.0-0 "
    "libatk-bridge2.0-0 libcups2 libatspi2.0-0 libxcomposite1 libxdamage1",
    shell=True,
    check=True,
)
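# Note: on Debian/Ubuntu images, `playwright install-deps` already installs
# these system libraries, so the explicit apt-get call above is a
# belt-and-braces fallback for images where that helper is unavailable.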

import os
import asyncio
import json

import gradio as gr
import nest_asyncio
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# Allow nested event loops so asyncio.run() works even when a loop is
# already running (e.g. in notebooks or some hosting environments).
nest_asyncio.apply()

async def scrapear_pinterest_async(categoria: str, cantidad: int = 50):
    """Scrape a Pinterest search and return up to `cantidad` image URLs."""
    urls = set()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()

        # Pinterest search results require an authenticated session, so load
        # cookies from the COOKIES_JSON environment variable.
        cookies_str = os.getenv("COOKIES_JSON")
        if not cookies_str:
            raise ValueError("The COOKIES_JSON environment variable is not set")
        cookies = json.loads(cookies_str)
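        # Expected COOKIES_JSON shape: a JSON array of Playwright-style cookie
        # dicts exported from a logged-in session (illustrative values, not
        # real credentials):
        # [{"name": "_pinterest_sess", "value": "<session token>",
        #   "domain": ".pinterest.com", "path": "/", "sameSite": "Lax"}]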

        # Playwright only accepts "Strict", "Lax" or "None" for sameSite;
        # normalize any other value to "Lax".
        for cookie in cookies:
            if 'sameSite' in cookie and cookie['sameSite'] not in ('Strict', 'Lax', 'None'):
                cookie['sameSite'] = 'Lax'

        await context.add_cookies(cookies)
        page = await context.new_page()

        search_url = f"https://mx.pinterest.com/search/pins/?q={categoria.replace(' ', '%20')}"
        await page.goto(search_url, timeout=60000)
        await asyncio.sleep(3)

        # Keep scrolling until enough URLs are collected or several scrolls
        # in a row yield nothing new.
        max_intentos_sin_cambios = 3
        intentos_sin_cambios = 0
        cantidad_anterior = 0

        while len(urls) < cantidad and intentos_sin_cambios < max_intentos_sin_cambios:
            await page.mouse.wheel(0, 5000)
            await asyncio.sleep(2)

            # Parse the rendered HTML and collect image URLs from srcset.
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')
            images = soup.find_all("img", {"srcset": True})

            for img in images:
                srcset = img.get("srcset")
                candidates = [u.strip().split(" ")[0] for u in srcset.split(",")]
                if candidates:
                    # The last srcset candidate is usually the largest rendition.
                    url_grande = candidates[-1]
                    if "736x" in url_grande or "originals" in url_grande:
                        urls.add(url_grande)  # set() already deduplicates

            print(f"Images found: {len(urls)}")
            if len(urls) >= cantidad:
                break

            if len(urls) == cantidad_anterior:
                intentos_sin_cambios += 1
                print(f"No new images found. Attempt {intentos_sin_cambios}/{max_intentos_sin_cambios}")
            else:
                intentos_sin_cambios = 0
                cantidad_anterior = len(urls)

        await browser.close()

    return list(urls)
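
# Quick standalone check (a sketch; assumes COOKIES_JSON is set):
#   urls = asyncio.run(scrapear_pinterest_async("Escaramuza", cantidad=10))
#   print(len(urls), urls[:3])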


def scrapear_pinterest(categoria: str, cantidad: int):
    # Synchronous wrapper so Gradio can call the async scraper.
    urls = asyncio.run(scrapear_pinterest_async(categoria, cantidad))
    if len(urls) < cantidad:
        print(f"⚠️ Only {len(urls)} of the {cantidad} requested images were found.")
    resultados = [{"img_url": url} for url in urls]
    return json.dumps(resultados, indent=2, ensure_ascii=False)
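
# Example output shape (illustrative URL):
# [
#   {"img_url": "https://i.pinimg.com/736x/ab/cd/ef/example.jpg"}
# ]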


iface = gr.Interface(
    fn=scrapear_pinterest,
    inputs=[
        gr.Textbox(label="Category to search", value="Escaramuza"),
        gr.Slider(minimum=1, maximum=500, step=1, label="Number of images", value=30),
    ],
    outputs=gr.JSON(label="Image URLs"),
    title="Pinterest Scraper with Playwright and Gradio",
    description="Enter a category and how many images to fetch from Pinterest. Returns a list of URLs as JSON.",
)
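
# When hosted (e.g. on a Hugging Face Space) the launch() below serves the
# app; locally, iface.launch(share=True) can also be used for a public link.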

if __name__ == "__main__":
    iface.launch()