theme-gallery / processing.py
freddyaboulton's picture
Update processing.py
91400b2 verified
from __future__ import annotations
from typing import List, TypedDict
import huggingface_hub
from huggingface_hub.hf_api import SpaceInfo
from concurrent.futures import ThreadPoolExecutor
import os
import json
import datetime
import tqdm
import requests
from pathlib import Path
from screenshot import get_screen_shot
import boto3
from threading import Lock
class SpaceData(TypedDict):
    """Per-space record built by ``get_info`` and written to the JSON data file."""

    # Space repo id as reported by the Hub API.
    id: str
    # Like count reported by the Hub API.
    likes: int
    # Full app URL, built as ``https://<subdomain>.hf.space/``.
    subdomain: str
    # Last-modified date formatted as ``YYYY-MM-DD``.
    lastModified: str
    # Runtime stage string, e.g. "RUNNING", "SLEEPING", "BUILDING".
    status: str
    # Commit sha of the space repo; compared against the previous run's
    # value to decide whether screenshots are stale.
    # NOTE(review): presumably always set for spaces fetched via
    # ``space_info`` -- typed as optional to be safe.
    sha: str | None
    # Space id with "/" replaced by "_"; base name of the screenshot files.
    screenshot_id: str
# Local clone (in ./data) of the dataset repo that stores the collected theme
# metadata. NOTE: cloning and pulling happen at import time, and the token is
# read from the HF_TOKEN environment variable.
repo = huggingface_hub.Repository(
    local_dir="data",
    repo_type="dataset",
    clone_from="freddyaboulton/gradio-theme-subdomains",
    token=os.getenv("HF_TOKEN"),
)
repo.git_pull()

# Records from an earlier run, keyed by space id. Opened with a context
# manager so the file handle is closed as soon as the JSON is parsed.
# NOTE(review): this reads val_subdomains.json while process_spaces writes
# subdomains.json -- confirm the different file names are intended.
with open("data/val_subdomains.json") as prev_file:
    prev_data = {s["id"]: s for s in json.load(prev_file)}

# Directory inside the repo clone where screenshots are written.
screen_shot_dir = Path("data") / "images"
screen_shot_dir.mkdir(exist_ok=True, parents=True)

# S3 client used to upload screenshots; credentials come from the
# AWS_ACCESS_KEY and AWS_SECRET_ACCESS_KEY environment variables.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)

# Held around each get_screen_shot call so captures run one at a time even
# though spaces are processed on a thread pool.
lock = Lock()

# Shared Hub API client used by the functions below.
api = huggingface_hub.HfApi(token=os.getenv("HF_TOKEN"))
def get_theme_preview_spaces() -> List[SpaceInfo]:
    """Return every space the Hub lists under the ``gradio-theme`` filter."""
    matching_spaces = api.list_spaces(filter="gradio-theme")
    # Materialize the (possibly lazy) listing into a concrete list.
    return list(matching_spaces)
def _refresh_screenshots(space_info: SpaceInfo, subdomain: str, img_id: str) -> None:
    """Re-capture and upload light/dark screenshots when the space's sha changed."""
    prev_sha = prev_data.get(space_info.id, {}).get("sha")
    if prev_sha and prev_sha == space_info.sha:
        return

    # setdefault: a space that was absent from the previous run's data has no
    # entry yet, so plain indexing would raise KeyError.
    prev_data.setdefault(space_info.id, {})["sha"] = space_info.sha

    light_file = str(screen_shot_dir / f"{img_id}_light.jpg")
    dark_file = str(screen_shot_dir / f"{img_id}_dark.jpg")

    # Captures are serialized through the module-level lock.
    with lock:
        get_screen_shot(f"https://{subdomain}.hf.space?__theme=light", 3, light_file)
    with lock:
        get_screen_shot(f"https://{subdomain}.hf.space?__theme=dark", 3, dark_file)

    s3_client.upload_file(
        light_file,
        "gradio-theme-screenshots",
        img_id + "_light.jpg",
        ExtraArgs={"ContentType": "image/jpg"},
    )
    s3_client.upload_file(
        dark_file,
        "gradio-theme-screenshots",
        img_id + "_dark.jpg",
        ExtraArgs={"ContentType": "image/jpg"},
    )


def get_info(space_name: SpaceInfo) -> SpaceData | None:
    """Collect the gallery record for a single theme space.

    Args:
        space_name: Listing entry for the space (a ``SpaceInfo``, despite the
            parameter name); only its ``id`` is used to fetch full details.

    Returns:
        A dict with the space's id, likes, sha, last-modified date,
        screenshot id, runtime stage and app URL, or ``None`` when the entry
        has no id, the space is private, or it exposes no subdomain.
    """
    if not space_name.id:
        print(f"no space_name for {space_name}")
        return None

    space_info = api.space_info(space_name.id, token=os.getenv("HF_TOKEN"))
    if space_info.private:
        print(f"{space_name} is private")
        return None

    subdomain: str | None = getattr(space_info, "subdomain", None)
    if subdomain is None:
        print(f"no subdomain for {space_info.id}")
        return None

    status = space_info.runtime.stage
    img_id = space_info.id.replace("/", "_")

    # Screenshot refresh is deliberately switched off; the intended condition
    # is kept in the trailing comment so it can be restored.
    if False:  # status == "RUNNING":
        _refresh_screenshots(space_info, subdomain, img_id)

    if status not in ["SLEEPING", "RUNNING", "RUNNING_BUILDING", "BUILDING"]:
        print(f"Space not running, building, or sleeping {space_info.id}")
    elif status == "SLEEPING":
        # Best-effort request to the space page (presumably to wake it up).
        # The response is discarded, so a timeout bounds the wait and a
        # failure is logged instead of aborting the whole crawl.
        try:
            requests.get(f"https://huggingface.co/spaces/{space_info.id}", timeout=30)
        except requests.RequestException as err:
            print(f"could not reach sleeping space {space_info.id}: {err}")

    return {
        "id": space_info.id,
        "likes": space_info.likes,
        "sha": space_info.sha,
        "lastModified": space_info.lastModified.strftime("%Y-%m-%d"),
        "screenshot_id": img_id,
        "status": status,
        "subdomain": f"https://{subdomain}.hf.space/",
    }  # type: ignore
def get_all_info(spaces: List[SpaceInfo]) -> List[SpaceData]:
    """Run ``get_info`` over *spaces* on a 10-worker thread pool.

    Results keep the input order; falsy results (spaces that ``get_info``
    skipped) are dropped from the returned list.
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = pool.map(get_info, spaces)
        # tqdm only wraps the iterator to show progress; total is supplied
        # because the map iterator has no length.
        progress = tqdm.tqdm(pending, total=len(spaces))
        collected = list(progress)
    return [record for record in collected if record]
def process_spaces():
    """Collect info for every theme space, write it to JSON and push the repo.

    The records are written to ``data/subdomains.json`` inside the local
    repo clone, then the clone is pushed to the Hub without blocking.
    """
    theme_spaces = get_theme_preview_spaces()
    all_info = get_all_info(theme_spaces)

    # Close (and therefore flush) the file before pushing, so the commit is
    # guaranteed to contain the complete JSON.
    with open("data/subdomains.json", "w") as out_file:
        json.dump(all_info, out_file)

    repo.push_to_hub(
        blocking=False, commit_message=f"Updating data at {datetime.datetime.now()}"
    )