Spaces:
Sleeping
Sleeping
from __future__ import annotations | |
from typing import List, TypedDict | |
import huggingface_hub | |
from huggingface_hub.hf_api import SpaceInfo | |
from concurrent.futures import ThreadPoolExecutor | |
import os | |
import json | |
import datetime | |
import tqdm | |
import requests | |
from pathlib import Path | |
from screenshot import get_screen_shot | |
import boto3 | |
from threading import Lock | |
class SpaceData(TypedDict):
    """Record describing one theme Space, as built by ``get_info``.

    The keys mirror the dictionary literal that ``get_info`` returns, including
    ``sha`` and ``screenshot_id``, which were previously missing from this
    declaration.
    """

    # Hub id of the Space (``space_info.id``).
    id: str
    # Like count reported by the Hub.
    likes: int
    # Full app URL of the form "https://<subdomain>.hf.space/".
    subdomain: str
    # Last-modified date formatted as "%Y-%m-%d".
    lastModified: str
    # Runtime stage of the Space, e.g. "RUNNING" or "SLEEPING".
    status: str
    # Commit sha reported for the Space (``space_info.sha``).
    sha: str
    # Space id with "/" replaced by "_"; prefix of the screenshot file names.
    screenshot_id: str
# Local clone (in ./data) of the dataset repo that holds the results; pulled
# at import time so the run starts from the latest committed state.
repo = huggingface_hub.Repository(
    local_dir="data",
    repo_type="dataset",
    clone_from="freddyaboulton/gradio-theme-subdomains",
    token=os.getenv("HF_TOKEN"),
)
repo.git_pull()

# Previously recorded entries, keyed by Space id. The file is read through a
# context manager so the handle is closed immediately instead of being leaked,
# and decoded explicitly as UTF-8 rather than with the locale default.
# NOTE(review): this reads val_subdomains.json while process_spaces writes
# subdomains.json -- confirm the different file names are intended.
with open("data/val_subdomains.json", encoding="utf-8") as _prev_file:
    prev_data = {s["id"]: s for s in json.load(_prev_file)}

# Directory where screenshot files are written before upload.
screen_shot_dir = Path("data") / "images"
screen_shot_dir.mkdir(exist_ok=True, parents=True)

# S3 client used to upload screenshots; credentials come from the environment.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)

# Held around each get_screen_shot call so only one worker thread takes a
# screenshot at a time.
lock = Lock()

# Shared Hub API client, authenticated with HF_TOKEN.
api = huggingface_hub.HfApi(token=os.getenv("HF_TOKEN"))
def get_theme_preview_spaces() -> List[SpaceInfo]:
    """Return all Spaces the Hub lists under the ``gradio-theme`` filter."""
    matching_spaces = api.list_spaces(filter="gradio-theme")
    # Materialize the listing so callers get a real list (with a length).
    return list(matching_spaces)
# Screenshot capture is switched off (the original gate was a hard-coded
# ``if False``). Set to True to re-enable it for spaces in the RUNNING stage.
_SCREENSHOTS_ENABLED = False

# Runtime stages for which no warning is printed.
_EXPECTED_STAGES = ("SLEEPING", "RUNNING", "RUNNING_BUILDING", "BUILDING")

# Seconds to wait for the request sent to a sleeping Space's page, so a stalled
# connection cannot block a worker thread forever.
_WAKE_UP_TIMEOUT = 30


def _refresh_screenshots(space_info: SpaceInfo, subdomain: str, img_id: str) -> None:
    """Re-capture and upload light/dark screenshots if the Space's sha changed."""
    known_sha = prev_data.get(space_info.id, {}).get("sha")
    if known_sha and known_sha == space_info.sha:
        return

    # setdefault: a Space seen for the first time has no entry in prev_data,
    # so a plain prev_data[id]["sha"] assignment would raise KeyError.
    prev_data.setdefault(space_info.id, {})["sha"] = space_info.sha

    themes = ("light", "dark")
    local_files = {
        theme: str(screen_shot_dir / Path(f"{img_id}_{theme}.jpg")) for theme in themes
    }

    for theme in themes:
        # Screenshots are taken one at a time across worker threads.
        with lock:
            get_screen_shot(
                f"https://{subdomain}.hf.space?__theme={theme}", 3, local_files[theme]
            )

    for theme in themes:
        s3_client.upload_file(
            local_files[theme],
            "gradio-theme-screenshots",
            f"{img_id}_{theme}.jpg",
            ExtraArgs={"ContentType": "image/jpg"},
        )


def get_info(space_name: SpaceInfo) -> SpaceData | None:
    """Collect the record for one theme Space.

    Args:
        space_name: Listing entry for the Space (a ``SpaceInfo`` despite the
            parameter name); only its ``id`` is used to fetch full details.

    Returns:
        A dictionary with the Space's id, likes, sha, last-modified date,
        screenshot id, runtime stage and app URL, or ``None`` when the Space
        has no id, is private, has no subdomain, or has no runtime
        information. Each skip prints a short message.
    """
    if not space_name.id:
        print(f"no space_name for {space_name}")
        return None

    space_info = api.space_info(space_name.id, token=os.getenv("HF_TOKEN"))
    if space_info.private:
        print(f"{space_name} is private")
        return None

    subdomain: str | None = getattr(space_info, "subdomain", None)
    if subdomain is None:
        print(f"no subdomain for {space_info.id}")
        return None

    # ``runtime`` may be missing; skip the Space instead of raising
    # AttributeError on ``.stage`` and aborting the whole batch.
    runtime = getattr(space_info, "runtime", None)
    if runtime is None:
        print(f"no runtime info for {space_info.id}")
        return None

    status = runtime.stage
    img_id = space_info.id.replace("/", "_")

    if _SCREENSHOTS_ENABLED and status == "RUNNING":
        _refresh_screenshots(space_info, subdomain, img_id)

    if status not in _EXPECTED_STAGES:
        print(f"Space not running, building, or sleeping {space_info.id}")
    elif status == "SLEEPING":
        # Best-effort request to the Space page (presumably to wake it up);
        # the response is ignored, so a failure here must not lose the record.
        try:
            requests.get(
                f"https://huggingface.co/spaces/{space_info.id}",
                timeout=_WAKE_UP_TIMEOUT,
            )
        except requests.RequestException as err:
            print(f"could not reach sleeping space {space_info.id}: {err}")

    return {
        "id": space_info.id,
        "likes": space_info.likes,
        "sha": space_info.sha,
        "lastModified": space_info.lastModified.strftime("%Y-%m-%d"),
        "screenshot_id": img_id,
        "status": status,
        "subdomain": f"https://{subdomain}.hf.space/",
    }  # type: ignore
def get_all_info(spaces: List[SpaceInfo]) -> List[SpaceData]:
    """Run ``get_info`` over *spaces* on 10 worker threads and keep the hits.

    A tqdm progress bar tracks completion; falsy results (skipped spaces) are
    dropped from the returned list.
    """
    pool = ThreadPoolExecutor(max_workers=10)
    with pool:
        progress = tqdm.tqdm(pool.map(get_info, spaces), total=len(spaces))
        collected = list(progress)
    # filter(None, ...) discards the None entries returned for skipped spaces.
    return list(filter(None, collected))
def process_spaces() -> None:
    """Scrape all theme Spaces, write the results to disk and push them.

    The collected records are dumped to ``data/subdomains.json`` and the local
    dataset clone is then pushed to the Hub without blocking.
    """
    theme_spaces = list(get_theme_preview_spaces())
    all_info = get_all_info(theme_spaces)

    # Write through a context manager so the file is flushed and closed
    # before the repository is committed and pushed.
    with open("data/subdomains.json", "w", encoding="utf-8") as out_file:
        json.dump(all_info, out_file)

    repo.push_to_hub(
        blocking=False, commit_message=f"Updating data at {datetime.datetime.now()}"
    )