|
import os |
|
import requests |
|
from concurrent.futures import ThreadPoolExecutor |
|
from tqdm import tqdm |
|
import duckdb |
|
import random |
|
import argparse |
|
import yaml |
|
|
|
|
|
# Every downloaded Parquet file and generated example file is written
# beneath this directory, so make sure it exists before anything else runs.
os.makedirs("public", exist_ok=True)

# Download locations of the Parquet files published in the
# cfahlgren1/hub-stats dataset on huggingface.co, one per table.
_HUB_STATS_URL = (
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/"
    "{name}.parquet?download=true"
)
urls = [
    _HUB_STATS_URL.format(name=name)
    for name in ("models", "datasets", "spaces")
]
|
|
|
def download_file(url, overwrite=True):
    """Download ``url`` into the ``public`` directory with a progress bar.

    The local file name is the last path segment of ``url`` with any query
    string removed. Data is streamed into a temporary ``.part`` file and only
    moved to its final name once the transfer has completed, so an
    interrupted download never leaves a truncated file under the final name.

    Args:
        url: HTTP(S) URL of the file to fetch.
        overwrite: When false and the target file already exists, the
            download is skipped and the existing file is left untouched.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If the target file cannot be written.
    """
    filename = os.path.join("public", url.split("/")[-1].split("?")[0])

    if not overwrite and os.path.exists(filename):
        print(f"File already exists: {filename}. Skipping download.")
        return

    # Do not rely on the output directory having been created elsewhere.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    partial_name = f"{filename}.part"
    block_size = 64 * 1024

    try:
        # The context manager releases the connection even if streaming
        # fails; the timeout keeps a stalled server from blocking forever.
        with requests.get(url, stream=True, timeout=30) as response:
            # Fail loudly instead of saving an HTML error page as Parquet.
            response.raise_for_status()
            total_size = int(response.headers.get("Content-Length", 0))

            with open(partial_name, "wb") as file, tqdm(
                desc=filename,
                total=total_size,
                unit="iB",
                unit_scale=True,
                unit_divisor=1024,
                bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]",
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    size = file.write(data)
                    progress_bar.update(size)

        # Publish the finished file under its final name in one step.
        os.replace(partial_name, filename)
    finally:
        # Only present here when the download did not complete.
        if os.path.exists(partial_name):
            os.remove(partial_name)

    print(f"Downloaded: {filename}")
|
|
|
def main(overwrite):
    """Download every file in ``urls`` and write an example file for each.

    The downloads run concurrently on a small thread pool. Afterwards each
    downloaded Parquet file is inspected with DuckDB and a
    ``public/<table>.example.yaml`` file is written containing the column
    names and types plus up to ten randomly chosen rows.

    Args:
        overwrite: Forwarded to ``download_file``; when false, files that
            already exist locally are not downloaded again.

    Raises:
        Exception: Any error raised by a download worker is re-raised here
            by ``future.result()``; DuckDB and file-system errors from the
            example generation propagate unchanged.
    """
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(download_file, url, overwrite) for url in urls]

        # result() re-raises whatever exception the worker hit, so a failed
        # download stops the run instead of being silently ignored.
        for future in futures:
            future.result()

    print("All files downloaded successfully.")

    for url in urls:
        yaml_file = _write_example_yaml(url)
        print(f"Generated: {yaml_file}")

    print("Example files generated successfully.")


def _write_example_yaml(url):
    """Write the example file for the Parquet file of ``url``; return its path."""
    filename = os.path.join("public", url.split("/")[-1].split("?")[0])
    table_name = os.path.splitext(os.path.basename(filename))[0]

    con = duckdb.connect(database=":memory:")
    try:
        con.execute(
            f"CREATE VIEW {table_name} AS SELECT * FROM parquet_scan('{filename}')"
        )
        table_structure = con.execute(f"DESCRIBE {table_name}").fetchall()
        random_items = con.execute(
            f"SELECT * FROM {table_name} ORDER BY RANDOM() LIMIT 10"
        ).fetchall()
    finally:
        # Release the connection even when a query fails.
        con.close()

    yaml_file = os.path.join("public", f"{table_name}.example.yaml")
    # Explicit UTF-8 so non-ASCII values do not depend on the platform's
    # default encoding.
    with open(yaml_file, "w", encoding="utf-8") as file:
        file.write(_render_example_yaml(table_name, table_structure, random_items))

    return yaml_file


def _render_example_yaml(table_name, table_structure, random_items):
    """Return the example document text for one table.

    NOTE: values are interpolated with plain ``str()`` formatting and are not
    quoted or escaped, so the result only parses as strict YAML when the
    values contain no YAML-significant characters.
    """
    column_names = [row[0] for row in table_structure]

    parts = [f"{table_name}:\n", "  table_structure:\n"]
    for row in table_structure:
        column, dtype = row[:2]
        parts.append(f"    - column: {column}\n")
        # Aligned with "column" so both keys belong to the same list item.
        parts.append(f"      type: {dtype}\n")

    parts.append("  random_items:\n")
    for item in random_items:
        entry = "    - " + "".join(
            f"{column}: {value}\n      " for column, value in zip(column_names, item)
        )
        # Drop the continuation indent left after the last key of the item.
        parts.append(entry.rstrip() + "\n")

    return "".join(parts)
|
|
|
if __name__ == "__main__":
    # Command-line entry point: files are re-downloaded by default, and
    # --no-overwrite keeps any file that is already present locally.
    cli = argparse.ArgumentParser(description="Download and process Parquet files.")
    cli.add_argument(
        "--no-overwrite",
        action="store_true",
        help="Skip downloading files that already exist.",
    )
    options = cli.parse_args()

    should_overwrite = not options.no_overwrite
    main(overwrite=should_overwrite)
|
|