"""Download Hugging Face Hub stats Parquet files and generate example YAML files.

For each downloaded Parquet file, a ``<table>.example.yaml`` describing the
table structure plus 10 random rows is written next to it in ``public/``.
"""
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import duckdb
import random
import argparse
import yaml

# Create the "public" folder if it doesn't exist
os.makedirs("public", exist_ok=True)

# URLs of the files to download
urls = [
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet?download=true",
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/datasets.parquet?download=true",
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/spaces.parquet?download=true",
]


def download_file(url, overwrite=True):
    """Stream-download *url* into the "public" folder with a progress bar.

    Args:
        url: Source URL; the local filename is derived from the last path
            segment with the query string stripped.
        overwrite: When False, skip the download if the file already exists.
    """
    filename = os.path.join("public", url.split("/")[-1].split("?")[0])

    if not overwrite and os.path.exists(filename):
        print(f"File already exists: {filename}. Skipping download.")
        return

    response = requests.get(url, stream=True)
    # Fail loudly on HTTP errors instead of writing an error page to the .parquet file
    response.raise_for_status()
    total_size = int(response.headers.get("Content-Length", 0))
    block_size = 1024  # 1 KB

    with open(filename, "wb") as file, tqdm(
        desc=filename,
        total=total_size,
        unit="iB",
        unit_scale=True,
        unit_divisor=1024,
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]",
    ) as progress_bar:
        for data in response.iter_content(block_size):
            size = file.write(data)
            progress_bar.update(size)

    print(f"Downloaded: {filename}")


def main(overwrite):
    """Download all Parquet files concurrently, then emit example YAML files.

    Args:
        overwrite: Forwarded to :func:`download_file` for each URL.
    """
    # One worker per file so all downloads run concurrently
    with ThreadPoolExecutor(max_workers=len(urls)) as executor:
        # Submit download tasks to the executor
        futures = [executor.submit(download_file, url, overwrite) for url in urls]
        # Wait for all tasks to complete; .result() re-raises any download error
        for future in futures:
            future.result()

    print("All files downloaded successfully.")

    # Process each downloaded Parquet file
    for url in urls:
        filename = os.path.join("public", url.split("/")[-1].split("?")[0])
        table_name = os.path.splitext(os.path.basename(filename))[0]

        # Connect to the Parquet file using an in-memory DuckDB database
        con = duckdb.connect(database=":memory:")
        try:
            con.execute(
                f"CREATE VIEW {table_name} AS SELECT * FROM parquet_scan('{filename}')"
            )

            # Retrieve the table structure
            table_structure = con.execute(f"DESCRIBE {table_name}").fetchall()

            # Build the YAML content by hand to keep the exact layout stable
            yaml_content = f"{table_name}:\n"
            yaml_content += "  table_structure:\n"
            for row in table_structure:
                column, dtype = row[:2]  # Unpack only the first two values
                yaml_content += f"    - column: {column}\n"
                yaml_content += f"      type: {dtype}\n"

            # Retrieve 10 random items from the table
            con.execute(
                f"CREATE VIEW {table_name}_random AS "
                f"SELECT * FROM {table_name} ORDER BY RANDOM() LIMIT 10"
            )
            random_items = con.execute(f"SELECT * FROM {table_name}_random").fetchall()

            yaml_content += "  random_items:\n"
            for item in random_items:
                yaml_content += "    - "
                for column, value in zip(
                    [row[0] for row in table_structure], item
                ):
                    yaml_content += f"{column}: {value}\n      "
                yaml_content = yaml_content.rstrip()  # Remove trailing spaces
                yaml_content += "\n"

            # Save the YAML content to a file in the "public" folder
            yaml_file = os.path.join("public", f"{table_name}.example.yaml")
            with open(yaml_file, "w") as file:
                file.write(yaml_content)

            print(f"Generated: {yaml_file}")
        finally:
            con.close()  # release the in-memory database handle

    print("Example files generated successfully.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download and process Parquet files.")
    parser.add_argument(
        "--no-overwrite",
        action="store_true",
        help="Skip downloading files that already exist.",
    )
    args = parser.parse_args()
    main(overwrite=not args.no_overwrite)