hf-public-data-insights / python /0_download_files.py
littlebird13's picture
Update python/0_download_files.py
f689fb8 verified
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import duckdb
import random
import argparse
import yaml
# Remote Parquet exports to fetch: one each for models, datasets and spaces
urls = [
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet?download=true",
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/datasets.parquet?download=true",
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/spaces.parquet?download=true",
]

# Make sure the "public" output folder is present before anything is written to it
os.makedirs("public", exist_ok=True)
def download_file(url, overwrite=True):
    """Download ``url`` into the "public" folder while showing a progress bar.

    The local file name is the last path segment of the URL with any query
    string removed.

    Args:
        url: HTTP(S) URL of the file to fetch.
        overwrite: When False and the target file already exists, the
            download is skipped.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If the local file cannot be written.
    """
    filename = os.path.join("public", url.split("/")[-1].split("?")[0])
    if not overwrite and os.path.exists(filename):
        print(f"File already exists: {filename}. Skipping download.")
        return

    # Stream into a side file first so that an interrupted transfer never
    # leaves a truncated file under the final name (which a later
    # --no-overwrite run would otherwise accept as complete).
    partial_name = f"{filename}.part"
    block_size = 1024 * 1024  # 1 MiB per chunk keeps the Python loop cheap

    # (connect, read) timeouts: without them a stalled server blocks forever.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        # Fail loudly instead of saving an HTML error page as a .parquet file.
        response.raise_for_status()
        total_size = int(response.headers.get("Content-Length", 0))
        try:
            with open(partial_name, "wb") as file, tqdm(
                desc=filename,
                total=total_size,
                unit="iB",
                unit_scale=True,
                unit_divisor=1024,
                bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]",
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    progress_bar.update(file.write(data))
        except BaseException:
            # Drop the incomplete side file, then let the error propagate.
            if os.path.exists(partial_name):
                os.remove(partial_name)
            raise

    # Publish the finished download under its final name in one step.
    os.replace(partial_name, filename)
    print(f"Downloaded: {filename}")
def _render_example_yaml(table_name, table_structure, random_items):
    """Render the example document for one table as YAML-style text.

    Args:
        table_name: Top-level key of the document.
        table_structure: Rows returned by ``DESCRIBE``; only the first two
            fields of each row (column name, column type) are used.
        random_items: Sample rows whose values follow the column order of
            ``table_structure``.

    Returns:
        The whole document as a single string ending with a newline.
    """
    # NOTE(review): values are interpolated verbatim, without quoting or
    # escaping. A value that contains a newline or ": " can still produce
    # text that a YAML parser reads differently from what was intended.
    columns = [row[0] for row in table_structure]

    lines = [f"{table_name}:", "  table_structure:"]
    for row in table_structure:
        column, dtype = row[:2]  # Unpack only the first two values
        lines.append(f"    - column: {column}")
        lines.append(f"      type: {dtype}")

    lines.append("  random_items:")
    for item in random_items:
        # One list entry per row; continuation keys are aligned under the
        # first key so they stay inside the same mapping.
        entry = "    - " + "\n      ".join(
            f"{column}: {value}" for column, value in zip(columns, item)
        )
        lines.append(entry.rstrip())  # Remove trailing spaces

    return "\n".join(lines) + "\n"


def main(overwrite):
    """Download every file in ``urls`` and write an example YAML per table.

    The files are downloaded concurrently. Each downloaded Parquet file is
    then inspected with DuckDB, and its column structure plus up to 10
    randomly chosen rows are written to ``public/<table>.example.yaml``.

    Args:
        overwrite: Passed through to ``download_file``; when False, files
            that already exist locally are not downloaded again.

    Returns:
        None.
    """
    # One worker per file; ThreadPoolExecutor rejects max_workers=0.
    with ThreadPoolExecutor(max_workers=max(1, len(urls))) as executor:
        futures = [executor.submit(download_file, url, overwrite) for url in urls]
        # result() re-raises any exception from the worker thread.
        for future in futures:
            future.result()
    print("All files downloaded successfully.")

    # Process each downloaded Parquet file
    for url in urls:
        filename = os.path.join("public", url.split("/")[-1].split("?")[0])
        table_name = os.path.splitext(os.path.basename(filename))[0]

        con = duckdb.connect(database=":memory:")
        try:
            con.execute(f"CREATE VIEW {table_name} AS SELECT * FROM parquet_scan('{filename}')")
            table_structure = con.execute(f"DESCRIBE {table_name}").fetchall()
            random_items = con.execute(
                f"SELECT * FROM {table_name} ORDER BY RANDOM() LIMIT 10"
            ).fetchall()
        finally:
            # Release the in-memory database even if a query fails.
            con.close()

        yaml_content = _render_example_yaml(table_name, table_structure, random_items)

        # Explicit UTF-8: sampled rows may contain non-ASCII text, which the
        # platform default encoding cannot always represent.
        yaml_file = os.path.join("public", f"{table_name}.example.yaml")
        with open(yaml_file, "w", encoding="utf-8") as file:
            file.write(yaml_content)
        print(f"Generated: {yaml_file}")

    print("Example files generated successfully.")
if __name__ == "__main__":
    # Command-line entry point: files are re-downloaded unless --no-overwrite is given.
    cli = argparse.ArgumentParser(description="Download and process Parquet files.")
    cli.add_argument(
        "--no-overwrite",
        action="store_true",
        help="Skip downloading files that already exist.",
    )
    options = cli.parse_args()
    keep_existing = options.no_overwrite
    main(overwrite=not keep_existing)