hf-public-data-insights

Running

App Files Files Community

hf-public-data-insights / python /0_download_files.py

littlebird13

Update python/0_download_files.py

f689fb8 verified 7 days ago

raw

history blame contribute delete

3.99 kB

	import os
	import requests
	from concurrent.futures import ThreadPoolExecutor
	from tqdm import tqdm
	import duckdb
	import random
	import argparse
	import yaml

	# Make sure the "public" output folder is present before anything is written into it
	os.makedirs("public", exist_ok=True)

	# Remote Parquet files to fetch: one per hub-stats table, all served from the same dataset repo
	urls = [
	    f"https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/{table}.parquet?download=true"
	    for table in ("models", "datasets", "spaces")
	]

	def download_file(url, overwrite=True, *, timeout=30, chunk_size=64 * 1024):
	    """Download ``url`` into the ``public`` folder while showing a progress bar.

	    The local file name is the last path segment of ``url`` with any query
	    string removed (``.../models.parquet?download=true`` -> ``public/models.parquet``).

	    Args:
	        url: Address of the file to fetch.
	        overwrite: When false and the target file already exists, the download
	            is skipped and the existing file is left untouched.
	        timeout: Seconds to wait for the server to connect / send data before
	            ``requests`` gives up, so a stalled server cannot hang the worker.
	        chunk_size: Number of bytes requested per streamed chunk.

	    Raises:
	        requests.RequestException: On connection problems, timeouts, or a
	            non-success HTTP status. No file is left at the target path in
	            that case unless it was already there before the call.
	    """
	    filename = os.path.join("public", url.split("/")[-1].split("?")[0])

	    if not overwrite and os.path.exists(filename):
	        print(f"File already exists: {filename}. Skipping download.")
	        return

	    # Stream into a sibling ".part" file and rename only once complete, so an
	    # interrupted run never leaves a truncated file that a later
	    # --no-overwrite run would mistake for a finished download.
	    partial_path = filename + ".part"
	    try:
	        with requests.get(url, stream=True, timeout=timeout) as response:
	            # Fail loudly instead of saving an HTML error page as Parquet.
	            response.raise_for_status()
	            total_size = int(response.headers.get("Content-Length", 0))

	            with open(partial_path, "wb") as file, tqdm(
	                desc=filename,
	                total=total_size,
	                unit="iB",
	                unit_scale=True,
	                unit_divisor=1024,
	                bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]",
	            ) as progress_bar:
	                for data in response.iter_content(chunk_size):
	                    progress_bar.update(file.write(data))

	        os.replace(partial_path, filename)
	    finally:
	        # After a successful rename the partial file is gone; this only
	        # cleans up after a failed or interrupted transfer.
	        if os.path.exists(partial_path):
	            os.remove(partial_path)

	    print(f"Downloaded: {filename}")

	def _local_parquet_path(url):
	    """Return the path inside ``public`` under which the file at ``url`` is stored."""
	    return os.path.join("public", url.split("/")[-1].split("?")[0])


	def _yaml_safe(value):
	    """Convert a fetched cell value into something ``yaml.safe_dump`` can serialise."""
	    if value is None or isinstance(value, (bool, int, float, str)):
	        return value
	    if isinstance(value, dict):
	        return {str(key): _yaml_safe(item) for key, item in value.items()}
	    if isinstance(value, (list, tuple)):
	        return [_yaml_safe(item) for item in value]
	    # Anything else (timestamps, decimals, ...) is written as its string form.
	    return str(value)


	def _write_example_yaml(filename):
	    """Write ``public/<table>.example.yaml`` describing the Parquet file ``filename``."""
	    table_name = os.path.splitext(os.path.basename(filename))[0]
	    # The path is embedded in a SQL string literal, so double any single quotes.
	    sql_path = filename.replace("'", "''")

	    con = duckdb.connect(database=":memory:")
	    try:
	        con.execute(f"CREATE VIEW {table_name} AS SELECT * FROM parquet_scan('{sql_path}')")

	        # DESCRIBE rows start with (column name, column type); later fields are ignored.
	        table_structure = con.execute(f"DESCRIBE {table_name}").fetchall()

	        # Up to 10 rows picked at random serve as examples.
	        random_items = con.execute(
	            f"SELECT * FROM {table_name} ORDER BY RANDOM() LIMIT 10"
	        ).fetchall()
	    finally:
	        con.close()

	    column_names = [row[0] for row in table_structure]
	    document = {
	        table_name: {
	            "table_structure": [
	                {"column": row[0], "type": str(row[1])} for row in table_structure
	            ],
	            "random_items": [
	                {name: _yaml_safe(value) for name, value in zip(column_names, item)}
	                for item in random_items
	            ],
	        }
	    }

	    yaml_file = os.path.join("public", f"{table_name}.example.yaml")
	    # Let the YAML library handle quoting and escaping so values containing
	    # ':' / '#' / newlines or nested lists still yield a parseable file.
	    with open(yaml_file, "w", encoding="utf-8") as file:
	        yaml.safe_dump(
	            document,
	            file,
	            sort_keys=False,
	            allow_unicode=True,
	            default_flow_style=False,
	        )

	    print(f"Generated: {yaml_file}")


	def main(overwrite):
	    """Download every file listed in ``urls`` and write an example YAML for each.

	    Args:
	        overwrite: Passed through to ``download_file``; when false, files that
	            already exist in ``public`` are not downloaded again.

	    For each downloaded Parquet file, ``public/<table>.example.yaml`` is
	    written with the column names and types plus up to 10 random rows.
	    Exceptions raised by a download propagate to the caller.
	    """
	    # One worker per file; at least one so an empty URL list does not raise.
	    with ThreadPoolExecutor(max_workers=max(1, len(urls))) as executor:
	        futures = [executor.submit(download_file, url, overwrite) for url in urls]

	        # result() re-raises any exception that happened in the worker thread.
	        for future in futures:
	            future.result()

	    print("All files downloaded successfully.")

	    for url in urls:
	        _write_example_yaml(_local_parquet_path(url))

	    print("Example files generated successfully.")

	if __name__ == "__main__":
	    # Command-line entry point: existing files are re-downloaded unless
	    # --no-overwrite is passed.
	    cli = argparse.ArgumentParser(description="Download and process Parquet files.")
	    cli.add_argument(
	        "--no-overwrite",
	        action="store_true",
	        help="Skip downloading files that already exist.",
	    )
	    skip_existing = cli.parse_args().no_overwrite

	    main(overwrite=not skip_existing)