hf-public-data-insights

Running

hf-public-data-insights / python /1_parents.py

Xianbao QIAN

use client side rendering for the homepage.

c80b461 28 days ago

1.71 kB

	import duckdb
	import yaml
	import time

	def extract_base_models(tags):
	base_models = set() # Use a set to ensure uniqueness
	for tag in tags:
	if tag.startswith("base_model:"):
	base_model = tag.split(":")[-1]
	base_model = ''.join(c for c in base_model if c.isalnum() or c in '/_.-')
	base_models.add(base_model) # Add to set
	return list(base_models) # Convert back to list

	# Create a DuckDB connection
	con = duckdb.connect()

	# Register the Python UDF with explicit return type
	con.create_function("extract_base_models", extract_base_models, return_type="VARCHAR[]")

	# Query to extract base models using the Python UDF
	query = """
	SELECT
	_id,
	id,
	extract_base_models(tags) AS base_models
	FROM parquet_scan('public/models.parquet')
	"""

	start_time = time.time()

	# Execute the query and create a view
	con.execute(f"CREATE VIEW parent_models AS {query}")

	# Write the view to a parquet file
	con.execute("COPY parent_models TO 'public/parents.parquet' (FORMAT 'parquet')")

	end_time = time.time()
	execution_time = end_time - start_time

	print(f"Query execution time: {execution_time:.2f} seconds")

	# Filter rows with non-empty base_models
	con.execute("""
	CREATE VIEW non_empty_base_models AS
	SELECT *
	FROM parent_models
	WHERE ARRAY_LENGTH(base_models) > 0
	""")

	# Write a random sample of 10 rows with non-empty base_models to yaml file for inspection
	result = con.execute("""
	SELECT _id, id, base_models
	FROM non_empty_base_models
	ORDER BY RANDOM()
	LIMIT 10
	""").fetchall()

	with open("public/parents.example.yaml", "w") as f:
	yaml.safe_dump(result, f, default_flow_style=False)