hf-public-data-insights

Running

File size: 1,708 Bytes

import duckdb
import yaml
import time

def extract_base_models(tags):
    base_models = set()  # Use a set to ensure uniqueness
    for tag in tags:
        if tag.startswith("base_model:"):
            base_model = tag.split(":")[-1]
            base_model = ''.join(c for c in base_model if c.isalnum() or c in '/_.-')
            base_models.add(base_model)  # Add to set
    return list(base_models)  # Convert back to list

# Create a DuckDB connection
con = duckdb.connect()

# Register the Python UDF with explicit return type
con.create_function("extract_base_models", extract_base_models, return_type="VARCHAR[]")

# Query to extract base models using the Python UDF
query = """
    SELECT 
        _id, 
        id, 
        extract_base_models(tags) AS base_models
    FROM parquet_scan('public/models.parquet')
"""

start_time = time.time()

# Execute the query and create a view
con.execute(f"CREATE VIEW parent_models AS {query}")

# Write the view to a parquet file
con.execute("COPY parent_models TO 'public/parents.parquet' (FORMAT 'parquet')")

end_time = time.time()
execution_time = end_time - start_time

print(f"Query execution time: {execution_time:.2f} seconds")

# Filter rows with non-empty base_models
con.execute("""
    CREATE VIEW non_empty_base_models AS
    SELECT * 
    FROM parent_models
    WHERE ARRAY_LENGTH(base_models) > 0
""")

# Write a random sample of 10 rows with non-empty base_models to yaml file for inspection
result = con.execute("""
    SELECT _id, id, base_models 
    FROM non_empty_base_models
    ORDER BY RANDOM()
    LIMIT 10
""").fetchall()

with open("public/parents.example.yaml", "w") as f:
    yaml.safe_dump(result, f, default_flow_style=False)