# Extract the base models named in each model's tags from
# public/models.parquet, write them to public/parents.parquet, and dump a
# small random sample to public/parents.example.yaml for inspection.
import duckdb
import yaml
import time
def extract_base_models(tags):
    """Return the unique base-model names referenced by a list of tags.

    Only tags that start with ``base_model:`` are considered. The name is
    whatever follows the last colon of the tag, with every character other
    than alphanumerics and ``/``, ``_``, ``.``, ``-`` removed.

    Args:
        tags: Iterable of tag strings, or ``None``. Entries that are not
            strings are ignored.

    Returns:
        A sorted list of distinct, non-empty base-model names. The list is
        empty when ``tags`` is ``None`` or contains no usable
        ``base_model:`` tag.
    """
    if tags is None:
        return []

    base_models = set()  # A set keeps the names unique
    for tag in tags:
        if not isinstance(tag, str) or not tag.startswith("base_model:"):
            continue
        # Keep only the part after the last colon, then strip unexpected characters
        name = tag.rsplit(":", 1)[-1]
        name = "".join(c for c in name if c.isalnum() or c in "/_.-")
        if name:  # A bare "base_model:" tag carries no name
            base_models.add(name)

    # Sorting makes the output independent of set iteration order
    return sorted(base_models)
# Open a DuckDB connection with default settings
con = duckdb.connect()

# Make the Python helper callable from SQL under the same name; the return
# type is declared explicitly as a list of strings
con.create_function(
    "extract_base_models",
    extract_base_models,
    return_type="VARCHAR[]",
)
# Query that derives, for every row of the input file, the list of base
# models named in its tags (via the Python UDF)
query = """
SELECT
_id,
id,
extract_base_models(tags) AS base_models
FROM parquet_scan('public/models.parquet')
"""

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way time.time() can
start_time = time.perf_counter()

# Define the view, then write its rows to a parquet file
con.execute(f"CREATE VIEW parent_models AS {query}")
con.execute("COPY parent_models TO 'public/parents.parquet' (FORMAT 'parquet')")

end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"Query execution time: {execution_time:.2f} seconds")
# Filter rows with non-empty base_models.
# A view is re-evaluated each time it is referenced, so selecting from
# parent_models here would run the Python UDF over the whole input again.
# Read the rows back from the parquet file written earlier in this script.
con.execute("""
CREATE VIEW non_empty_base_models AS
SELECT *
FROM parquet_scan('public/parents.parquet')
WHERE ARRAY_LENGTH(base_models) > 0
""")
# Pick a random sample of 10 rows with non-empty base_models for inspection
result = con.execute("""
SELECT _id, id, base_models
FROM non_empty_base_models
ORDER BY RANDOM()
LIMIT 10
""").fetchall()

# Write the sample as YAML; the encoding is pinned so the output file does
# not depend on the platform's default text encoding
with open("public/parents.example.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(result, f, default_flow_style=False)
|