File size: 6,955 Bytes
101d805 c80b461 101d805 913402a 101d805 913402a c80b461 101d805 c80b461 101d805 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
import duckdb
import yaml
import time
import logging
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Create a DuckDB connection
con = duckdb.connect()
def execute_with_timing(query, description):
"""Execute a DuckDB query and log the execution time."""
start_time = time.perf_counter() # Use perf_counter for higher resolution timing
con.execute(query)
end_time = time.perf_counter() # End timing after the query execution
logging.info(f"Completed {description} in {end_time - start_time:.6f} seconds.")
# Start timing the total execution
total_start_time = time.perf_counter()
# Load parents.parquet into an in-memory table
load_parents_query = """
CREATE TABLE parents_in_memory AS
SELECT * FROM parquet_scan('public/parents.parquet')
"""
execute_with_timing(load_parents_query, "Loaded parents.parquet into RAM")
# Step 1: Assign a unique numerical ID to each model ID
unique_id_query = """
CREATE TABLE unique_ids AS
SELECT
id,
ROW_NUMBER() OVER () AS tmp_id
FROM parents_in_memory
"""
execute_with_timing(unique_id_query, "Step 1: Created unique_ids table")
# Step 2: Unnest base_models and create a temporary table
unnest_query = """
CREATE TABLE unnested_models AS
SELECT
u.tmp_id AS child_tmp_id, -- Numerical ID for the child model
UNNEST(p.base_models) AS base_model
FROM parents_in_memory p
JOIN unique_ids u ON p.id = u.id
WHERE p.base_models IS NOT NULL -- Filter out models without base models
"""
execute_with_timing(unnest_query, "Step 2: Created unnested_models table")
# Step 3: Create a temporary table for direct parent mapping using numerical IDs
parent_level_query = """
CREATE TABLE parent_level AS
SELECT
u.child_tmp_id, -- Numerical ID for the child model
b.tmp_id AS base_tmp_id -- Numerical ID for the base model (parent)
FROM unnested_models u
JOIN unique_ids b ON u.base_model = b.id
"""
execute_with_timing(parent_level_query, "Step 3: Created parent_level table")
# Step 4: Recursive CTE to find all ancestor-children mappings using numerical IDs
ancestor_children_query = """
CREATE TABLE ancestor_children AS
WITH RECURSIVE ancestor_children_cte AS (
SELECT
base_tmp_id AS ancestor_tmp_id, -- Start with direct parent as ancestor
child_tmp_id AS child_tmp_id, -- Direct child
1 AS depth -- Initialize depth counter
FROM parent_level
UNION ALL
SELECT
ac.ancestor_tmp_id, -- Propagate ancestor
pl.child_tmp_id, -- Find new child in the chain
ac.depth + 1 -- Increment depth counter
FROM parent_level pl
JOIN ancestor_children_cte ac ON pl.base_tmp_id = ac.child_tmp_id
WHERE ac.depth < 20 -- Limit recursion to 10 levels
)
SELECT
a.id AS ancestor,
LIST(DISTINCT c.id) AS all_children
FROM ancestor_children_cte ac
JOIN unique_ids a ON ac.ancestor_tmp_id = a.tmp_id
JOIN unique_ids c ON c.tmp_id = ac.child_tmp_id
GROUP BY ancestor
"""
execute_with_timing(ancestor_children_query, "Step 4: Created ancestor_children table with string IDs")
# Create a direct children mapping table
direct_children_mapping_query = """
CREATE TABLE direct_children_mapping AS
SELECT
p.id AS parent,
LIST(DISTINCT u.id) AS direct_children
FROM parents_in_memory p
LEFT JOIN unnested_models um ON p.id = um.base_model
LEFT JOIN unique_ids u ON um.child_tmp_id = u.tmp_id
GROUP BY p.id
"""
execute_with_timing(direct_children_mapping_query, "Created direct_children_mapping table")
# Write the final result to a parquet file, using direct_children_mapping for direct_children
start_time = time.perf_counter()
final_output_query = """
COPY (
SELECT
ac.ancestor as ancestor,
dcm.direct_children as direct_children,
ac.all_children as all_children,
CAST(array_length(ac.all_children) AS INTEGER) as all_children_count,
CAST(array_length(dcm.direct_children) AS INTEGER) as direct_children_count
FROM ancestor_children ac
LEFT JOIN direct_children_mapping dcm ON ac.ancestor = dcm.parent
ORDER BY all_children_count DESC
) TO 'public/ancestor_children.parquet' (FORMAT 'parquet')
"""
con.execute(final_output_query)
end_time = time.perf_counter()
logging.info(f"Written ancestor_children to parquet file in {end_time - start_time:.6f} seconds.")
# Write a random sample of 10 rows with non-empty children to yaml file for inspection
start_time = time.perf_counter()
sample_query = """
SELECT ac.ancestor, dcm.direct_children, ac.all_children
FROM ancestor_children ac
LEFT JOIN direct_children_mapping dcm ON ac.ancestor = dcm.parent
WHERE array_length(ac.all_children) > 0
LIMIT 10
"""
sample_data = con.execute(sample_query).fetchall()
with open("public/ancestor_children.example.yaml", "w") as f:
yaml.safe_dump(sample_data, f, default_flow_style=False)
end_time = time.perf_counter()
logging.info(f"Written sample data to YAML file in {end_time - start_time:.6f} seconds.")
# Write a random sample of 10 rows with no children (direct or indirect) to yaml file
start_time = time.perf_counter()
no_children_query = """
SELECT ac.ancestor, dcm.direct_children, ac.all_children
FROM ancestor_children ac
LEFT JOIN direct_children_mapping dcm ON ac.ancestor = dcm.parent
WHERE array_length(ac.all_children) = 0
LIMIT 10
"""
no_children_data = con.execute(no_children_query).fetchall()
end_time = time.perf_counter()
logging.info(f"Fetched sample data of models with no children in {end_time - start_time:.6f} seconds.")
logging.info("Examples of models with no children (direct or indirect):")
for model in no_children_data:
logging.info(model)
# List top 10 ancestors with the most children and their number of direct children
start_time = time.perf_counter()
top_ancestors_query = """
SELECT
ac.ancestor,
array_length(ac.all_children) AS num_all_children,
array_length(dcm.direct_children) AS num_direct_children
FROM ancestor_children ac
LEFT JOIN direct_children_mapping dcm ON ac.ancestor = dcm.parent
ORDER BY num_all_children DESC
LIMIT 10
"""
top_ancestors = con.execute(top_ancestors_query).fetchall()
end_time = time.perf_counter()
logging.info(f"Listed top 10 ancestors with the most children in {end_time - start_time:.6f} seconds.")
logging.info("Top 10 ancestors with the most children and their number of direct children:")
for ancestor in top_ancestors:
logging.info(ancestor)
# Log the total processing time
total_execution_time = time.perf_counter() - total_start_time
print(f"Total processing time: {total_execution_time:.6f} seconds")
|