|
import duckdb |
|
import yaml |
|
import time |
|
import logging |
|
|
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
|
|
|
|
|
con = duckdb.connect() |
|
|
|
def execute_with_timing(query, description): |
|
"""Execute a DuckDB query and log the execution time.""" |
|
start_time = time.perf_counter() |
|
con.execute(query) |
|
end_time = time.perf_counter() |
|
logging.info(f"Completed {description} in {end_time - start_time:.6f} seconds.") |
|
|
|
|
|
total_start_time = time.perf_counter() |
|
|
|
|
|
load_parents_query = """ |
|
CREATE TABLE parents_in_memory AS |
|
SELECT * FROM parquet_scan('public/parents.parquet') |
|
""" |
|
execute_with_timing(load_parents_query, "Loaded parents.parquet into RAM") |
|
|
|
|
|
unique_id_query = """ |
|
CREATE TABLE unique_ids AS |
|
SELECT |
|
id, |
|
ROW_NUMBER() OVER () AS tmp_id |
|
FROM parents_in_memory |
|
""" |
|
execute_with_timing(unique_id_query, "Step 1: Created unique_ids table") |
|
|
|
|
|
unnest_query = """ |
|
CREATE TABLE unnested_models AS |
|
SELECT |
|
u.tmp_id AS child_tmp_id, -- Numerical ID for the child model |
|
UNNEST(p.base_models) AS base_model |
|
FROM parents_in_memory p |
|
JOIN unique_ids u ON p.id = u.id |
|
WHERE p.base_models IS NOT NULL -- Filter out models without base models |
|
""" |
|
execute_with_timing(unnest_query, "Step 2: Created unnested_models table") |
|
|
|
|
|
parent_level_query = """ |
|
CREATE TABLE parent_level AS |
|
SELECT |
|
u.child_tmp_id, -- Numerical ID for the child model |
|
b.tmp_id AS base_tmp_id -- Numerical ID for the base model (parent) |
|
FROM unnested_models u |
|
JOIN unique_ids b ON u.base_model = b.id |
|
""" |
|
execute_with_timing(parent_level_query, "Step 3: Created parent_level table") |
|
|
|
|
|
ancestor_children_query = """ |
|
CREATE TABLE ancestor_children AS |
|
WITH RECURSIVE ancestor_children_cte AS ( |
|
SELECT |
|
base_tmp_id AS ancestor_tmp_id, -- Start with direct parent as ancestor |
|
child_tmp_id AS child_tmp_id, -- Direct child |
|
1 AS depth -- Initialize depth counter |
|
FROM parent_level |
|
|
|
UNION ALL |
|
|
|
SELECT |
|
ac.ancestor_tmp_id, -- Propagate ancestor |
|
pl.child_tmp_id, -- Find new child in the chain |
|
ac.depth + 1 -- Increment depth counter |
|
FROM parent_level pl |
|
JOIN ancestor_children_cte ac ON pl.base_tmp_id = ac.child_tmp_id |
|
WHERE ac.depth < 20 -- Limit recursion to 10 levels |
|
) |
|
SELECT |
|
a.id AS ancestor, |
|
LIST(DISTINCT c.id) AS all_children |
|
FROM ancestor_children_cte ac |
|
JOIN unique_ids a ON ac.ancestor_tmp_id = a.tmp_id |
|
JOIN unique_ids c ON c.tmp_id = ac.child_tmp_id |
|
GROUP BY ancestor |
|
""" |
|
execute_with_timing(ancestor_children_query, "Step 4: Created ancestor_children table with string IDs") |
|
|
|
|
|
direct_children_mapping_query = """ |
|
CREATE TABLE direct_children_mapping AS |
|
SELECT |
|
p.id AS parent, |
|
LIST(DISTINCT u.id) AS direct_children |
|
FROM parents_in_memory p |
|
LEFT JOIN unnested_models um ON p.id = um.base_model |
|
LEFT JOIN unique_ids u ON um.child_tmp_id = u.tmp_id |
|
GROUP BY p.id |
|
""" |
|
execute_with_timing(direct_children_mapping_query, "Created direct_children_mapping table") |
|
|
|
|
|
start_time = time.perf_counter() |
|
final_output_query = """ |
|
COPY ( |
|
SELECT |
|
ac.ancestor as ancestor, |
|
dcm.direct_children as direct_children, |
|
ac.all_children as all_children, |
|
CAST(array_length(ac.all_children) AS INTEGER) as all_children_count, |
|
CAST(array_length(dcm.direct_children) AS INTEGER) as direct_children_count |
|
FROM ancestor_children ac |
|
LEFT JOIN direct_children_mapping dcm ON ac.ancestor = dcm.parent |
|
ORDER BY all_children_count DESC |
|
) TO 'public/ancestor_children.parquet' (FORMAT 'parquet') |
|
""" |
|
con.execute(final_output_query) |
|
end_time = time.perf_counter() |
|
logging.info(f"Written ancestor_children to parquet file in {end_time - start_time:.6f} seconds.") |
|
|
|
|
|
start_time = time.perf_counter() |
|
sample_query = """ |
|
SELECT ac.ancestor, dcm.direct_children, ac.all_children |
|
FROM ancestor_children ac |
|
LEFT JOIN direct_children_mapping dcm ON ac.ancestor = dcm.parent |
|
WHERE array_length(ac.all_children) > 0 |
|
LIMIT 10 |
|
""" |
|
sample_data = con.execute(sample_query).fetchall() |
|
with open("public/ancestor_children.example.yaml", "w") as f: |
|
yaml.safe_dump(sample_data, f, default_flow_style=False) |
|
end_time = time.perf_counter() |
|
logging.info(f"Written sample data to YAML file in {end_time - start_time:.6f} seconds.") |
|
|
|
|
|
start_time = time.perf_counter() |
|
no_children_query = """ |
|
SELECT ac.ancestor, dcm.direct_children, ac.all_children |
|
FROM ancestor_children ac |
|
LEFT JOIN direct_children_mapping dcm ON ac.ancestor = dcm.parent |
|
WHERE array_length(ac.all_children) = 0 |
|
LIMIT 10 |
|
""" |
|
no_children_data = con.execute(no_children_query).fetchall() |
|
end_time = time.perf_counter() |
|
logging.info(f"Fetched sample data of models with no children in {end_time - start_time:.6f} seconds.") |
|
logging.info("Examples of models with no children (direct or indirect):") |
|
for model in no_children_data: |
|
logging.info(model) |
|
|
|
|
|
start_time = time.perf_counter() |
|
top_ancestors_query = """ |
|
SELECT |
|
ac.ancestor, |
|
array_length(ac.all_children) AS num_all_children, |
|
array_length(dcm.direct_children) AS num_direct_children |
|
FROM ancestor_children ac |
|
LEFT JOIN direct_children_mapping dcm ON ac.ancestor = dcm.parent |
|
ORDER BY num_all_children DESC |
|
LIMIT 10 |
|
""" |
|
top_ancestors = con.execute(top_ancestors_query).fetchall() |
|
end_time = time.perf_counter() |
|
logging.info(f"Listed top 10 ancestors with the most children in {end_time - start_time:.6f} seconds.") |
|
logging.info("Top 10 ancestors with the most children and their number of direct children:") |
|
for ancestor in top_ancestors: |
|
logging.info(ancestor) |
|
|
|
|
|
total_execution_time = time.perf_counter() - total_start_time |
|
print(f"Total processing time: {total_execution_time:.6f} seconds") |
|
|