Xianbao QIAN committed on
Commit c80b461 • 1 Parent(s): 2ea1dfc

use client side rendering for the homepage.

@types/parquetjs-lite.d.ts ADDED
@@ -0,0 +1,7 @@
+declare module 'parquetjs-lite' {
+  export class ParquetReader {
+    static openFile(filePath: string): Promise<ParquetReader>;
+    getCursor(): any;
+    close(): Promise<void>;
+  }
+}
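This declaration only gives TypeScript minimal typings for the parquetjs-lite reader API, mirroring how the old server-side page drove the reader. A hedged sketch of that usage, with the helper name, file-path handling, and row shape as illustrative assumptions rather than part of this commit:

```typescript
import { ParquetReader } from 'parquetjs-lite';

// Hypothetical helper: read every row out of a local Parquet file.
async function readAllRows(filePath: string): Promise<any[]> {
  const reader = await ParquetReader.openFile(filePath); // the declared static factory
  const cursor = reader.getCursor();                     // cursor is untyped (`any`) in the declaration
  const rows: any[] = [];
  let row: any;
  while ((row = await cursor.next())) {
    rows.push(row);
  }
  await reader.close();
  return rows;
}
```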
Dockerfile CHANGED
@@ -34,6 +34,9 @@ RUN \
   addgroup --system --gid 1001 nodejs; \
   adduser --system --uid 1001 nextjs
 
+# Public files are served by nextjs.
+COPY --from=builder --link /app/public ./public
+
 # Automatically leverage output traces to reduce image size
 COPY --from=builder --link --chown=1001:1001 /app/.next/standalone ./
 COPY --from=builder --link --chown=1001:1001 /app/.next/static ./.next/static
bun.lockb CHANGED
Binary files a/bun.lockb and b/bun.lockb differ
 
package.json CHANGED
@@ -9,8 +9,8 @@
     "lint": "next lint"
   },
   "dependencies": {
+    "@duckdb/duckdb-wasm": "^1.29.0",
     "next": "14.2.15",
-    "parquetjs-lite": "^0.8.7",
     "react": "^18",
     "react-dom": "^18"
   },
{tables → public}/ancestor_children.example.yaml RENAMED
File without changes
{tables → public}/ancestor_children.parquet RENAMED
File without changes
{tables → public}/datasets.example.yaml RENAMED
File without changes
{tables → public}/datasets.parquet RENAMED
File without changes
{tables → public}/models.example.yaml RENAMED
File without changes
{tables → public}/models.parquet RENAMED
File without changes
{tables → public}/parents.example.yaml RENAMED
File without changes
{tables → public}/parents.parquet RENAMED
File without changes
{tables → public}/spaces.example.yaml RENAMED
File without changes
{tables → public}/spaces.parquet RENAMED
File without changes
python/0_download_files.py CHANGED
@@ -7,8 +7,8 @@ import random
 import argparse
 import yaml
 
-# Create the "tables" folders if they don't exist
-os.makedirs("tables", exist_ok=True)
+# Create the "public" folders if they don't exist
+os.makedirs("public", exist_ok=True)
 
 # URLs of the files to download
 urls = [
@@ -18,7 +18,7 @@ urls = [
 ]
 
 def download_file(url, overwrite=True):
-    filename = os.path.join("tables", url.split("/")[-1].split("?")[0])
+    filename = os.path.join("public", url.split("/")[-1].split("?")[0])
 
     if not overwrite and os.path.exists(filename):
         print(f"File already exists: {filename}. Skipping download.")
@@ -56,7 +56,7 @@ def main(overwrite):
 
     # Process each downloaded Parquet file
     for url in urls:
-        filename = os.path.join("tables", url.split("/")[-1].split("?")[0])
+        filename = os.path.join("public", url.split("/")[-1].split("?")[0])
         table_name = os.path.splitext(os.path.basename(filename))[0]
 
         # Connect to the Parquet file using DuckDB
@@ -86,8 +86,8 @@ def main(overwrite):
         yaml_content = yaml_content.rstrip()  # Remove trailing spaces
         yaml_content += "\n"
 
-        # Save the YAML content to a file in the "tables" folder
-        yaml_file = os.path.join("tables", f"{table_name}.example.yaml")
+        # Save the YAML content to a file in the "public" folder
+        yaml_file = os.path.join("public", f"{table_name}.example.yaml")
         with open(yaml_file, "w") as file:
             file.write(yaml_content)
 
python/1_parents.py CHANGED
@@ -23,7 +23,7 @@ query = """
         _id,
         id,
         extract_base_models(tags) AS base_models
-    FROM parquet_scan('tables/models.parquet')
+    FROM parquet_scan('public/models.parquet')
 """
 
 start_time = time.time()
@@ -32,7 +32,7 @@ start_time = time.time()
 con.execute(f"CREATE VIEW parent_models AS {query}")
 
 # Write the view to a parquet file
-con.execute("COPY parent_models TO 'tables/parents.parquet' (FORMAT 'parquet')")
+con.execute("COPY parent_models TO 'public/parents.parquet' (FORMAT 'parquet')")
 
 end_time = time.time()
 execution_time = end_time - start_time
@@ -55,5 +55,5 @@ result = con.execute("""
     LIMIT 10
 """).fetchall()
 
-with open("tables/parents.example.yaml", "w") as f:
+with open("public/parents.example.yaml", "w") as f:
     yaml.safe_dump(result, f, default_flow_style=False)
python/2_ancestors.py CHANGED
@@ -22,7 +22,7 @@ total_start_time = time.perf_counter()
 # Load parents.parquet into an in-memory table
 load_parents_query = """
 CREATE TABLE parents_in_memory AS
-SELECT * FROM parquet_scan('tables/parents.parquet')
+SELECT * FROM parquet_scan('public/parents.parquet')
 """
 execute_with_timing(load_parents_query, "Loaded parents.parquet into RAM")
 
@@ -115,7 +115,7 @@ final_output_query = """
     FROM ancestor_children ac
     LEFT JOIN direct_children_mapping dcm ON ac.ancestor = dcm.parent
     ORDER BY all_children_count DESC
-) TO 'tables/ancestor_children.parquet' (FORMAT 'parquet')
+) TO 'public/ancestor_children.parquet' (FORMAT 'parquet')
 """
 con.execute(final_output_query)
 end_time = time.perf_counter()
@@ -131,7 +131,7 @@ sample_query = """
     LIMIT 10
 """
 sample_data = con.execute(sample_query).fetchall()
-with open("tables/ancestor_children.example.yaml", "w") as f:
+with open("public/ancestor_children.example.yaml", "w") as f:
     yaml.safe_dump(sample_data, f, default_flow_style=False)
 end_time = time.perf_counter()
 logging.info(f"Written sample data to YAML file in {end_time - start_time:.6f} seconds.")
src/app/page.tsx CHANGED
@@ -1,6 +1,7 @@
-import fs from 'fs';
-import path from 'path';
-import { ParquetReader } from 'parquetjs-lite';
+'use client';
+
+import { useState, useEffect } from 'react';
+import * as duckdb from '@duckdb/duckdb-wasm';
 
 type ModelData = {
   ancestor: string;
@@ -10,41 +11,87 @@ type ModelData = {
   direct_children_count: number | null;
 };
 
-export default async function Home() {
-  try {
-    // Read the Parquet file using parquetjs-lite
-    const parquetFilePath = path.join(process.cwd(), 'tables', 'ancestor_children.parquet');
-    const reader = await ParquetReader.openFile(parquetFilePath);
-    const cursor = reader.getCursor();
-
-    // Read all rows and convert to a JavaScript array
-    const data: ModelData[] = [];
-    let row = null;
-    while (row = await cursor.next()) {
-      data.push({
-        ancestor: row.ancestor,
-        direct_children: row.direct_children,
-        all_children: row.all_children,
-        all_children_count: row.all_children_count,
-        direct_children_count: row.direct_children_count,
-      });
-      // console.log(row.all_children.list.length)
+export default function Home() {
+  const [allModels, setAllModels] = useState<ModelData[]>([]);
+  const [currentPage, setCurrentPage] = useState(1);
+  const [pageSize, setPageSize] = useState(100);
+  const [filterText, setFilterText] = useState('');
+
+  useEffect(() => {
+    async function fetchData() {
+      const JSDELIVR_BUNDLES = duckdb.getJsDelivrBundles();
+
+      // Select a bundle based on browser checks
+      const bundle = await duckdb.selectBundle(JSDELIVR_BUNDLES);
+
+      const worker_url = URL.createObjectURL(
+        new Blob([`importScripts("${bundle.mainWorker!}");`], { type: 'text/javascript' })
+      );
+
+      // Instantiate the asynchronous version of DuckDB-Wasm
+      const worker = new Worker(worker_url);
+      const logger = new duckdb.ConsoleLogger();
+      const db = new duckdb.AsyncDuckDB(logger, worker);
+      await db.instantiate(bundle.mainModule, bundle.pthreadWorker);
+
+      // Register the Parquet file using the URL
+      await db.registerFileURL(
+        'ancestor_children.parquet',
+        `${window.location.origin}/ancestor_children.parquet`,
+        duckdb.DuckDBDataProtocol.HTTP,
+        false
+      );
+
+      // Execute the SQL query using the registered Parquet file
+      const query = `
+        SELECT
+          ancestor,
+          direct_children,
+          all_children,
+          CAST(all_children_count AS INTEGER) AS all_children_count,
+          CAST(direct_children_count AS INTEGER) AS direct_children_count
+        FROM 'ancestor_children.parquet'
+      `;
+      const conn = await db.connect();
+      const result = await conn.query(query);
+
+      // Convert the result to a JavaScript array
+      const data: ModelData[] = result.toArray();
+
+      // Close the connection and terminate the worker
+      await conn.close();
+      await db.terminate();
+
+      setAllModels(data);
     }
-    await reader.close();
+    fetchData();
+  }, []);
 
-    // console.log('Data:', data);
+  const filteredModels = allModels.filter((model) =>
+    model.ancestor.toLowerCase().includes(filterText.toLowerCase())
+  );
 
-    // Get the top 10 models with the most all_children
-    const top10Models = data
-      .sort((a, b) => b.all_children.length - a.all_children.length)
-      .slice(0, 10);
+  const totalPages = Math.ceil(filteredModels.length / pageSize);
 
-    // console.log('Top 10 Models:', top10Models);
+  const paginatedModels = filteredModels.slice(
+    (currentPage - 1) * pageSize,
+    currentPage * pageSize
+  );
 
-    return (
-      <main className="container mx-auto py-8 text-gray-900 dark:text-white">
-        <h1 className="text-4xl font-bold mb-4">Top 10 Models with the Most All Children</h1>
-        {top10Models.length > 0 ? (
+  return (
+    <main className="container mx-auto py-8 text-gray-900 dark:text-white">
+      <h1 className="text-4xl font-bold mb-4">All Models</h1>
+      <div className="mb-4">
+        <input
+          type="text"
+          placeholder="Filter by model name"
+          value={filterText}
+          onChange={(e) => setFilterText(e.target.value)}
+          className="px-4 py-2 border border-gray-300 rounded-md"
+        />
+      </div>
+      {paginatedModels.length > 0 ? (
+        <>
          <table className="table-auto border-collapse w-full">
            <thead>
              <tr>
@@ -54,7 +101,7 @@ export default async function Home() {
              </tr>
            </thead>
            <tbody>
-              {top10Models.map((model, index) => (
+              {paginatedModels.map((model, index) => (
                <tr key={index} className="border-t border-gray-200 dark:border-gray-700">
                  <td className="px-4 py-2">{model.ancestor}</td>
                  <td className="px-4 py-2 text-right">{model.direct_children_count ?? 0}</td>
@@ -63,18 +110,26 @@ export default async function Home() {
              ))}
            </tbody>
          </table>
-        ) : (
-          <p>No data found.</p>
-        )}
-      </main>
-    );
-  } catch (error) {
-    console.error('Error:', error);
-    return (
-      <main className="container mx-auto py-8 text-gray-900 dark:text-white">
-        <h1 className="text-4xl font-bold mb-4">Error</h1>
-        <p>An error occurred while processing the data: {error.message}</p>
-      </main>
-    );
-  }
+          <div className="mt-4">
+            <button
+              onClick={() => setCurrentPage((prev) => Math.max(prev - 1, 1))}
+              disabled={currentPage === 1}
+              className="px-4 py-2 bg-blue-500 text-white rounded-md mr-2"
+            >
+              Previous
+            </button>
+            <button
+              onClick={() => setCurrentPage((prev) => Math.min(prev + 1, totalPages))}
+              disabled={currentPage === totalPages}
+              className="px-4 py-2 bg-blue-500 text-white rounded-md"
+            >
+              Next
+            </button>
+          </div>
+        </>
+      ) : (
+        <p>No data found.</p>
+      )}
+    </main>
+  );
 }
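A note on the design of the new page: it pulls the whole table into React state and then filters and paginates in JavaScript. Since DuckDB-Wasm is already running in the browser, the same work could instead be pushed into SQL. A minimal sketch under that assumption; the helper name, column list, and LIMIT/OFFSET paging are illustrative and not part of this commit:

```typescript
import * as duckdb from '@duckdb/duckdb-wasm';

// Sketch only: assumes `db` is the AsyncDuckDB instance created as in the page above,
// and that 'ancestor_children.parquet' was registered via registerFileURL().
async function fetchPage(
  db: duckdb.AsyncDuckDB,
  filter: string,
  page: number,
  pageSize: number
) {
  const conn = await db.connect();
  const escaped = filter.replace(/'/g, "''"); // naive quoting, enough for a sketch
  // Push filtering and paging into DuckDB instead of slicing a JS array.
  const result = await conn.query(`
    SELECT
      ancestor,
      CAST(direct_children_count AS INTEGER) AS direct_children_count,
      CAST(all_children_count AS INTEGER) AS all_children_count
    FROM 'ancestor_children.parquet'
    WHERE lower(ancestor) LIKE '%' || lower('${escaped}') || '%'
    ORDER BY all_children_count DESC
    LIMIT ${pageSize} OFFSET ${(page - 1) * pageSize}
  `);
  const rows = result.toArray(); // Arrow table -> plain row objects
  await conn.close();
  return rows;
}
```

The trade-off is holding only one page of rows in memory at the cost of re-querying on every filter or page change, whereas the committed version queries once and keeps everything client-side.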