mbudisic committed on
Commit 3b978ee · 1 Parent(s): 9e054dd

dev: simplifying datastore functions

app_simple_rag.py CHANGED
@@ -59,7 +59,7 @@ params = ApplicationParameters()
59
  async def fill_the_db():
60
  if state.datastore_manager.count_docs() == 0:
61
  data: List[Dict[str, Any]] = await load_json_files(params.filename)
62
- state.pointsLoaded = await state.datastore_manager.populate_database(
63
  raw_docs=data
64
  )
65
  await cl.Message(
 
59
  async def fill_the_db():
60
  if state.datastore_manager.count_docs() == 0:
61
  data: List[Dict[str, Any]] = await load_json_files(params.filename)
62
+ state.pointsLoaded = await state.datastore_manager.embed_chunks(
63
  raw_docs=data
64
  )
65
  await cl.Message(
notebooks/transcript_rag.ipynb CHANGED
@@ -86,7 +86,7 @@
86
  "name": "stderr",
87
  "output_type": "stream",
88
  "text": [
89
- "2025-05-30 20:08:35,183 - INFO - <module> - Loaded .env file\n"
90
  ]
91
  }
92
  ],
@@ -103,16 +103,22 @@
103
  "name": "stderr",
104
  "output_type": "stream",
105
  "text": [
106
- "2025-05-30 20:08:36,978 - INFO - print - Configuration parameters:\n",
107
- "2025-05-30 20:08:36,980 - INFO - print - eva_workflow_name: EVA_workflow\n",
108
- "2025-05-30 20:08:36,980 - INFO - print - eva_log_level: INFO\n",
109
- "2025-05-30 20:08:36,981 - INFO - print - transcript_glob: ./data/dev.json:./data/test.json\n",
110
- "2025-05-30 20:08:36,982 - INFO - print - embedding_model: mxbai-embed-large\n",
111
- "2025-05-30 20:08:36,983 - INFO - print - embedding_api: ModelAPI.OLLAMA\n",
112
- "2025-05-30 20:08:36,984 - INFO - print - llm_api: ModelAPI.OLLAMA\n",
113
- "2025-05-30 20:08:36,985 - INFO - print - max_research_loops: 2\n",
114
- "2025-05-30 20:08:36,986 - INFO - print - llm_tool_model: deepseek-r1:8b\n",
115
- "2025-05-30 20:08:36,987 - INFO - print - n_context_docs: 3\n"
116
  ]
117
  }
118
  ],
@@ -129,13 +135,25 @@
129
  "name": "stderr",
130
  "output_type": "stream",
131
  "text": [
132
- "2025-05-30 20:08:37,093 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
133
- "2025-05-30 20:08:37,118 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n"
134
  ]
135
  }
136
  ],
137
  "source": [
138
- "datastore:DatastoreManager = startup(callback_on_loading_complete=lambda _: logging.warning(\"Loading complete.\")) "
139
  ]
140
  },
141
  {
@@ -147,39 +165,39 @@
147
  "name": "stderr",
148
  "output_type": "stream",
149
  "text": [
150
- "2025-05-30 20:08:38,120 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
151
- "2025-05-30 20:08:39,173 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
152
- "2025-05-30 20:08:39,862 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
153
- "2025-05-30 20:08:40,765 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
154
- "2025-05-30 20:08:41,275 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
155
- "2025-05-30 20:08:41,539 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
156
- "2025-05-30 20:08:42,447 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
157
- "2025-05-30 20:08:43,415 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
158
- "2025-05-30 20:08:44,236 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
159
- "2025-05-30 20:08:45,746 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
160
- "2025-05-30 20:08:45,770 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
161
- "2025-05-30 20:08:46,832 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
162
- "2025-05-30 20:08:47,754 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
163
- "2025-05-30 20:08:48,859 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
164
- "2025-05-30 20:08:49,732 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
165
- "2025-05-30 20:08:50,740 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
166
- "2025-05-30 20:08:51,604 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
167
- "2025-05-30 20:08:52,113 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
168
- "2025-05-30 20:08:53,060 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
169
- "2025-05-30 20:08:53,895 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
170
- "2025-05-30 20:08:54,734 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
171
- "2025-05-30 20:08:55,707 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
172
- "2025-05-30 20:08:56,114 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
173
- "2025-05-30 20:08:56,447 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
174
- "2025-05-30 20:08:56,765 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
175
- "2025-05-30 20:08:56,878 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
176
- "2025-05-30 20:08:57,200 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
177
- "2025-05-30 20:08:57,438 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
178
- "2025-05-30 20:08:57,750 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
179
- "2025-05-30 20:08:58,116 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
180
- "2025-05-30 20:08:58,713 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
181
- "2025-05-30 20:08:59,059 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
182
- "2025-05-30 20:08:59,110 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n"
183
  ]
184
  },
185
  {
@@ -203,7 +221,7 @@
203
  "metadata": {},
204
  "outputs": [],
205
  "source": [
206
- "chain = retrieve_videos(datastore)"
207
  ]
208
  },
209
  {
@@ -215,8 +233,8 @@
215
  "name": "stderr",
216
  "output_type": "stream",
217
  "text": [
218
- "2025-05-30 20:08:59,268 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
219
- "2025-05-30 20:09:11,924 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/chat \"HTTP/1.1 200 OK\"\n"
220
  ]
221
  }
222
  ],
@@ -236,29 +254,20 @@
236
  "==================================\u001b[1m Ai Message \u001b[0m==================================\n",
237
  "\n",
238
  "<think>\n",
239
- "Okay, I need to figure out what a layer is based on the provided transcript. Let me go through the context step by step.\n",
240
- "\n",
241
- "First, looking at the first document with video ID 19172. The title says \"Understand layers\" and the description mentions that layers are the building blocks of any image in Photoshop CC. So, layers must be fundamental components.\n",
242
  "\n",
243
- "In the page content, it compares layers to separate flat prints of glass stacked on top of each other. Each layer has different content. That makes me think of layers as separate elements or parts of an image that can be edited individually.\n",
244
  "\n",
245
- "There's also a mention of the Layers panel where you select and work with layers. It shows 4 layers, each with distinct content. The Eye icon can toggle visibility, so layers can be shown or hidden. This suggests that layers are like different layers of content that can be managed separately.\n",
246
  "\n",
247
- "Looking at another document with video ID 4103, it says layers allow isolation of individual pieces of content within a composition. For example, text or brush strokes can be separated into their own layers for detailed editing without affecting other parts.\n",
248
  "\n",
249
- "Another mention from the same video talks about using layers for adding highlights or shadows by isolating small elements, which means each part can be worked on independently.\n",
250
- "\n",
251
- "Putting this together, a layer is like a separate sheet in an image that holds different elements. Each layer allows you to edit, move, or manipulate specific parts without affecting others. The Layers panel helps manage and control these layers for better organization and editing flexibility.\n",
252
  "</think>\n",
253
  "\n",
254
- "🎨 **What is a Layer?** \n",
255
- "Layers are like separate sheets in an image, each holding distinct content. Think of them as individual elements stacked on top of each other, allowing you to edit or manipulate specific parts without affecting others. \n",
256
- "\n",
257
- "For example: \n",
258
- "- Each layer can contain text, images, or design elements. \n",
259
- "- You can toggle their visibility using the Eye icon. \n",
260
  "\n",
261
- "📌 **Timestamp**: 0.47 - 3.41 minutes (video ID 19172)\n",
262
  "**REFERENCES**\n",
263
  "[\n",
264
  " {\n",
@@ -295,9 +304,9 @@
295
  {
296
  "data": {
297
  "text/plain": [
298
- "[Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'data/test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[0.47, 3.41], [3.81, 9.13], [9.309999, 15.01], [15.299999, 20.57], [20.88, 23.3], [23.83, 27.93], [29.38, 32.79], [32.96, 33.92], [34.43, 40.21], [41.91, 45.37], [45.88, 49.01], [49.54, 55.130001], [55.72, 58.49], [58.72, 62.14]], 'start': 0.47, 'stop': 62.14, '_id': 63, '_collection_name': 'dc0cf104-0069-4983-8a12-8d3de4132745'}, page_content=\"Layers are the building blocks of any image in Photoshop CC. So, it's important to understand, what layers are and why to use them - which we'll cover in this video. If you're following along, open this layered image from the downloadable practice files for this tutorial. You might think of layers like separate flat pints of glass, stacked one on top of the other. Each layer contains separate pieces of content. To get a sense of how layers are constructed, let's take a look at this Layers panel. I've closed my other panels, so that we can focus on the Layers panel. But you can skip that. By the way: If your Layers panel isn't showing, go up to the Window menu and choose Layers from there. The Layers panel is where you go to select and work with layers. In this image there are 4 layers, each with separate content. If you click the Eye icon to the left of a layer, you can toggle the visibility of that layer off and on. So, I'm going to turn off the visibility of the tailor layer. And keep your eye on the image, so you can see what's on that layer.\"),\n",
299
- " Document(metadata={'video_id': 4103, 'title': 'Use layers for ultimate flexibility and control', 'desc': 'Learn how to use layers to create designs, fix photos, or build collages.', 'length': '00:05:06.55', 'group': 'data/dev.json', 'source': 'https://videos-tv.adobe.com/2014-09-04/96f51d8958ae31b37cb5a15cbdc21744.mp4', 'speech_start_stop_times': [[0.82, 5.88], [6.51, 18.389999], [19.219999, 30.13]], 'start': 0.82, 'stop': 30.13, '_id': 0, '_collection_name': 'dc0cf104-0069-4983-8a12-8d3de4132745'}, page_content=\"As a new Photoshop user, you're going to find that the layers panel is an incredibly powerful tool. The layers panel gives you the ability to isolate individual pieces of content away from the rest of the composition giving you the ability to work on individual elements within the overall document. Now, this can be used for something as literal as some type in this case, or something as subtle as a small brush stroke to add a highlight or shadow to an image.\"),\n",
300
- " Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'data/test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[280.4, 284.58]], 'start': 280.4, 'stop': 284.58, '_id': 66, '_collection_name': 'dc0cf104-0069-4983-8a12-8d3de4132745'}, page_content=\"There's lots more to learn about layers, so stay tuned for the rest of this tutorial.\")]"
301
  ]
302
  },
303
  "execution_count": 13,
 
86
  "name": "stderr",
87
  "output_type": "stream",
88
  "text": [
89
+ "2025-05-30 23:53:26,776 - INFO - <module> - Loaded .env file\n"
90
  ]
91
  }
92
  ],
 
103
  "name": "stderr",
104
  "output_type": "stream",
105
  "text": [
106
+ "2025-05-30 23:53:28,613 - INFO - print - Configuration parameters:\n"
107
+ ]
108
+ },
109
+ {
110
+ "name": "stderr",
111
+ "output_type": "stream",
112
+ "text": [
113
+ "2025-05-30 23:53:28,614 - INFO - print - eva_workflow_name: EVA_workflow\n",
114
+ "2025-05-30 23:53:28,615 - INFO - print - eva_log_level: INFO\n",
115
+ "2025-05-30 23:53:28,615 - INFO - print - transcript_glob: ./data/dev.json:./data/test.json\n",
116
+ "2025-05-30 23:53:28,616 - INFO - print - embedding_model: mxbai-embed-large\n",
117
+ "2025-05-30 23:53:28,617 - INFO - print - embedding_api: ModelAPI.OLLAMA\n",
118
+ "2025-05-30 23:53:28,617 - INFO - print - llm_api: ModelAPI.OLLAMA\n",
119
+ "2025-05-30 23:53:28,618 - INFO - print - max_research_loops: 2\n",
120
+ "2025-05-30 23:53:28,619 - INFO - print - llm_tool_model: qwen3:4b\n",
121
+ "2025-05-30 23:53:28,620 - INFO - print - n_context_docs: 3\n"
122
  ]
123
  }
124
  ],
 
135
  "name": "stderr",
136
  "output_type": "stream",
137
  "text": [
138
+ "2025-05-30 23:53:29,748 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
139
+ "2025-05-30 23:53:29,781 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n"
140
  ]
141
+ },
142
+ {
143
+ "data": {
144
+ "text/plain": [
145
+ "<Task pending name='Task-1' coro=<DatastoreManager.from_json_globs() running at /home/mbudisic/Documents/PsTuts-RAG/pstuts_rag/pstuts_rag/datastore.py:105>>"
146
+ ]
147
+ },
148
+ "execution_count": 8,
149
+ "metadata": {},
150
+ "output_type": "execute_result"
151
  }
152
  ],
153
  "source": [
154
+ "datastore = DatastoreManager()\n",
155
+ "datastore.add_completion_callback(lambda _: logging.warning(\"Loading complete.\"))\n",
156
+ "asyncio.create_task(datastore.from_json_globs(Configuration().transcript_glob))"
157
  ]
158
  },
159
  {
 
165
  "name": "stderr",
166
  "output_type": "stream",
167
  "text": [
168
+ "2025-05-30 23:53:30,993 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
169
+ "2025-05-30 23:53:31,531 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
170
+ "2025-05-30 23:53:32,425 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
171
+ "2025-05-30 23:53:33,118 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
172
+ "2025-05-30 23:53:34,012 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
173
+ "2025-05-30 23:53:34,277 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
174
+ "2025-05-30 23:53:35,156 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
175
+ "2025-05-30 23:53:36,044 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
176
+ "2025-05-30 23:53:36,712 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
177
+ "2025-05-30 23:53:37,534 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
178
+ "2025-05-30 23:53:38,511 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
179
+ "2025-05-30 23:53:39,309 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
180
+ "2025-05-30 23:53:40,211 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
181
+ "2025-05-30 23:53:41,136 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
182
+ "2025-05-30 23:53:42,084 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
183
+ "2025-05-30 23:53:42,790 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
184
+ "2025-05-30 23:53:43,608 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
185
+ "2025-05-30 23:53:44,584 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
186
+ "2025-05-30 23:53:45,088 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
187
+ "2025-05-30 23:53:45,929 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
188
+ "2025-05-30 23:53:46,773 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
189
+ "2025-05-30 23:53:47,723 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
190
+ "2025-05-30 23:53:48,154 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
191
+ "2025-05-30 23:53:48,468 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
192
+ "2025-05-30 23:53:48,700 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
193
+ "2025-05-30 23:53:48,812 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
194
+ "2025-05-30 23:53:49,119 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
195
+ "2025-05-30 23:53:49,471 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
196
+ "2025-05-30 23:53:49,803 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
197
+ "2025-05-30 23:53:50,123 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
198
+ "2025-05-30 23:53:50,446 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
199
+ "2025-05-30 23:53:50,773 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
200
+ "2025-05-30 23:53:51,117 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n"
201
  ]
202
  },
203
  {
 
221
  "metadata": {},
222
  "outputs": [],
223
  "source": [
224
+ "chain = create_transcript_rag_chain(datastore)"
225
  ]
226
  },
227
  {
 
233
  "name": "stderr",
234
  "output_type": "stream",
235
  "text": [
236
+ "2025-05-30 23:53:51,268 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
237
+ "2025-05-30 23:54:00,302 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/chat \"HTTP/1.1 200 OK\"\n"
238
  ]
239
  }
240
  ],
 
254
  "==================================\u001b[1m Ai Message \u001b[0m==================================\n",
255
  "\n",
256
  "<think>\n",
257
+ "Okay, the user is asking, \"What is a layer?\" and I need to answer based only on the provided transcript excerpts.\n",
258
  "\n",
259
+ "First, I'll look through the context to find relevant information. The first document's page_content mentions that layers are the building blocks of any image in Photoshop CC. It also compares layers to separate flat pints of glass stacked one on top of another. Each layer has separate content. There's also mention of the Layers panel where you can select and work with layers, and that clicking the Eye icon toggles the visibility of a layer.\n",
260
  "\n",
261
+ "Another document talks about the layers panel being a powerful tool to isolate individual elements, allowing work on individual parts of the document. But that's more about using layers rather than defining what a layer is.\n",
262
  "\n",
263
+ "So, the key points are: layers are building blocks, they're like stacked glass with separate content, and the Layers panel is used to manage them. The timestamp for the first document's explanation is from 3.81 to 9.13, which is 0:03:41 to 0:09:13 in minute:seconds format.\n",
264
  "\n",
265
+ "I need to present this information clearly, using the timestamp. Make sure not to add any extra info not in the transcript. The answer should be concise and directly answer the question without assumptions.\n",
266
  "</think>\n",
267
  "\n",
268
+ "A **layer** in Photoshop is the building block of any image, allowing you to isolate and manage separate elements within a document. The transcript explains layers as \"separate flat pints of glass, stacked one on top of the other,\" where each layer contains its own content. You can toggle a layer’s visibility using the Eye icon in the **Layers panel**, which lets you control what’s visible in the image. \n",
269
  "\n",
270
+ "**Timestamp**: 0:03:41–0:09:13 🎨✨\n",
271
  "**REFERENCES**\n",
272
  "[\n",
273
  " {\n",
 
304
  {
305
  "data": {
306
  "text/plain": [
307
+ "[Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'data/test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[0.47, 3.41], [3.81, 9.13], [9.309999, 15.01], [15.299999, 20.57], [20.88, 23.3], [23.83, 27.93], [29.38, 32.79], [32.96, 33.92], [34.43, 40.21], [41.91, 45.37], [45.88, 49.01], [49.54, 55.130001], [55.72, 58.49], [58.72, 62.14]], 'start': 0.47, 'stop': 62.14, '_id': 63, '_collection_name': '30065fde-3368-41c8-9092-79b8646d229f'}, page_content=\"Layers are the building blocks of any image in Photoshop CC. So, it's important to understand, what layers are and why to use them - which we'll cover in this video. If you're following along, open this layered image from the downloadable practice files for this tutorial. You might think of layers like separate flat pints of glass, stacked one on top of the other. Each layer contains separate pieces of content. To get a sense of how layers are constructed, let's take a look at this Layers panel. I've closed my other panels, so that we can focus on the Layers panel. But you can skip that. By the way: If your Layers panel isn't showing, go up to the Window menu and choose Layers from there. The Layers panel is where you go to select and work with layers. In this image there are 4 layers, each with separate content. If you click the Eye icon to the left of a layer, you can toggle the visibility of that layer off and on. So, I'm going to turn off the visibility of the tailor layer. And keep your eye on the image, so you can see what's on that layer.\"),\n",
308
+ " Document(metadata={'video_id': 4103, 'title': 'Use layers for ultimate flexibility and control', 'desc': 'Learn how to use layers to create designs, fix photos, or build collages.', 'length': '00:05:06.55', 'group': 'data/dev.json', 'source': 'https://videos-tv.adobe.com/2014-09-04/96f51d8958ae31b37cb5a15cbdc21744.mp4', 'speech_start_stop_times': [[0.82, 5.88], [6.51, 18.389999], [19.219999, 30.13]], 'start': 0.82, 'stop': 30.13, '_id': 0, '_collection_name': '30065fde-3368-41c8-9092-79b8646d229f'}, page_content=\"As a new Photoshop user, you're going to find that the layers panel is an incredibly powerful tool. The layers panel gives you the ability to isolate individual pieces of content away from the rest of the composition giving you the ability to work on individual elements within the overall document. Now, this can be used for something as literal as some type in this case, or something as subtle as a small brush stroke to add a highlight or shadow to an image.\"),\n",
309
+ " Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'data/test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[280.4, 284.58]], 'start': 280.4, 'stop': 284.58, '_id': 66, '_collection_name': '30065fde-3368-41c8-9092-79b8646d229f'}, page_content=\"There's lots more to learn about layers, so stay tuned for the rest of this tutorial.\")]"
310
  ]
311
  },
312
  "execution_count": 13,
pstuts_rag/pstuts_rag/configuration.py CHANGED
@@ -70,6 +70,8 @@ class Configuration:
70
  if config and "configurable" in config
71
  else {}
72
  )
73
  values: dict[str, Any] = {
74
  f.name: os.environ.get(f.name.upper(), configurable.get(f.name))
75
  for f in fields(cls)
 
70
  if config and "configurable" in config
71
  else {}
72
  )
73
+ # Map each dataclass field to environment variables or configurable values
74
+ # Priority: environment variables > configurable dict values > field defaults
75
  values: dict[str, Any] = {
76
  f.name: os.environ.get(f.name.upper(), configurable.get(f.name))
77
  for f in fields(cls)
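The added comment documents the resolution order used in `from_runnable_config`: an environment variable named after the field (upper-cased) wins, then the value from the `configurable` dict, then the dataclass default. A standalone sketch of that pattern, using an illustrative two-field config rather than the project's full `Configuration` class:

```python
import os
from dataclasses import dataclass, fields
from typing import Any, Optional


@dataclass
class MiniConfig:
    # Illustrative subset; the real Configuration defines many more fields.
    transcript_glob: str = "./data/dev.json:./data/test.json"
    eva_log_level: str = "INFO"

    @classmethod
    def from_configurable(cls, configurable: Optional[dict] = None) -> "MiniConfig":
        configurable = configurable or {}
        # Priority: environment variables > configurable dict values > field defaults
        values: dict[str, Any] = {
            f.name: os.environ.get(f.name.upper(), configurable.get(f.name))
            for f in fields(cls)
        }
        # Keep only populated entries so dataclass defaults fill the rest.
        return cls(**{k: v for k, v in values.items() if v is not None})


# EVA_LOG_LEVEL=DEBUG in the environment would override both the dict and the default.
cfg = MiniConfig.from_configurable({"eva_log_level": "WARNING"})
print(cfg)
```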
pstuts_rag/pstuts_rag/datastore.py CHANGED
@@ -24,220 +24,7 @@ from qdrant_client import QdrantClient
24
  from qdrant_client.http.models import Distance, VectorParams
25
  from qdrant_client.models import PointStruct
26
 
27
- from pstuts_rag.utils import EmbeddingsAPISelector
28
-
29
-
30
- def batch(iterable: List[Any], size: int = 16) -> Iterator[List[Any]]:
31
- """
32
- Batch an iterable into chunks of specified size.
33
-
34
- Yields successive chunks from the input iterable, each containing
35
- at most 'size' elements. Useful for processing large collections
36
- in manageable batches to avoid memory issues or API rate limits.
37
-
38
- Args:
39
- iterable (List[Any]): The input list to be batched
40
- size (int, optional): Maximum size of each batch. Defaults to 16.
41
-
42
- Yields:
43
- List[Any]: Successive batches of the input iterable
44
-
45
- Example:
46
- >>> list(batch([1, 2, 3, 4, 5], 2))
47
- [[1, 2], [3, 4], [5]]
48
- """
49
- for i in range(0, len(iterable), size):
50
- yield iterable[i : i + size]
51
-
52
-
53
- class VideoTranscriptBulkLoader(BaseLoader):
54
- """
55
- Loads video transcripts as bulk documents for document processing pipelines.
56
-
57
- Each video becomes a single document with all transcript sentences concatenated.
58
- Useful for semantic search across entire video content.
59
-
60
- Inherits from LangChain's BaseLoader for compatibility with document processing chains.
61
-
62
- Attributes:
63
- json_payload (List[Dict]): List of video dictionaries containing transcript data
64
- """
65
-
66
- def __init__(self, json_payload: List[Dict]):
67
- """
68
- Initialize the bulk loader with video transcript data.
69
-
70
- Args:
71
- json_payload (List[Dict]): List of video dictionaries, each containing:
72
- - transcripts: List of transcript segments
73
- - qa: Q&A data (optional)
74
- - url: Video URL
75
- - other metadata fields
76
- """
77
-
78
- self.json_payload = json_payload
79
-
80
- def lazy_load(self) -> Iterator[Document]:
81
- """
82
- Lazy loader that yields Document objects with concatenated transcripts.
83
-
84
- Creates one Document per video with all transcript sentences joined by newlines.
85
- Metadata includes all video fields except 'transcripts' and 'qa'.
86
- The 'url' field is renamed to 'source' for LangChain compatibility.
87
-
88
- Yields:
89
- Document: LangChain Document with page_content as concatenated transcript
90
- and metadata containing video information
91
- """
92
-
93
- for video in self.json_payload:
94
- metadata = dict(video)
95
- metadata.pop("transcripts", None)
96
- metadata.pop("qa", None)
97
- # Rename 'url' key to 'source' in metadata if it exists
98
- if "url" in metadata:
99
- metadata["source"] = metadata.pop("url")
100
- yield Document(
101
- page_content="\n".join(
102
- t["sent"] for t in video["transcripts"]
103
- ),
104
- metadata=metadata,
105
- )
106
-
107
-
108
- class VideoTranscriptChunkLoader(BaseLoader):
109
- """
110
- Loads video transcripts as individual chunk documents for fine-grained processing.
111
-
112
- Each transcript segment becomes a separate document with timing information.
113
- Useful for precise timestamp-based retrieval and time-sensitive queries.
114
-
115
- Inherits from LangChain's BaseLoader for compatibility with document processing chains.
116
-
117
- Attributes:
118
- json_payload (List[Dict]): List of video dictionaries containing transcript data
119
- """
120
-
121
- def __init__(self, json_payload: List[Dict]):
122
- """
123
- Initialize the chunk loader with video transcript data.
124
-
125
- Args:
126
- json_payload (List[Dict]): List of video dictionaries, each containing:
127
- - transcripts: List of transcript segments with timing
128
- - qa: Q&A data (optional)
129
- - url: Video URL
130
- - other metadata fields
131
- """
132
-
133
- self.json_payload = json_payload
134
-
135
- def lazy_load(self) -> Iterator[Document]:
136
- """
137
- Lazy loader that yields individual Document objects for each transcript segment.
138
-
139
- Creates one Document per transcript segment with timing metadata.
140
- Each document contains a single transcript sentence with precise start/end times.
141
- The 'url' field is renamed to 'source' for LangChain compatibility.
142
-
143
- Yields:
144
- Document: LangChain Document with page_content as single transcript sentence
145
- and metadata containing video info plus time_start and time_end
146
- """
147
-
148
- for video in self.json_payload:
149
- metadata = dict(video)
150
- transcripts = metadata.pop("transcripts", None)
151
- metadata.pop("qa", None)
152
- # Rename 'url' key to 'source' in metadata if it exists
153
- if "url" in metadata:
154
- metadata["source"] = metadata.pop("url")
155
- for transcript in transcripts:
156
- yield Document(
157
- page_content=transcript["sent"],
158
- metadata=metadata
159
- | {
160
- "time_start": transcript["begin"],
161
- "time_end": transcript["end"],
162
- },
163
- )
164
-
165
-
166
- async def chunk_transcripts(
167
- json_transcripts: List[Dict[str, Any]],
168
- semantic_chunker_embedding_model: Embeddings = OpenAIEmbeddings(
169
- model="text-embedding-3-small"
170
- ),
171
- ) -> List[Document]:
172
- """
173
- Load and process video transcripts into semantically chunked documents.
174
-
175
- This function takes a list of transcript dictionaries, loads them as both full
176
- transcripts and individual chunks, then applies semantic chunking. It also
177
- enriches each semantic chunk with timestamp metadata from the original verbatim chunks.
178
-
179
- Args:
180
- json_transcripts: List of dictionaries containing video transcript data
181
- embeddings: OpenAI embeddings model to use for semantic chunking
182
-
183
- Returns:
184
- List of semantically chunked Document objects with enhanced metadata
185
- """
186
-
187
- docs_full_transcript: List[Document] = VideoTranscriptBulkLoader(
188
- json_payload=json_transcripts
189
- ).load()
190
- docs_chunks_verbatim: List[Document] = VideoTranscriptChunkLoader(
191
- json_payload=json_transcripts
192
- ).load()
193
-
194
- # semantically split the combined transcript
195
- text_splitter = SemanticChunker(semantic_chunker_embedding_model)
196
- docs_group = await asyncio.gather(
197
- *[
198
- text_splitter.atransform_documents(d)
199
- for d in batch(docs_full_transcript, size=2)
200
- ]
201
- )
202
- # Flatten the nested list of documents
203
- docs_chunks_semantic: List[Document] = []
204
- for group in docs_group:
205
- docs_chunks_semantic.extend(group)
206
-
207
- # Create a lookup dictionary for faster access
208
- video_id_to_chunks: Dict[int, List[Document]] = {}
209
- for chunk in docs_chunks_verbatim:
210
- video_id: int = chunk.metadata["video_id"]
211
- if video_id not in video_id_to_chunks:
212
- video_id_to_chunks[video_id] = []
213
- video_id_to_chunks[video_id].append(chunk)
214
-
215
- for chunk in docs_chunks_semantic:
216
- video_id = chunk.metadata["video_id"]
217
- # Only check chunks from the same video
218
- potential_subchunks = video_id_to_chunks.get(video_id, [])
219
- subchunks = [
220
- c
221
- for c in potential_subchunks
222
- if c.page_content in chunk.page_content
223
- ]
224
-
225
- times = [
226
- (t.metadata["time_start"], t.metadata["time_end"])
227
- for t in subchunks
228
- ]
229
- chunk.metadata["speech_start_stop_times"] = times
230
-
231
- if times: # Avoid IndexError if times is empty
232
- chunk.metadata["start"], chunk.metadata["stop"] = (
233
- times[0][0],
234
- times[-1][-1],
235
- )
236
- else:
237
- chunk.metadata["start"], chunk.metadata["stop"] = None, None
238
-
239
- docs_chunks_semantic[0].metadata.keys()
240
- return docs_chunks_semantic
241
 
242
 
243
  class DatastoreManager:
@@ -315,17 +102,45 @@ class DatastoreManager:
315
 
316
  self.docs = []
317
 
318
- async def from_json_globs(self, globs: List[str]) -> int:
319
320
  logging.debug("Starting to load files.")
321
- data = await load_json_files(globs)
322
- logging.debug("Received %d JSON files.", len(data))
323
- count = await self.populate_database(data)
324
  logging.debug("Uploaded %d records.", count)
325
 
326
- return count
327
 
328
- async def populate_database(self, raw_docs: List[Dict[str, Any]]) -> int:
329
  """
330
  Populate the vector database with processed video transcript documents.
331
 
@@ -345,12 +160,6 @@ class DatastoreManager:
345
  Exception: If embedding generation or database upload fails
346
  """
347
 
348
- # perform chunking
349
- self.docs: List[Document] = await chunk_transcripts(
350
- json_transcripts=raw_docs,
351
- semantic_chunker_embedding_model=self.embeddings,
352
- )
353
-
354
  # perform embedding
355
 
356
  vector_batches = await asyncio.gather(
@@ -358,7 +167,7 @@ class DatastoreManager:
358
  self.embeddings.aembed_documents(
359
  [c.page_content for c in chunk_batch]
360
  )
361
- for chunk_batch in batch(self.docs, 8)
362
  ]
363
  )
364
  vectors = []
@@ -375,7 +184,7 @@ class DatastoreManager:
375
  "metadata": doc.metadata,
376
  },
377
  )
378
- for id, vector, doc in zip(ids, vectors, self.docs)
379
  ]
380
 
381
  # upload qdrant payload
@@ -384,14 +193,6 @@ class DatastoreManager:
384
  points=points,
385
  )
386
 
387
- self.loading_complete.set()
388
- # Execute callbacks (both sync and async)
389
- for callback in self._completion_callbacks:
390
- if asyncio.iscoroutinefunction(callback):
391
- await callback()
392
- else:
393
- callback()
394
-
395
  return len(points)
396
 
397
  def count_docs(self) -> int:
@@ -479,7 +280,7 @@ class DatastoreManager:
479
  return False
480
 
481
 
482
- async def load_single_json(filepath: str):
483
  """
484
  Asynchronously load and parse a single JSON file containing video data.
485
 
@@ -502,53 +303,164 @@ async def load_single_json(filepath: str):
502
  Note:
503
  Uses async file I/O for better performance when loading multiple files
504
  """
505
- my_path = Path(filepath)
506
 
507
- async with aiofiles.open(my_path, mode="r", encoding="utf-8") as f:
508
  content = await f.read()
509
  payload = json.loads(content)
510
  for entry in payload:
511
- entry.update({"group": str(my_path)})
512
  return payload
513
 
514
 
515
- async def load_json_files(glob_list: List[str]):
516
  """
517
- Asynchronously load and parse multiple JSON files matching given patterns.
518
 
519
- Uses glob patterns to find files and loads them concurrently for optimal performance.
520
- All results are flattened into a single list for unified processing. This function
521
- is designed to handle large datasets efficiently by leveraging async I/O.
522
 
523
  Args:
524
- glob_list (List[str]): List of glob patterns to match JSON files.
525
- Supports standard glob syntax including recursive
526
- patterns with ** for subdirectory traversal.
527
 
528
  Returns:
529
- List[Dict]: Flattened list of all video dictionaries from matched files,
530
- with each video containing its source group information
531
-
532
- Raises:
533
- FileNotFoundError: If any matched file doesn't exist during loading
534
- json.JSONDecodeError: If any file content is not valid JSON format
535
- PermissionError: If any file cannot be read due to permissions
536
-
537
- Example:
538
- >>> patterns = ["data/*.json", "archive/**/*.json"]
539
- >>> videos = await load_json_files(patterns)
540
- >>> len(videos) # Total videos from all matched files
541
  """
542
- logging.debug("Loading from %d globs:", len(glob_list))
543
 
544
- files = []
545
- for globstring in glob_list:
546
- logging.debug("Loading glob: %s", globstring)
547
- new_files = glob.glob(globstring, recursive=True)
548
- logging.debug("New files: %d", len(new_files))
549
- files.extend(new_files)
550
- logging.debug("Total files: %d", len(files))
551
 
552
- tasks = [load_single_json(f) for f in files]
553
- results = await asyncio.gather(*tasks)
554
- return [item for sublist in results for item in sublist] # flatten
24
  from qdrant_client.http.models import Distance, VectorParams
25
  from qdrant_client.models import PointStruct
26
 
27
+ from pstuts_rag.utils import EmbeddingsAPISelector, flatten, batch
28
 
29
 
30
  class DatastoreManager:
 
102
 
103
  self.docs = []
104
 
105
+ async def from_json_globs(self, globs: List[str] | str) -> int:
106
+ """
107
+ Populate the vector database with processed video transcript documents,
108
+ retrieved from JSON file paths.
109
+
110
+ This method performs the complete pipeline:
111
+ - loading JSON transcripts
112
+ - semantic chunking with timestamp preservation
113
+ - embedding and uploading the chunks, then firing completion callbacks
114
 
115
+
116
+ """
117
  logging.debug("Starting to load files.")
118
+ files = globs_to_paths(globs)
119
+
120
+ tasks = [load_json_file(f) for f in files]
121
+ results = await asyncio.gather(*tasks)
122
+
123
+ json_transcripts = list(flatten(results))
124
+ logging.debug("Received %d JSON files.", len(json_transcripts))
125
+
126
+ # perform chunking
127
+ self.docs: List[Document] = await chunk_transcripts(
128
+ json_transcripts=json_transcripts,
129
+ semantic_chunker_embedding_model=self.embeddings,
130
+ )
131
+
132
+ count = await self.embed_chunks(self.docs)
133
  logging.debug("Uploaded %d records.", count)
134
 
135
+ self.loading_complete.set()
136
+ # Execute callbacks (both sync and async)
137
+ for callback in self._completion_callbacks:
138
+ if asyncio.iscoroutinefunction(callback):
139
+ await callback()
140
+ else:
141
+ callback()
142
 
143
+ async def embed_chunks(self, chunked_documents: List[Document]) -> int:
144
  """
145
  Populate the vector database with processed video transcript documents.
146
 
 
160
  Exception: If embedding generation or database upload fails
161
  """
162
 
163
  # perform embedding
164
 
165
  vector_batches = await asyncio.gather(
 
167
  self.embeddings.aembed_documents(
168
  [c.page_content for c in chunk_batch]
169
  )
170
+ for chunk_batch in batch(chunked_documents, 8)
171
  ]
172
  )
173
  vectors = []
 
184
  "metadata": doc.metadata,
185
  },
186
  )
187
+ for id, vector, doc in zip(ids, vectors, chunked_documents)
188
  ]
189
 
190
  # upload qdrant payload
 
193
  points=points,
194
  )
195
 
196
  return len(points)
197
 
198
  def count_docs(self) -> int:
 
280
  return False
281
 
282
 
283
+ async def load_json_file(filepath: Path):
284
  """
285
  Asynchronously load and parse a single JSON file containing video data.
286
 
 
303
  Note:
304
  Uses async file I/O for better performance when loading multiple files
305
  """
 
306
 
307
+ async with aiofiles.open(filepath, mode="r", encoding="utf-8") as f:
308
  content = await f.read()
309
  payload = json.loads(content)
310
  for entry in payload:
311
+ entry.update({"group": str(filepath)})
312
  return payload
313
 
314
 
315
+ def globs_to_paths(glob_list: List[str] | str) -> List[Path]:
316
+
317
+ if isinstance(glob_list, str):
318
+ glob_list = glob_list.split(":")
319
+
320
+ logging.debug("Loading from %d globs:", len(glob_list))
321
+
322
+ files: List[Path] = []
323
+ for globstring in glob_list:
324
+ logging.debug("Loading glob: %s", globstring)
325
+ new_files = [Path(f) for f in glob.glob(globstring, recursive=True)]
326
+ files.extend(filter(lambda f: f.exists(), new_files))
327
+
328
+ logging.debug("Total files: %d", len(files))
329
+
330
+ return files
331
+
332
+
333
+ def load_transcripts_whole(json_payload: List[Dict]) -> Iterator[Document]:
334
+ """
335
+ Lazy loader that yields Document objects with concatenated transcripts.
336
+
337
+ Creates one Document per video with all transcript sentences joined by newlines.
338
+ Metadata includes all video fields except 'transcripts' and 'qa'.
339
+ The 'url' field is renamed to 'source' for LangChain compatibility.
340
+
341
+ Yields:
342
+ Document: LangChain Document with page_content as concatenated transcript
343
+ and metadata containing video information
344
+ """
345
+
346
+ for video in json_payload:
347
+ metadata = dict(video)
348
+ metadata.pop("transcripts", None)
349
+ metadata.pop("qa", None)
350
+ # Rename 'url' key to 'source' in metadata if it exists
351
+ if "url" in metadata:
352
+ metadata["source"] = metadata.pop("url")
353
+ yield Document(
354
+ page_content="\n".join(t["sent"] for t in video["transcripts"]),
355
+ metadata=metadata,
356
+ )
357
+
358
+
359
+ def load_transcripts_segments(
360
+ json_payload: List[Dict],
361
+ ) -> Iterator[Document]:
362
  """
363
+ Lazy loader that yields individual Document objects for each transcript segment.
364
 
365
+ Creates one Document per transcript segment with timing metadata.
366
+ Each document contains a single transcript sentence with precise start/end times.
367
+ The 'url' field is renamed to 'source' for LangChain compatibility.
368
+
369
+ Yields:
370
+ Document: LangChain Document with page_content as single transcript sentence
371
+ and metadata containing video info plus time_start and time_end
372
+ """
373
+
374
+ for video in json_payload:
375
+ metadata = dict(video)
376
+ transcripts = metadata.pop("transcripts", None)
377
+ metadata.pop("qa", None)
378
+ # Rename 'url' key to 'source' in metadata if it exists
379
+ if "url" in metadata:
380
+ metadata["source"] = metadata.pop("url")
381
+ for transcript in transcripts:
382
+ yield Document(
383
+ page_content=transcript["sent"],
384
+ metadata=metadata
385
+ | {
386
+ "time_start": transcript["begin"],
387
+ "time_end": transcript["end"],
388
+ },
389
+ )
390
+
391
+
392
+ async def chunk_transcripts(
393
+ json_transcripts: List[Dict[str, Any]],
394
+ semantic_chunker_embedding_model: Embeddings = OpenAIEmbeddings(
395
+ model="text-embedding-3-small"
396
+ ),
397
+ ) -> List[Document]:
398
+ """
399
+ Load and process video transcripts into semantically chunked documents.
400
+
401
+ This function takes a list of transcript dictionaries, loads them as both full
402
+ transcripts and individual chunks, then applies semantic chunking. It also
403
+ enriches each semantic chunk with timestamp metadata from the original verbatim chunks.
404
 
405
  Args:
406
+ json_transcripts: List of dictionaries containing video transcript data
407
+ embeddings: OpenAI embeddings model to use for semantic chunking
 
408
 
409
  Returns:
410
+ List of semantically chunked Document objects with enhanced metadata
411
  """
 
412
 
413
+ docs_full_transcript: List[Document] = list(
414
+ load_transcripts_whole(json_transcripts)
415
+ )
416
+ docs_chunks_verbatim: List[Document] = list(
417
+ load_transcripts_segments(json_transcripts)
418
+ )
 
419
 
420
+ # semantically split the combined transcript
421
+ text_splitter = SemanticChunker(semantic_chunker_embedding_model)
422
+ docs_group = await asyncio.gather(
423
+ *[
424
+ text_splitter.atransform_documents(d)
425
+ for d in batch(docs_full_transcript, size=2)
426
+ ]
427
+ )
428
+ # Flatten the nested list of documents
429
+ docs_chunks_semantic: List[Document] = []
430
+ for group in docs_group:
431
+ docs_chunks_semantic.extend(group)
432
+
433
+ # Create a lookup dictionary for faster access
434
+ video_id_to_chunks: Dict[int, List[Document]] = {}
435
+ for chunk in docs_chunks_verbatim:
436
+ video_id: int = chunk.metadata["video_id"]
437
+ if video_id not in video_id_to_chunks:
438
+ video_id_to_chunks[video_id] = []
439
+ video_id_to_chunks[video_id].append(chunk)
440
+
441
+ for chunk in docs_chunks_semantic:
442
+ video_id = chunk.metadata["video_id"]
443
+ # Only check chunks from the same video
444
+ potential_subchunks = video_id_to_chunks.get(video_id, [])
445
+ subchunks = [
446
+ c
447
+ for c in potential_subchunks
448
+ if c.page_content in chunk.page_content
449
+ ]
450
+
451
+ times = [
452
+ (t.metadata["time_start"], t.metadata["time_end"])
453
+ for t in subchunks
454
+ ]
455
+ chunk.metadata["speech_start_stop_times"] = times
456
+
457
+ if times: # Avoid IndexError if times is empty
458
+ chunk.metadata["start"], chunk.metadata["stop"] = (
459
+ times[0][0],
460
+ times[-1][-1],
461
+ )
462
+ else:
463
+ chunk.metadata["start"], chunk.metadata["stop"] = None, None
464
+
465
+ docs_chunks_semantic[0].metadata.keys()
466
+ return docs_chunks_semantic
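After this refactor the public surface splits in two: `from_json_globs` drives the whole pipeline (resolve `:`-separated globs or a list, load JSON, semantically chunk, embed, fire completion callbacks), while `embed_chunks` only embeds and uploads documents that are already chunked. A hedged sketch of both entry points, assuming `globs_to_paths`, `load_json_file`, and `chunk_transcripts` stay importable from `pstuts_rag.datastore` as the module-level functions shown above:

```python
import asyncio

from pstuts_rag.datastore import (
    DatastoreManager,
    chunk_transcripts,
    globs_to_paths,
    load_json_file,
)


async def ingest_everything() -> None:
    # One-call path: globs -> JSON -> semantic chunks -> embeddings -> Qdrant,
    # with completion callbacks fired at the end.
    datastore = DatastoreManager()
    await datastore.from_json_globs("./data/dev.json:./data/test.json")


async def ingest_prechunked() -> int:
    # Manual path: load and chunk yourself, then hand the chunks to
    # embed_chunks, which only embeds and uploads them.
    datastore = DatastoreManager()
    files = globs_to_paths("./data/dev.json")
    payloads = await asyncio.gather(*[load_json_file(f) for f in files])
    raw_transcripts = [entry for payload in payloads for entry in payload]
    chunks = await chunk_transcripts(
        json_transcripts=raw_transcripts,
        semantic_chunker_embedding_model=datastore.embeddings,
    )
    return await datastore.embed_chunks(chunks)
```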
pstuts_rag/pstuts_rag/graph.py CHANGED
@@ -30,7 +30,7 @@ from app import (
30
  enter_chain,
31
  )
32
 
33
- from pstuts_rag.rag_for_transcripts import retrieve_videos
34
 
35
 
36
  def search_agent(state: PsTutsTeamState, chain: Runnable) -> Dict:
@@ -244,7 +244,7 @@ async def build_the_graph(current_state: ApplicationState):
244
  )
245
 
246
  rag_node, _ = create_rag_node(
247
- rag_chain=retrieve_videos(),
248
  name=VIDEOARCHIVE,
249
  )
250
 
 
30
  enter_chain,
31
  )
32
 
33
+ from pstuts_rag.rag_for_transcripts import create_transcript_rag_chain
34
 
35
 
36
  def search_agent(state: PsTutsTeamState, chain: Runnable) -> Dict:
 
244
  )
245
 
246
  rag_node, _ = create_rag_node(
247
+ rag_chain=create_transcript_rag_chain(),
248
  name=VIDEOARCHIVE,
249
  )
250
 
pstuts_rag/pstuts_rag/rag.py CHANGED
@@ -248,7 +248,7 @@ class RAGChainInstance:
248
  qdrant_client=self.qdrant_client, name=self.name
249
  )
250
  if self.datastore_manager.count_docs() == 0:
251
- self.pointsLoaded = await self.datastore_manager.populate_database(
252
  raw_docs=json_payload
253
  )
254
  logging.info(
 
248
  qdrant_client=self.qdrant_client, name=self.name
249
  )
250
  if self.datastore_manager.count_docs() == 0:
251
+ self.pointsLoaded = await self.datastore_manager.embed_chunks(
252
  raw_docs=json_payload
253
  )
254
  logging.info(
pstuts_rag/pstuts_rag/rag_for_transcripts.py CHANGED
@@ -18,7 +18,7 @@ from langchain_ollama import ChatOllama
18
 
19
  from .datastore import DatastoreManager
20
  from .prompts import RAG_PROMPT_TEMPLATES
21
-
22
  from pstuts_rag.configuration import Configuration, ModelAPI
23
 
24
 
@@ -37,6 +37,7 @@ def pack_references(msg_dict: Dict[str, Any]) -> AIMessage:
37
  answer: AIMessage = msg_dict["answer"]
38
  input = msg_dict["input"]
39
 
 
40
  reference_dicts = [
41
  {k: doc.metadata[k] for k in ("title", "source", "start", "stop")}
42
  for doc in input["context"]
@@ -44,11 +45,13 @@ def pack_references(msg_dict: Dict[str, Any]) -> AIMessage:
44
  references = str(json.dumps(reference_dicts, indent=2))
45
 
46
  text_w_references = answer.content
 
47
  if "I don't know" not in answer.content:
48
  text_w_references = "\n".join(
49
  [str(text_w_references), "**REFERENCES**", references]
50
  )
51
 
 
52
  output: AIMessage = answer.model_copy(
53
  update={
54
  "content": text_w_references,
@@ -63,88 +66,60 @@ def pack_references(msg_dict: Dict[str, Any]) -> AIMessage:
63
  return output
64
 
65
 
66
- def retrieve_videos(
67
  datastore: DatastoreManager,
68
  config: Union[RunnableConfig, Configuration] = Configuration(),
69
  ) -> Runnable:
70
 
 
71
  configurable = (
72
  config
73
  if isinstance(config, Configuration)
74
  else Configuration.from_runnable_config(config)
75
  )
76
 
77
- cls = {
78
- ModelAPI.HUGGINGFACE: ChatHuggingFace,
79
- ModelAPI.OPENAI: ChatOpenAI,
80
- ModelAPI.OLLAMA: ChatOllama,
81
- }.get(configurable.llm_api, ChatOpenAI)
82
 
83
  llm = cls(model=configurable.llm_tool_model)
84
 
 
85
  answer_chain = (
86
  ChatPromptTemplate.from_messages(list(RAG_PROMPT_TEMPLATES.items()))
87
  | llm
88
  )
89
 
 
 
90
  rag_chain = (
91
- itemgetter("question")
92
- | RunnableParallel(
93
  context=datastore.get_retriever(
94
  n_context_docs=configurable.n_context_docs
95
  ),
96
- question=RunnablePassthrough(),
97
  )
98
- | {
99
- "input": RunnablePassthrough(),
100
- "answer": answer_chain,
101
  }
102
- | pack_references
103
  )
104
 
105
  return rag_chain
106
-
107
-
108
- def startup(
109
- config=Configuration(),
110
- callback_on_loading_complete: Optional[Callable] = None,
111
- ):
112
- """
113
- Initialize the application with optional loading completion callback.
114
-
115
- Args:
116
- config: Configuration object with application settings
117
- on_loading_complete: Optional callback (sync or async) to call when
118
- datastore loading completes
119
-
120
- Returns:
121
- DatastoreManager: The initialized datastore manager
122
- """
123
-
124
- ### PROCESS THE CONFIGURATION
125
- log_level = getattr(logging, config.eva_log_level, logging.INFO)
126
- logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s")
127
-
128
- ### CREATE THE DATABASE
129
-
130
- datastore = DatastoreManager()
131
- if callback_on_loading_complete:
132
- datastore.add_completion_callback(callback_on_loading_complete)
133
-
134
- ### START DATABASE POPULATION
135
-
136
- globs = [str(g) for g in config.transcript_glob.split(":")]
137
-
138
- # # Add custom callback if provided, otherwise use default logging
139
- # if on_loading_complete:
140
- # datastore.add_completion_callback(on_loading_complete)
141
- # else:
142
- # # Default callback for logging
143
- # def default_logging_callback():
144
- # logging.info("🎉 Datastore loading completed!")
145
-
146
- # datastore.add_completion_callback(default_logging_callback)
147
-
148
- asyncio.create_task(datastore.from_json_globs(globs))
149
-
150
- return datastore
 
18
 
19
  from .datastore import DatastoreManager
20
  from .prompts import RAG_PROMPT_TEMPLATES
21
+ from pstuts_rag.utils import ChatAPISelector
22
  from pstuts_rag.configuration import Configuration, ModelAPI
23
 
24
 
 
37
  answer: AIMessage = msg_dict["answer"]
38
  input = msg_dict["input"]
39
 
40
+ # Extract relevant metadata from each document in the context
41
  reference_dicts = [
42
  {k: doc.metadata[k] for k in ("title", "source", "start", "stop")}
43
  for doc in input["context"]
 
45
  references = str(json.dumps(reference_dicts, indent=2))
46
 
47
  text_w_references = answer.content
48
+ # Only append references if the model provided a substantive answer
49
  if "I don't know" not in answer.content:
50
  text_w_references = "\n".join(
51
  [str(text_w_references), "**REFERENCES**", references]
52
  )
53
 
54
+ # Create new message with references and preserve original context metadata
55
  output: AIMessage = answer.model_copy(
56
  update={
57
  "content": text_w_references,
 
66
  return output
67
 
68
 
69
+ def create_transcript_rag_chain(
70
  datastore: DatastoreManager,
71
  config: Union[RunnableConfig, Configuration] = Configuration(),
72
  ) -> Runnable:
73
+ """Create a Retrieval-Augmented Generation (RAG) chain for video transcript search.
74
+
75
+ This function constructs a complete RAG pipeline that:
76
+ 1. Takes a user question as input
77
+ 2. Retrieves relevant video transcript chunks from the datastore
78
+ 3. Generates an answer using an LLM with the retrieved context
79
+ 4. Packages the response with reference information
80
+
81
+ Args:
82
+ datastore: The DatastoreManager containing video transcript embeddings
83
+ config: Configuration object or RunnableConfig with model and retrieval settings
84
+
85
+ Returns:
86
+ Runnable: A LangChain runnable that processes questions and returns
87
+ answers with embedded references to source video segments
88
+ """
89
 
90
+ # Handle both Configuration objects and RunnableConfig dictionaries
91
  configurable = (
92
  config
93
  if isinstance(config, Configuration)
94
  else Configuration.from_runnable_config(config)
95
  )
96
 
97
+ # Select the appropriate chat model class based on configuration
98
+ cls = ChatAPISelector.get(configurable.llm_api, ChatOpenAI)
99
 
100
  llm = cls(model=configurable.llm_tool_model)
101
 
102
+ # Create the answer generation chain using prompt templates
103
  answer_chain = (
104
  ChatPromptTemplate.from_messages(list(RAG_PROMPT_TEMPLATES.items()))
105
  | llm
106
  )
107
 
108
+ # Build the complete RAG chain with the following flow:
109
+ # question -> parallel(context_retrieval, question_passthrough) -> llm_answer -> pack_references
110
  rag_chain = (
111
+ itemgetter("question") # Extract question from input dict
112
+ | RunnableParallel( # Run context retrieval and question passing in parallel
113
  context=datastore.get_retriever(
114
  n_context_docs=configurable.n_context_docs
115
  ),
116
+ question=RunnablePassthrough(), # Pass question unchanged
117
  )
118
+ | { # Prepare input dict for final processing
119
+ "input": RunnablePassthrough(), # Contains both context and question
120
+ "answer": answer_chain, # Generate answer using retrieved context
121
  }
122
+ | pack_references # Add reference metadata to the final response
123
  )
124
 
125
  return rag_chain
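Given the renamed `create_transcript_rag_chain` and the input contract established by `itemgetter("question")`, here is a minimal end-to-end sketch (assuming a running embedding/LLM backend, such as the Ollama endpoint used in the notebook, and the import paths shown in this commit):

```python
import asyncio

from pstuts_rag.configuration import Configuration
from pstuts_rag.datastore import DatastoreManager
from pstuts_rag.rag_for_transcripts import create_transcript_rag_chain


async def ask(question: str) -> str:
    # Populate the datastore, then build the transcript RAG chain over it.
    datastore = DatastoreManager()
    await datastore.from_json_globs(Configuration().transcript_glob)
    chain = create_transcript_rag_chain(datastore)

    # The chain begins with itemgetter("question"), so the input must be a
    # dict carrying a "question" key; pack_references returns an AIMessage
    # whose content ends with a **REFERENCES** block unless the model
    # answered "I don't know".
    answer = await chain.ainvoke({"question": question})
    return answer.content


if __name__ == "__main__":
    print(asyncio.run(ask("What is a layer?")))
```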