dev: simplifying datastore functions
Files changed:
- app_simple_rag.py +1 -1
- notebooks/transcript_rag.ipynb +78 -69
- pstuts_rag/pstuts_rag/configuration.py +2 -0
- pstuts_rag/pstuts_rag/datastore.py +183 -271
- pstuts_rag/pstuts_rag/graph.py +2 -2
- pstuts_rag/pstuts_rag/rag.py +1 -1
- pstuts_rag/pstuts_rag/rag_for_transcripts.py +34 -59
app_simple_rag.py
CHANGED
@@ -59,7 +59,7 @@ params = ApplicationParameters()
 async def fill_the_db():
     if state.datastore_manager.count_docs() == 0:
         data: List[Dict[str, Any]] = await load_json_files(params.filename)
-        state.pointsLoaded = await state.datastore_manager.
+        state.pointsLoaded = await state.datastore_manager.embed_chunks(
             raw_docs=data
         )
         await cl.Message(
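For orientation, the commit settles on a two-step ingestion API: `from_json_globs` runs the whole pipeline (resolve globs, load JSON, chunk, embed, upload), while `embed_chunks` only embeds and uploads already-prepared documents, as in the call site above. A minimal sketch of driving the full pipeline, assuming the import path implied by this repo's layout and the colon-separated glob string shown in the notebook output below:

import asyncio
from pstuts_rag.datastore import DatastoreManager  # assumed import path

async def main():
    ds = DatastoreManager()
    # Full pipeline: resolve globs, load JSON transcripts, semantically
    # chunk them, then embed and upload the vectors via embed_chunks.
    await ds.from_json_globs("./data/dev.json:./data/test.json")
    print(ds.count_docs())  # number of points now in the collection

asyncio.run(main())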
notebooks/transcript_rag.ipynb
CHANGED
@@ -86,7 +86,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "2025-05-30
+     "2025-05-30 23:53:26,776 - INFO - <module> - Loaded .env file\n"
     ]
    }
   ],
@@ -103,16 +103,22 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "2025-05-30
-
-
-
-
-
-
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
+     "2025-05-30 23:53:28,613 - INFO - print - Configuration parameters:\n"
+    ]
+   },
+   {
+    "name": "stderr",
+    "output_type": "stream",
+    "text": [
+     "2025-05-30 23:53:28,614 - INFO - print - eva_workflow_name: EVA_workflow\n",
+     "2025-05-30 23:53:28,615 - INFO - print - eva_log_level: INFO\n",
+     "2025-05-30 23:53:28,615 - INFO - print - transcript_glob: ./data/dev.json:./data/test.json\n",
+     "2025-05-30 23:53:28,616 - INFO - print - embedding_model: mxbai-embed-large\n",
+     "2025-05-30 23:53:28,617 - INFO - print - embedding_api: ModelAPI.OLLAMA\n",
+     "2025-05-30 23:53:28,617 - INFO - print - llm_api: ModelAPI.OLLAMA\n",
+     "2025-05-30 23:53:28,618 - INFO - print - max_research_loops: 2\n",
+     "2025-05-30 23:53:28,619 - INFO - print - llm_tool_model: qwen3:4b\n",
+     "2025-05-30 23:53:28,620 - INFO - print - n_context_docs: 3\n"
     ]
    }
   ],
@@ -129,13 +135,25 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "2025-05-30
-     "2025-05-30
+     "2025-05-30 23:53:29,748 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:29,781 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n"
     ]
+   },
+   {
+    "data": {
+     "text/plain": [
+      "<Task pending name='Task-1' coro=<DatastoreManager.from_json_globs() running at /home/mbudisic/Documents/PsTuts-RAG/pstuts_rag/pstuts_rag/datastore.py:105>>"
+     ]
+    },
+    "execution_count": 8,
+    "metadata": {},
+    "output_type": "execute_result"
    }
   ],
   "source": [
-   "datastore
+   "datastore = DatastoreManager()\n",
+   "datastore.add_completion_callback(lambda _: logging.warning(\"Loading complete.\"))\n",
+   "asyncio.create_task(datastore.from_json_globs(Configuration().transcript_glob))"
   ]
  },
  {
@@ -147,39 +165,39 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
-     "2025-05-30
+     "2025-05-30 23:53:30,993 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:31,531 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:32,425 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:33,118 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:34,012 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:34,277 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:35,156 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:36,044 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:36,712 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:37,534 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:38,511 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:39,309 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:40,211 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:41,136 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:42,084 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:42,790 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:43,608 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:44,584 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:45,088 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:45,929 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:46,773 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:47,723 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:48,154 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:48,468 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:48,700 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:48,812 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:49,119 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:49,471 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:49,803 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:50,123 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:50,446 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:50,773 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:53:51,117 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n"
     ]
    },
    {
@@ -203,7 +221,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "chain =
+    "chain = create_transcript_rag_chain(datastore)"
    ]
   },
   {
@@ -215,8 +233,8 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "2025-05-30
-     "2025-05-30
+     "2025-05-30 23:53:51,268 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
+     "2025-05-30 23:54:00,302 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/chat \"HTTP/1.1 200 OK\"\n"
     ]
    }
   ],
@@ -236,29 +254,20 @@
     "==================================\u001b[1m Ai Message \u001b[0m==================================\n",
     "\n",
     "<think>\n",
-     "Okay,
-     "\n",
-     "First, looking at the first document with video ID 19172. The title says \"Understand layers\" and the description mentions that layers are the building blocks of any image in Photoshop CC. So, layers must be fundamental components.\n",
+     "Okay, the user is asking, \"What is a layer?\" and I need to answer based only on the provided transcript excerpts.\n",
     "\n",
-     "
+     "First, I'll look through the context to find relevant information. The first document's page_content mentions that layers are the building blocks of any image in Photoshop CC. It also compares layers to separate flat pints of glass stacked one on top of another. Each layer has separate content. There's also mention of the Layers panel where you can select and work with layers, and that clicking the Eye icon toggles the visibility of a layer.\n",
     "\n",
-     "
+     "Another document talks about the layers panel being a powerful tool to isolate individual elements, allowing work on individual parts of the document. But that's more about using layers rather than defining what a layer is.\n",
     "\n",
-     "
+     "So, the key points are: layers are building blocks, they're like stacked glass with separate content, and the Layers panel is used to manage them. The timestamp for the first document's explanation is from 3.81 to 9.13, which is 0:03:41 to 0:09:13 in minute:seconds format.\n",
     "\n",
-     "
-     "\n",
-     "Putting this together, a layer is like a separate sheet in an image that holds different elements. Each layer allows you to edit, move, or manipulate specific parts without affecting others. The Layers panel helps manage and control these layers for better organization and editing flexibility.\n",
+     "I need to present this information clearly, using the timestamp. Make sure not to add any extra info not in the transcript. The answer should be concise and directly answer the question without assumptions.\n",
     "</think>\n",
     "\n",
-     "
-     "Layers are like separate sheets in an image, each holding distinct content. Think of them as individual elements stacked on top of each other, allowing you to edit or manipulate specific parts without affecting others. \n",
-     "\n",
-     "For example: \n",
-     "- Each layer can contain text, images, or design elements. \n",
-     "- You can toggle their visibility using the Eye icon. \n",
+     "A **layer** in Photoshop is the building block of any image, allowing you to isolate and manage separate elements within a document. The transcript explains layers as \"separate flat pints of glass, stacked one on top of the other,\" where each layer contains its own content. You can toggle a layer’s visibility using the Eye icon in the **Layers panel**, which lets you control what’s visible in the image. \n",
     "\n",
-     "
+     "**Timestamp**: 0:03:41–0:09:13 🎨✨\n",
     "**REFERENCES**\n",
     "[\n",
     " {\n",
@@ -295,9 +304,9 @@
    {
    "data": {
     "text/plain": [
-     "[Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'data/test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[0.47, 3.41], [3.81, 9.13], [9.309999, 15.01], [15.299999, 20.57], [20.88, 23.3], [23.83, 27.93], [29.38, 32.79], [32.96, 33.92], [34.43, 40.21], [41.91, 45.37], [45.88, 49.01], [49.54, 55.130001], [55.72, 58.49], [58.72, 62.14]], 'start': 0.47, 'stop': 62.14, '_id': 63, '_collection_name': '
-     " Document(metadata={'video_id': 4103, 'title': 'Use layers for ultimate flexibility and control', 'desc': 'Learn how to use layers to create designs, fix photos, or build collages.', 'length': '00:05:06.55', 'group': 'data/dev.json', 'source': 'https://videos-tv.adobe.com/2014-09-04/96f51d8958ae31b37cb5a15cbdc21744.mp4', 'speech_start_stop_times': [[0.82, 5.88], [6.51, 18.389999], [19.219999, 30.13]], 'start': 0.82, 'stop': 30.13, '_id': 0, '_collection_name': '
-     " Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'data/test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[280.4, 284.58]], 'start': 280.4, 'stop': 284.58, '_id': 66, '_collection_name': '
+     "[Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'data/test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[0.47, 3.41], [3.81, 9.13], [9.309999, 15.01], [15.299999, 20.57], [20.88, 23.3], [23.83, 27.93], [29.38, 32.79], [32.96, 33.92], [34.43, 40.21], [41.91, 45.37], [45.88, 49.01], [49.54, 55.130001], [55.72, 58.49], [58.72, 62.14]], 'start': 0.47, 'stop': 62.14, '_id': 63, '_collection_name': '30065fde-3368-41c8-9092-79b8646d229f'}, page_content=\"Layers are the building blocks of any image in Photoshop CC. So, it's important to understand, what layers are and why to use them - which we'll cover in this video. If you're following along, open this layered image from the downloadable practice files for this tutorial. You might think of layers like separate flat pints of glass, stacked one on top of the other. Each layer contains separate pieces of content. To get a sense of how layers are constructed, let's take a look at this Layers panel. I've closed my other panels, so that we can focus on the Layers panel. But you can skip that. By the way: If your Layers panel isn't showing, go up to the Window menu and choose Layers from there. The Layers panel is where you go to select and work with layers. In this image there are 4 layers, each with separate content. If you click the Eye icon to the left of a layer, you can toggle the visibility of that layer off and on. So, I'm going to turn off the visibility of the tailor layer. And keep your eye on the image, so you can see what's on that layer.\"),\n",
+     " Document(metadata={'video_id': 4103, 'title': 'Use layers for ultimate flexibility and control', 'desc': 'Learn how to use layers to create designs, fix photos, or build collages.', 'length': '00:05:06.55', 'group': 'data/dev.json', 'source': 'https://videos-tv.adobe.com/2014-09-04/96f51d8958ae31b37cb5a15cbdc21744.mp4', 'speech_start_stop_times': [[0.82, 5.88], [6.51, 18.389999], [19.219999, 30.13]], 'start': 0.82, 'stop': 30.13, '_id': 0, '_collection_name': '30065fde-3368-41c8-9092-79b8646d229f'}, page_content=\"As a new Photoshop user, you're going to find that the layers panel is an incredibly powerful tool. The layers panel gives you the ability to isolate individual pieces of content away from the rest of the composition giving you the ability to work on individual elements within the overall document. Now, this can be used for something as literal as some type in this case, or something as subtle as a small brush stroke to add a highlight or shadow to an image.\"),\n",
+     " Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'data/test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[280.4, 284.58]], 'start': 280.4, 'stop': 284.58, '_id': 66, '_collection_name': '30065fde-3368-41c8-9092-79b8646d229f'}, page_content=\"There's lots more to learn about layers, so stay tuned for the rest of this tutorial.\")]"
     ]
    },
    "execution_count": 13,
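The rewritten cell above starts ingestion with `asyncio.create_task`, which is why the cell's value is a pending `Task` rather than a point count. As a standalone script, the same non-blocking pattern would look roughly like this (a sketch; note the no-argument callback matches the bare `callback()` invocation inside `from_json_globs`, while the notebook cell passes a one-argument lambda):

import asyncio
import logging
from pstuts_rag.configuration import Configuration
from pstuts_rag.datastore import DatastoreManager  # assumed import path

async def main():
    datastore = DatastoreManager()
    # Invoked once chunking, embedding, and the Qdrant upload all finish.
    datastore.add_completion_callback(lambda: logging.warning("Loading complete."))
    # Schedule ingestion in the background and keep the event loop free.
    task = asyncio.create_task(
        datastore.from_json_globs(Configuration().transcript_glob)
    )
    await task  # in a notebook you would instead let the task run on its own

asyncio.run(main())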
pstuts_rag/pstuts_rag/configuration.py
CHANGED
@@ -70,6 +70,8 @@ class Configuration:
             if config and "configurable" in config
             else {}
         )
+        # Map each dataclass field to environment variables or configurable values
+        # Priority: environment variables > configurable dict values > field defaults
        values: dict[str, Any] = {
            f.name: os.environ.get(f.name.upper(), configurable.get(f.name))
            for f in fields(cls)
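The two added comments document the resolution order used when building a `Configuration` from a runnable config. A toy, self-contained illustration of that priority rule (hypothetical `ToyConfig`, not the project's class):

import os
from dataclasses import dataclass, fields
from typing import Any

@dataclass
class ToyConfig:
    llm_tool_model: str = "qwen3:4b"  # dataclass default: lowest priority

    @classmethod
    def resolve(cls, configurable: dict[str, Any]) -> "ToyConfig":
        # Environment variable (upper-cased field name) beats the
        # configurable dict, which beats the dataclass default.
        values = {
            f.name: os.environ.get(f.name.upper(), configurable.get(f.name))
            for f in fields(cls)
        }
        return cls(**{k: v for k, v in values.items() if v is not None})

os.environ["LLM_TOOL_MODEL"] = "gpt-4o-mini"
print(ToyConfig.resolve({"llm_tool_model": "llama3"}).llm_tool_model)
# -> gpt-4o-mini (env var wins); without the env var it would be llama3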
pstuts_rag/pstuts_rag/datastore.py
CHANGED
@@ -24,220 +24,7 @@ from qdrant_client import QdrantClient
 from qdrant_client.http.models import Distance, VectorParams
 from qdrant_client.models import PointStruct
 
-from pstuts_rag.utils import EmbeddingsAPISelector
-
-
-def batch(iterable: List[Any], size: int = 16) -> Iterator[List[Any]]:
-    """
-    Batch an iterable into chunks of specified size.
-
-    Yields successive chunks from the input iterable, each containing
-    at most 'size' elements. Useful for processing large collections
-    in manageable batches to avoid memory issues or API rate limits.
-
-    Args:
-        iterable (List[Any]): The input list to be batched
-        size (int, optional): Maximum size of each batch. Defaults to 16.
-
-    Yields:
-        List[Any]: Successive batches of the input iterable
-
-    Example:
-        >>> list(batch([1, 2, 3, 4, 5], 2))
-        [[1, 2], [3, 4], [5]]
-    """
-    for i in range(0, len(iterable), size):
-        yield iterable[i : i + size]
-
-
-class VideoTranscriptBulkLoader(BaseLoader):
-    """
-    Loads video transcripts as bulk documents for document processing pipelines.
-
-    Each video becomes a single document with all transcript sentences concatenated.
-    Useful for semantic search across entire video content.
-
-    Inherits from LangChain's BaseLoader for compatibility with document processing chains.
-
-    Attributes:
-        json_payload (List[Dict]): List of video dictionaries containing transcript data
-    """
-
-    def __init__(self, json_payload: List[Dict]):
-        """
-        Initialize the bulk loader with video transcript data.
-
-        Args:
-            json_payload (List[Dict]): List of video dictionaries, each containing:
-                - transcripts: List of transcript segments
-                - qa: Q&A data (optional)
-                - url: Video URL
-                - other metadata fields
-        """
-
-        self.json_payload = json_payload
-
-    def lazy_load(self) -> Iterator[Document]:
-        """
-        Lazy loader that yields Document objects with concatenated transcripts.
-
-        Creates one Document per video with all transcript sentences joined by newlines.
-        Metadata includes all video fields except 'transcripts' and 'qa'.
-        The 'url' field is renamed to 'source' for LangChain compatibility.
-
-        Yields:
-            Document: LangChain Document with page_content as concatenated transcript
-                and metadata containing video information
-        """
-
-        for video in self.json_payload:
-            metadata = dict(video)
-            metadata.pop("transcripts", None)
-            metadata.pop("qa", None)
-            # Rename 'url' key to 'source' in metadata if it exists
-            if "url" in metadata:
-                metadata["source"] = metadata.pop("url")
-            yield Document(
-                page_content="\n".join(
-                    t["sent"] for t in video["transcripts"]
-                ),
-                metadata=metadata,
-            )
-
-
-class VideoTranscriptChunkLoader(BaseLoader):
-    """
-    Loads video transcripts as individual chunk documents for fine-grained processing.
-
-    Each transcript segment becomes a separate document with timing information.
-    Useful for precise timestamp-based retrieval and time-sensitive queries.
-
-    Inherits from LangChain's BaseLoader for compatibility with document processing chains.
-
-    Attributes:
-        json_payload (List[Dict]): List of video dictionaries containing transcript data
-    """
-
-    def __init__(self, json_payload: List[Dict]):
-        """
-        Initialize the chunk loader with video transcript data.
-
-        Args:
-            json_payload (List[Dict]): List of video dictionaries, each containing:
-                - transcripts: List of transcript segments with timing
-                - qa: Q&A data (optional)
-                - url: Video URL
-                - other metadata fields
-        """
-
-        self.json_payload = json_payload
-
-    def lazy_load(self) -> Iterator[Document]:
-        """
-        Lazy loader that yields individual Document objects for each transcript segment.
-
-        Creates one Document per transcript segment with timing metadata.
-        Each document contains a single transcript sentence with precise start/end times.
-        The 'url' field is renamed to 'source' for LangChain compatibility.
-
-        Yields:
-            Document: LangChain Document with page_content as single transcript sentence
-                and metadata containing video info plus time_start and time_end
-        """
-
-        for video in self.json_payload:
-            metadata = dict(video)
-            transcripts = metadata.pop("transcripts", None)
-            metadata.pop("qa", None)
-            # Rename 'url' key to 'source' in metadata if it exists
-            if "url" in metadata:
-                metadata["source"] = metadata.pop("url")
-            for transcript in transcripts:
-                yield Document(
-                    page_content=transcript["sent"],
-                    metadata=metadata
-                    | {
-                        "time_start": transcript["begin"],
-                        "time_end": transcript["end"],
-                    },
-                )
-
-
-async def chunk_transcripts(
-    json_transcripts: List[Dict[str, Any]],
-    semantic_chunker_embedding_model: Embeddings = OpenAIEmbeddings(
-        model="text-embedding-3-small"
-    ),
-) -> List[Document]:
-    """
-    Load and process video transcripts into semantically chunked documents.
-
-    This function takes a list of transcript dictionaries, loads them as both full
-    transcripts and individual chunks, then applies semantic chunking. It also
-    enriches each semantic chunk with timestamp metadata from the original verbatim chunks.
-
-    Args:
-        json_transcripts: List of dictionaries containing video transcript data
-        embeddings: OpenAI embeddings model to use for semantic chunking
-
-    Returns:
-        List of semantically chunked Document objects with enhanced metadata
-    """
-
-    docs_full_transcript: List[Document] = VideoTranscriptBulkLoader(
-        json_payload=json_transcripts
-    ).load()
-    docs_chunks_verbatim: List[Document] = VideoTranscriptChunkLoader(
-        json_payload=json_transcripts
-    ).load()
-
-    # semantically split the combined transcript
-    text_splitter = SemanticChunker(semantic_chunker_embedding_model)
-    docs_group = await asyncio.gather(
-        *[
-            text_splitter.atransform_documents(d)
-            for d in batch(docs_full_transcript, size=2)
-        ]
-    )
-    # Flatten the nested list of documents
-    docs_chunks_semantic: List[Document] = []
-    for group in docs_group:
-        docs_chunks_semantic.extend(group)
-
-    # Create a lookup dictionary for faster access
-    video_id_to_chunks: Dict[int, List[Document]] = {}
-    for chunk in docs_chunks_verbatim:
-        video_id: int = chunk.metadata["video_id"]
-        if video_id not in video_id_to_chunks:
-            video_id_to_chunks[video_id] = []
-        video_id_to_chunks[video_id].append(chunk)
-
-    for chunk in docs_chunks_semantic:
-        video_id = chunk.metadata["video_id"]
-        # Only check chunks from the same video
-        potential_subchunks = video_id_to_chunks.get(video_id, [])
-        subchunks = [
-            c
-            for c in potential_subchunks
-            if c.page_content in chunk.page_content
-        ]
-
-        times = [
-            (t.metadata["time_start"], t.metadata["time_end"])
-            for t in subchunks
-        ]
-        chunk.metadata["speech_start_stop_times"] = times
-
-        if times:  # Avoid IndexError if times is empty
-            chunk.metadata["start"], chunk.metadata["stop"] = (
-                times[0][0],
-                times[-1][-1],
-            )
-        else:
-            chunk.metadata["start"], chunk.metadata["stop"] = None, None
-
-    docs_chunks_semantic[0].metadata.keys()
-    return docs_chunks_semantic
+from pstuts_rag.utils import EmbeddingsAPISelector, flatten, batch
 
 
 class DatastoreManager:
@@ -315,17 +102,45 @@ class DatastoreManager:
 
         self.docs = []
 
-    async def from_json_globs(self, globs: List[str]) -> int:
+    async def from_json_globs(self, globs: List[str] | str) -> int:
+        """
+        Populate the vector database with processed video transcript documents,
+        retrieved from JSON file paths.
+
+        This method performs the complete pipeline:
+        - loading JSON transcripts
+        - semantic chunking with timestamp preservation
+        -
+
+        """
         logging.debug("Starting to load files.")
-
-
-
+        files = globs_to_paths(globs)
+
+        tasks = [load_json_file(f) for f in files]
+        results = await asyncio.gather(*tasks)
+
+        json_transcripts = list(flatten(results))
+        logging.debug("Received %d JSON files.", len(json_transcripts))
+
+        # perform chunking
+        self.docs: List[Document] = await chunk_transcripts(
+            json_transcripts=json_transcripts,
+            semantic_chunker_embedding_model=self.embeddings,
+        )
+
+        count = await self.embed_chunks(self.docs)
         logging.debug("Uploaded %d records.", count)
 
-
+        self.loading_complete.set()
+        # Execute callbacks (both sync and async)
+        for callback in self._completion_callbacks:
+            if asyncio.iscoroutinefunction(callback):
+                await callback()
+            else:
+                callback()
 
-    async def
+    async def embed_chunks(self, chunked_documents: List[Document]) -> int:
         """
         Populate the vector database with processed video transcript documents.
 
@@ -345,12 +160,6 @@
             Exception: If embedding generation or database upload fails
         """
 
-        # perform chunking
-        self.docs: List[Document] = await chunk_transcripts(
-            json_transcripts=raw_docs,
-            semantic_chunker_embedding_model=self.embeddings,
-        )
-
         # perform embedding
 
         vector_batches = await asyncio.gather(
@@ -358,7 +167,7 @@
                 self.embeddings.aembed_documents(
                     [c.page_content for c in chunk_batch]
                 )
-                for chunk_batch in batch(
+                for chunk_batch in batch(chunked_documents, 8)
             ]
         )
         vectors = []
@@ -375,7 +184,7 @@
                     "metadata": doc.metadata,
                 },
             )
-            for id, vector, doc in zip(ids, vectors,
+            for id, vector, doc in zip(ids, vectors, chunked_documents)
         ]
 
         # upload qdrant payload
@@ -384,14 +193,6 @@
             points=points,
         )
 
-        self.loading_complete.set()
-        # Execute callbacks (both sync and async)
-        for callback in self._completion_callbacks:
-            if asyncio.iscoroutinefunction(callback):
-                await callback()
-            else:
-                callback()
-
        return len(points)
 
     def count_docs(self) -> int:
@@ -479,7 +280,7 @@
         return False
 
 
-async def
+async def load_json_file(filepath: Path):
     """
     Asynchronously load and parse a single JSON file containing video data.
@@ -502,53 +303,164 @@ async def load_single_json(filepath: str):
     Note:
         Uses async file I/O for better performance when loading multiple files
     """
-    my_path = Path(filepath)
 
-    async with aiofiles.open(
+    async with aiofiles.open(filepath, mode="r", encoding="utf-8") as f:
        content = await f.read()
        payload = json.loads(content)
        for entry in payload:
-            entry.update({"group": str(
+            entry.update({"group": str(filepath)})
        return payload
 
 
-    """
-
-    Args:
-
-        patterns with ** for subdirectory traversal.
-
-    Returns:
-        List
-        with each video containing its source group information
-
-    Raises:
-        FileNotFoundError: If any matched file doesn't exist during loading
-        json.JSONDecodeError: If any file content is not valid JSON format
-        PermissionError: If any file cannot be read due to permissions
-
-    Example:
-        >>> patterns = ["data/*.json", "archive/**/*.json"]
-        >>> videos = await load_json_files(patterns)
-        >>> len(videos)  # Total videos from all matched files
-    """
-    logging.debug("Loading from %d globs:", len(glob_list))
-
-    logging.debug("Total files: %d", len(files))
-
+def globs_to_paths(glob_list: List[str] | str) -> List[Path]:
+
+    if isinstance(glob_list, str):
+        glob_list = glob_list.split(":")
+
+    logging.debug("Loading from %d globs:", len(glob_list))
+
+    files: List[Path] = []
+    for globstring in glob_list:
+        logging.debug("Loading glob: %s", globstring)
+        new_files = [Path(f) for f in glob.glob(globstring, recursive=True)]
+        files.extend(filter(lambda f: f.exists(), new_files))
+
+    logging.debug("Total files: %d", len(files))
+
+    return files
+
+
+def load_transcripts_whole(json_payload: List[Dict]) -> Iterator[Document]:
+    """
+    Lazy loader that yields Document objects with concatenated transcripts.
+
+    Creates one Document per video with all transcript sentences joined by newlines.
+    Metadata includes all video fields except 'transcripts' and 'qa'.
+    The 'url' field is renamed to 'source' for LangChain compatibility.
+
+    Yields:
+        Document: LangChain Document with page_content as concatenated transcript
+            and metadata containing video information
+    """
+
+    for video in json_payload:
+        metadata = dict(video)
+        metadata.pop("transcripts", None)
+        metadata.pop("qa", None)
+        # Rename 'url' key to 'source' in metadata if it exists
+        if "url" in metadata:
+            metadata["source"] = metadata.pop("url")
+        yield Document(
+            page_content="\n".join(t["sent"] for t in video["transcripts"]),
+            metadata=metadata,
+        )
+
+
+def load_transcripts_segments(
+    json_payload: List[Dict],
+) -> Iterator[Document]:
+    """
+    Lazy loader that yields individual Document objects for each transcript segment.
+
+    Creates one Document per transcript segment with timing metadata.
+    Each document contains a single transcript sentence with precise start/end times.
+    The 'url' field is renamed to 'source' for LangChain compatibility.
+
+    Yields:
+        Document: LangChain Document with page_content as single transcript sentence
+            and metadata containing video info plus time_start and time_end
+    """
+
+    for video in json_payload:
+        metadata = dict(video)
+        transcripts = metadata.pop("transcripts", None)
+        metadata.pop("qa", None)
+        # Rename 'url' key to 'source' in metadata if it exists
+        if "url" in metadata:
+            metadata["source"] = metadata.pop("url")
+        for transcript in transcripts:
+            yield Document(
+                page_content=transcript["sent"],
+                metadata=metadata
+                | {
+                    "time_start": transcript["begin"],
+                    "time_end": transcript["end"],
+                },
+            )
+
+
+async def chunk_transcripts(
+    json_transcripts: List[Dict[str, Any]],
+    semantic_chunker_embedding_model: Embeddings = OpenAIEmbeddings(
+        model="text-embedding-3-small"
+    ),
+) -> List[Document]:
+    """
+    Load and process video transcripts into semantically chunked documents.
+
+    This function takes a list of transcript dictionaries, loads them as both full
+    transcripts and individual chunks, then applies semantic chunking. It also
+    enriches each semantic chunk with timestamp metadata from the original verbatim chunks.
+
+    Args:
+        json_transcripts: List of dictionaries containing video transcript data
+        embeddings: OpenAI embeddings model to use for semantic chunking
+
+    Returns:
+        List of semantically chunked Document objects with enhanced metadata
+    """
+
+    docs_full_transcript: List[Document] = list(
+        load_transcripts_whole(json_transcripts)
+    )
+    docs_chunks_verbatim: List[Document] = list(
+        load_transcripts_segments(json_transcripts)
+    )
+
+    # semantically split the combined transcript
+    text_splitter = SemanticChunker(semantic_chunker_embedding_model)
+    docs_group = await asyncio.gather(
+        *[
+            text_splitter.atransform_documents(d)
+            for d in batch(docs_full_transcript, size=2)
+        ]
+    )
+    # Flatten the nested list of documents
+    docs_chunks_semantic: List[Document] = []
+    for group in docs_group:
+        docs_chunks_semantic.extend(group)
+
+    # Create a lookup dictionary for faster access
+    video_id_to_chunks: Dict[int, List[Document]] = {}
+    for chunk in docs_chunks_verbatim:
+        video_id: int = chunk.metadata["video_id"]
+        if video_id not in video_id_to_chunks:
+            video_id_to_chunks[video_id] = []
+        video_id_to_chunks[video_id].append(chunk)
+
+    for chunk in docs_chunks_semantic:
+        video_id = chunk.metadata["video_id"]
+        # Only check chunks from the same video
+        potential_subchunks = video_id_to_chunks.get(video_id, [])
+        subchunks = [
+            c
+            for c in potential_subchunks
+            if c.page_content in chunk.page_content
+        ]
+
+        times = [
+            (t.metadata["time_start"], t.metadata["time_end"])
+            for t in subchunks
+        ]
+        chunk.metadata["speech_start_stop_times"] = times
+
+        if times:  # Avoid IndexError if times is empty
+            chunk.metadata["start"], chunk.metadata["stop"] = (
+                times[0][0],
+                times[-1][-1],
+            )
+        else:
+            chunk.metadata["start"], chunk.metadata["stop"] = None, None
+
+    docs_chunks_semantic[0].metadata.keys()
+    return docs_chunks_semantic
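`batch` moves out of this module, and `flatten` arrives alongside it in `pstuts_rag.utils`. Their bodies are not part of this diff; the sketch below reproduces the removed `batch` verbatim and guesses at `flatten` from its call site, `list(flatten(results))`, applied to the list-of-lists returned by `asyncio.gather` over `load_json_file` calls:

from itertools import chain
from typing import Any, Iterable, Iterator, List

def batch(iterable: List[Any], size: int = 16) -> Iterator[List[Any]]:
    # Successive chunks of at most `size` elements:
    # list(batch([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]
    for i in range(0, len(iterable), size):
        yield iterable[i : i + size]

def flatten(nested: Iterable[Iterable[Any]]) -> Iterator[Any]:
    # One level of flattening, enough to merge per-file payload lists
    # into a single stream of video dictionaries. (Assumed implementation.)
    return chain.from_iterable(nested)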
pstuts_rag/pstuts_rag/graph.py
CHANGED
@@ -30,7 +30,7 @@ from app import (
     enter_chain,
 )
 
-from pstuts_rag.rag_for_transcripts import
+from pstuts_rag.rag_for_transcripts import create_transcript_rag_chain
 
 
 def search_agent(state: PsTutsTeamState, chain: Runnable) -> Dict:
@@ -244,7 +244,7 @@ async def build_the_graph(current_state: ApplicationState):
     )
 
     rag_node, _ = create_rag_node(
-        rag_chain=
+        rag_chain=create_transcript_rag_chain(),
        name=VIDEOARCHIVE,
    )
pstuts_rag/pstuts_rag/rag.py
CHANGED
@@ -248,7 +248,7 @@ class RAGChainInstance:
             qdrant_client=self.qdrant_client, name=self.name
         )
         if self.datastore_manager.count_docs() == 0:
-            self.pointsLoaded = await self.datastore_manager.
+            self.pointsLoaded = await self.datastore_manager.embed_chunks(
                raw_docs=json_payload
            )
            logging.info(
pstuts_rag/pstuts_rag/rag_for_transcripts.py
CHANGED
@@ -18,7 +18,7 @@ from langchain_ollama import ChatOllama
 
 from .datastore import DatastoreManager
 from .prompts import RAG_PROMPT_TEMPLATES
-
+from pstuts_rag.utils import ChatAPISelector
 from pstuts_rag.configuration import Configuration, ModelAPI
 
 
@@ -37,6 +37,7 @@ def pack_references(msg_dict: Dict[str, Any]) -> AIMessage:
     answer: AIMessage = msg_dict["answer"]
     input = msg_dict["input"]
 
+    # Extract relevant metadata from each document in the context
     reference_dicts = [
         {k: doc.metadata[k] for k in ("title", "source", "start", "stop")}
         for doc in input["context"]
@@ -44,11 +45,13 @@ def pack_references(msg_dict: Dict[str, Any]) -> AIMessage:
     references = str(json.dumps(reference_dicts, indent=2))
 
     text_w_references = answer.content
+    # Only append references if the model provided a substantive answer
     if "I don't know" not in answer.content:
         text_w_references = "\n".join(
             [str(text_w_references), "**REFERENCES**", references]
         )
 
+    # Create new message with references and preserve original context metadata
     output: AIMessage = answer.model_copy(
         update={
             "content": text_w_references,
@@ -63,88 +66,60 @@ def pack_references(msg_dict: Dict[str, Any]) -> AIMessage:
     return output
 
 
-def
+def create_transcript_rag_chain(
     datastore: DatastoreManager,
     config: Union[RunnableConfig, Configuration] = Configuration(),
 ) -> Runnable:
+    """Create a Retrieval-Augmented Generation (RAG) chain for video transcript search.
+
+    This function constructs a complete RAG pipeline that:
+    1. Takes a user question as input
+    2. Retrieves relevant video transcript chunks from the datastore
+    3. Generates an answer using an LLM with the retrieved context
+    4. Packages the response with reference information
+
+    Args:
+        datastore: The DatastoreManager containing video transcript embeddings
+        config: Configuration object or RunnableConfig with model and retrieval settings
+
+    Returns:
+        Runnable: A LangChain runnable that processes questions and returns
+            answers with embedded references to source video segments
+    """
 
+    # Handle both Configuration objects and RunnableConfig dictionaries
     configurable = (
        config
        if isinstance(config, Configuration)
        else Configuration.from_runnable_config(config)
    )
 
-
-
-        ModelAPI.OPENAI: ChatOpenAI,
-        ModelAPI.OLLAMA: ChatOllama,
-    }.get(configurable.llm_api, ChatOpenAI)
+    # Select the appropriate chat model class based on configuration
+    cls = ChatAPISelector.get(configurable.llm_api, ChatOpenAI)
 
     llm = cls(model=configurable.llm_tool_model)
 
+    # Create the answer generation chain using prompt templates
     answer_chain = (
        ChatPromptTemplate.from_messages(list(RAG_PROMPT_TEMPLATES.items()))
        | llm
    )
 
+    # Build the complete RAG chain with the following flow:
+    # question -> parallel(context_retrieval, question_passthrough) -> llm_answer -> pack_references
     rag_chain = (
-        itemgetter("question")
-        | RunnableParallel(
+        itemgetter("question")  # Extract question from input dict
+        | RunnableParallel(  # Run context retrieval and question passing in parallel
            context=datastore.get_retriever(
                n_context_docs=configurable.n_context_docs
            ),
-            question=RunnablePassthrough(),
+            question=RunnablePassthrough(),  # Pass question unchanged
        )
-        | {
-            "input": RunnablePassthrough(),
-            "answer": answer_chain,
+        | {  # Prepare input dict for final processing
+            "input": RunnablePassthrough(),  # Contains both context and question
+            "answer": answer_chain,  # Generate answer using retrieved context
        }
-        | pack_references
+        | pack_references  # Add reference metadata to the final response
    )
 
    return rag_chain
-
-
-def startup(
-    config=Configuration(),
-    callback_on_loading_complete: Optional[Callable] = None,
-):
-    """
-    Initialize the application with optional loading completion callback.
-
-    Args:
-        config: Configuration object with application settings
-        on_loading_complete: Optional callback (sync or async) to call when
-            datastore loading completes
-
-    Returns:
-        DatastoreManager: The initialized datastore manager
-    """
-
-    ### PROCESS THE CONFIGURATION
-    log_level = getattr(logging, config.eva_log_level, logging.INFO)
-    logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s")
-
-    ### CREATE THE DATABASE
-
-    datastore = DatastoreManager()
-    if callback_on_loading_complete:
-        datastore.add_completion_callback(callback_on_loading_complete)
-
-    ### START DATABASE POPULATION
-
-    globs = [str(g) for g in config.transcript_glob.split(":")]
-
-    # # Add custom callback if provided, otherwise use default logging
-    # if on_loading_complete:
-    #     datastore.add_completion_callback(on_loading_complete)
-    # else:
-    #     # Default callback for logging
-    #     def default_logging_callback():
-    #         logging.info("🎉 Datastore loading completed!")
-
-    #     datastore.add_completion_callback(default_logging_callback)
-
-    asyncio.create_task(datastore.from_json_globs(globs))
-
-    return datastore
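End to end, the renamed `create_transcript_rag_chain` is invoked with a dict holding a `question` key (the chain opens with `itemgetter("question")`) and returns an `AIMessage` whose content carries the **REFERENCES** block appended by `pack_references`. A usage sketch under those assumptions:

from pstuts_rag.datastore import DatastoreManager       # assumed import paths
from pstuts_rag.rag_for_transcripts import create_transcript_rag_chain

datastore = DatastoreManager()   # assumed to be already populated
chain = create_transcript_rag_chain(datastore)

msg = chain.invoke({"question": "What is a layer?"})
# msg.content holds the answer; unless the model answered "I don't know",
# it ends with a **REFERENCES** JSON list of {title, source, start, stop}.
print(msg.content)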