mbudisic committed
Commit c419378 · 1 Parent(s): cf7b605

Videos displayed; streaming responses

app.py CHANGED
@@ -11,6 +11,7 @@ from langchain_core.documents import Document
 from langchain_core.language_models import BaseChatModel
 from langchain_core.runnables import Runnable
 from langchain_qdrant import QdrantVectorStore
+from pstuts_rag.loader import load_json_files, load_single_json
 from qdrant_client import QdrantClient
 from dataclasses import dataclass

@@ -19,7 +20,7 @@ import pstuts_rag.rag, pstuts_rag.datastore

 @dataclass
 class ApplicationParameters:
-    filename = "data/test.json"
+    filename = [f"data/{f}.json" for f in ["dev"]]
     embedding_model = "text-embedding-3-small"
     n_context_docs = 2
     llm_model = "gpt-4.1-mini"
@@ -42,6 +43,9 @@ class ApplicationState:
     llm: BaseChatModel
     rag_chain: Runnable

+    hasLoaded: asyncio.Event = asyncio.Event()
+    pointsLoaded: int = 0
+
     def __init__(self) -> None:
         load_dotenv()
         set_api_key_if_not_present("OPENAI_API_KEY")
@@ -53,8 +57,13 @@ params = ApplicationParameters()

 async def fill_the_db():
     if state.datastore_manager.count_docs() == 0:
-        data: List[Dict[str, Any]] = json.load(open(params.filename, "rb"))
-        await state.datastore_manager.populate_database(raw_docs=data)
+        data: List[Dict[str, Any]] = await load_json_files(params.filename)
+        state.pointsLoaded = await state.datastore_manager.populate_database(
+            raw_docs=data
+        )
+        await cl.Message(
+            content=f"✅ The database has been loaded with {state.pointsLoaded} elements!"
+        ).send()


 async def build_the_chain():
@@ -80,10 +89,36 @@ async def on_chat_start():
 @cl.on_message
 async def main(message: cl.Message):
     # Send a response back to the user
-
+    msg = cl.Message(content="")
     response = await state.rag_chain.ainvoke({"question": message.content})

-    await cl.Message(content=response.content).send()
+    text, references = pstuts_rag.rag.RAGChainFactory.unpack_references(
+        response.content
+    )
+    if isinstance(text, str):
+        for token in [char for char in text]:
+            await msg.stream_token(token)
+
+    await msg.send()
+
+    references = json.loads(references)
+    print(references)
+
+    msg_references = [
+        (
+            f"Watch {ref["title"]} from timestamp "
+            f"{round(ref["start"] // 60)}m:{round(ref["start"] % 60)}s",
+            cl.Video(
+                name=ref["title"],
+                url=f"{ref["source"]}#t={ref["start"]}",
+                display="side",
+            ),
+        )
+        for ref in references
+    ]
+    await cl.Message(content="Related videos").send()
+    for e in msg_references:
+        await cl.Message(content=e[0], elements=[e[1]]).send()


 if __name__ == "__main__":
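Note on the new `main()` handler: the reference block returned by `unpack_references` is a JSON array of `{title, source, start, stop}` entries (see the notebook output further down), and each video link gets a media-fragment `#t=<seconds>` suffix so playback starts at the cited timestamp. A minimal sketch of that payload handling; the URL below is a placeholder, not a real asset:

```python
# Minimal sketch (not part of the commit): how main() consumes the references.
import json

references_json = """[
  {"title": "Understand layers",
   "source": "https://example.com/understand-layers.mp4",
   "start": 85.75, "stop": 152.97}
]"""

for ref in json.loads(references_json):
    label = (
        f"Watch {ref['title']} from timestamp "
        f"{round(ref['start'] // 60)}m:{round(ref['start'] % 60)}s"
    )
    # Media fragment: '#t=85.75' asks the player to start 85.75 s into the clip.
    url = f"{ref['source']}#t={ref['start']}"
    print(label, "->", url)
```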
notebooks/transcript_rag.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
@@ -15,7 +15,7 @@
15
  },
16
  {
17
  "cell_type": "code",
18
- "execution_count": 2,
19
  "metadata": {},
20
  "outputs": [],
21
  "source": [
@@ -24,9 +24,18 @@
24
  },
25
  {
26
  "cell_type": "code",
27
- "execution_count": 3,
28
  "metadata": {},
29
- "outputs": [],
 
 
 
 
 
 
 
 
 
30
  "source": [
31
  "%load_ext autoreload\n",
32
  "%autoreload 2\n"
@@ -34,7 +43,7 @@
34
  },
35
  {
36
  "cell_type": "code",
37
- "execution_count": 4,
38
  "metadata": {},
39
  "outputs": [],
40
  "source": [
@@ -50,7 +59,7 @@
50
  },
51
  {
52
  "cell_type": "code",
53
- "execution_count": 5,
54
  "metadata": {},
55
  "outputs": [],
56
  "source": [
@@ -78,15 +87,47 @@
78
  },
79
  {
80
  "cell_type": "code",
81
- "execution_count": 6,
82
  "metadata": {},
83
  "outputs": [],
84
  "source": [
85
  "from ast import Dict\n",
86
  "import json\n",
87
- "filename = \"../data/test.json\"\n",
 
 
88
  "from typing import List, Dict, Any\n",
89
- "data:List[Dict[str,Any]] = json.load(open(filename, \"rb\"))\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  ]
91
  },
92
  {
@@ -105,7 +146,7 @@
105
  },
106
  {
107
  "cell_type": "code",
108
- "execution_count": 7,
109
  "metadata": {},
110
  "outputs": [],
111
  "source": [
@@ -139,7 +180,7 @@
139
  },
140
  {
141
  "cell_type": "code",
142
- "execution_count": 8,
143
  "metadata": {},
144
  "outputs": [],
145
  "source": [
@@ -150,7 +191,7 @@
150
  },
151
  {
152
  "cell_type": "code",
153
- "execution_count": 9,
154
  "metadata": {},
155
  "outputs": [],
156
  "source": [
@@ -161,7 +202,7 @@
161
  },
162
  {
163
  "cell_type": "code",
164
- "execution_count": 10,
165
  "metadata": {},
166
  "outputs": [],
167
  "source": [
@@ -171,7 +212,7 @@
171
  },
172
  {
173
  "cell_type": "code",
174
- "execution_count": 11,
175
  "metadata": {},
176
  "outputs": [],
177
  "source": [
@@ -180,19 +221,19 @@
180
  },
181
  {
182
  "cell_type": "code",
183
- "execution_count": 12,
184
  "metadata": {},
185
  "outputs": [
186
  {
187
  "data": {
188
  "text/plain": [
189
  "{'refusal': None,\n",
190
- " 'context': [Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[0.47, 3.41], [3.81, 9.13], [9.309999, 15.01], [15.299999, 20.57], [20.88, 23.3], [23.83, 27.93], [29.38, 32.79], [32.96, 33.92], [34.43, 40.21], [41.91, 45.37], [45.88, 49.01], [49.54, 55.130001], [55.72, 58.49], [58.72, 62.14]], 'start': 0.47, 'stop': 62.14, '_id': 21, '_collection_name': 'local_test'}, page_content=\"Layers are the building blocks of any image in Photoshop CC. So, it's important to understand, what layers are and why to use them - which we'll cover in this video. If you're following along, open this layered image from the downloadable practice files for this tutorial. You might think of layers like separate flat pints of glass, stacked one on top of the other. Each layer contains separate pieces of content. To get a sense of how layers are constructed, let's take a look at this Layers panel. I've closed my other panels, so that we can focus on the Layers panel. But you can skip that. By the way: If your Layers panel isn't showing, go up to the Window menu and choose Layers from there. The Layers panel is where you go to select and work with layers. In this image there are 4 layers, each with separate content. If you click the Eye icon to the left of a layer, you can toggle the visibility of that layer off and on. So, I'm going to turn off the visibility of the tailor layer. And keep your eye on the image, so you can see what's on that layer.\"),\n",
191
- " Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[85.75, 88.659999], [89.42, 100.11], [101.469999, 108.64], [109.09, 117.459999], [117.75, 129.45], [129.97, 133.37], [133.73, 143.98], [144.76, 152.97]], 'start': 85.75, 'stop': 152.97, '_id': 23, '_collection_name': 'local_test'}, page_content=\"Now let's take a look at just one layer, the tailor layer. A quick way to turn off all the layers except the tailor layer, is to hold down the Option key on the Mac, or the ALT key on the PC, and click on the Eye icon to the left of the tailor layer. In the Document window, you can see that this layer contains just the one small photo surrounded by a gray and white checkerboard pattern. That pattern represents transparent pixels, which allow us to see down through the corresponding part of this layer to the content of the layers below. So, let's turn that content back on by going back to the Layers panel, again holding the Option key on the Mac or the ALT key on the PC and clicking on the Eye icon to the left of the tailor layer. And all the other layers and their Eye icons come back into view. So again: You might think of layers like a stack of pints of glass, each with its own artwork and in some cases transparent areas that let you see down through to the layers below. The biggest benefit of having items on separate layers like this, is that you'll be able to edit pieces of an image independently without affecting the rest of the image.\")],\n",
192
  " 'question': 'What are layers'}"
193
  ]
194
  },
195
- "execution_count": 12,
196
  "metadata": {},
197
  "output_type": "execute_result"
198
  }
@@ -203,7 +244,7 @@
203
  },
204
  {
205
  "cell_type": "code",
206
- "execution_count": 13,
207
  "metadata": {},
208
  "outputs": [
209
  {
@@ -212,7 +253,7 @@
212
  "text": [
213
  "==================================\u001b[1m Ai Message \u001b[0m==================================\n",
214
  "\n",
215
- "Layers are the building blocks of any image in Photoshop CC. You can think of layers like separate flat panes of glass stacked on top of each other, with each layer containing separate pieces of content. Some parts of a layer can be transparent, allowing you to see through to the layers below. This setup lets you edit parts of an image independently without affecting the rest of the image. You work with layers in the Layers panel, where you can toggle their visibility on and off. (See explanation around 0:28 to 1:00 and 1:25 to 2:32) 🎨🖼️\n",
216
  "**References**:\n",
217
  "[\n",
218
  " {\n",
@@ -237,16 +278,16 @@
237
  },
238
  {
239
  "cell_type": "code",
240
- "execution_count": 14,
241
  "metadata": {},
242
  "outputs": [
243
  {
244
  "data": {
245
  "text/plain": [
246
- "'Layers are the building blocks of any image in Photoshop CC. You can think of layers like separate flat panes of glass stacked on top of each other, with each layer containing separate pieces of content. Some parts of a layer can be transparent, allowing you to see through to the layers below. This setup lets you edit parts of an image independently without affecting the rest of the image. You work with layers in the Layers panel, where you can toggle their visibility on and off. (See explanation around 0:28 to 1:00 and 1:25 to 2:32) 🎨🖼️\\n**References**:\\n[\\n {\\n \"title\": \"Understand layers\",\\n \"source\": \"https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4\",\\n \"start\": 0.47,\\n \"stop\": 62.14\\n },\\n {\\n \"title\": \"Understand layers\",\\n \"source\": \"https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4\",\\n \"start\": 85.75,\\n \"stop\": 152.97\\n }\\n]'"
247
  ]
248
  },
249
- "execution_count": 14,
250
  "metadata": {},
251
  "output_type": "execute_result"
252
  }
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 36,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
 
15
  },
16
  {
17
  "cell_type": "code",
18
+ "execution_count": 37,
19
  "metadata": {},
20
  "outputs": [],
21
  "source": [
 
24
  },
25
  {
26
  "cell_type": "code",
27
+ "execution_count": 38,
28
  "metadata": {},
29
+ "outputs": [
30
+ {
31
+ "name": "stdout",
32
+ "output_type": "stream",
33
+ "text": [
34
+ "The autoreload extension is already loaded. To reload it, use:\n",
35
+ " %reload_ext autoreload\n"
36
+ ]
37
+ }
38
+ ],
39
  "source": [
40
  "%load_ext autoreload\n",
41
  "%autoreload 2\n"
 
43
  },
44
  {
45
  "cell_type": "code",
46
+ "execution_count": 39,
47
  "metadata": {},
48
  "outputs": [],
49
  "source": [
 
59
  },
60
  {
61
  "cell_type": "code",
62
+ "execution_count": 40,
63
  "metadata": {},
64
  "outputs": [],
65
  "source": [
 
87
  },
88
  {
89
  "cell_type": "code",
90
+ "execution_count": 53,
91
  "metadata": {},
92
  "outputs": [],
93
  "source": [
94
  "from ast import Dict\n",
95
  "import json\n",
96
+ "\n",
97
+ "from pstuts_rag.loader import load_json_files\n",
98
+ "filename = [\"../data/test.json\",\"../data/dev.json\"]\n",
99
  "from typing import List, Dict, Any\n",
100
+ "data:List[Dict[str,Any]] = await load_json_files(filename)\n"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 56,
106
+ "metadata": {},
107
+ "outputs": [
108
+ {
109
+ "data": {
110
+ "text/plain": [
111
+ "['Get organized with layer groups',\n",
112
+ " 'Remove unwanted objects from photos',\n",
113
+ " 'Include vector graphics',\n",
114
+ " 'Remove unwanted content',\n",
115
+ " 'Add a central element',\n",
116
+ " 'Set the resolution',\n",
117
+ " 'Understand layers',\n",
118
+ " 'Adjust brightness and contrast',\n",
119
+ " 'Remove a large object',\n",
120
+ " 'Add text',\n",
121
+ " 'Replace a background using a layer mask']"
122
+ ]
123
+ },
124
+ "execution_count": 56,
125
+ "metadata": {},
126
+ "output_type": "execute_result"
127
+ }
128
+ ],
129
+ "source": [
130
+ "[ d[\"title\"] for d in data ]"
131
  ]
132
  },
133
  {
 
146
  },
147
  {
148
  "cell_type": "code",
149
+ "execution_count": 43,
150
  "metadata": {},
151
  "outputs": [],
152
  "source": [
 
180
  },
181
  {
182
  "cell_type": "code",
183
+ "execution_count": 44,
184
  "metadata": {},
185
  "outputs": [],
186
  "source": [
 
191
  },
192
  {
193
  "cell_type": "code",
194
+ "execution_count": 45,
195
  "metadata": {},
196
  "outputs": [],
197
  "source": [
 
202
  },
203
  {
204
  "cell_type": "code",
205
+ "execution_count": 46,
206
  "metadata": {},
207
  "outputs": [],
208
  "source": [
 
212
  },
213
  {
214
  "cell_type": "code",
215
+ "execution_count": 47,
216
  "metadata": {},
217
  "outputs": [],
218
  "source": [
 
221
  },
222
  {
223
  "cell_type": "code",
224
+ "execution_count": 48,
225
  "metadata": {},
226
  "outputs": [
227
  {
228
  "data": {
229
  "text/plain": [
230
  "{'refusal': None,\n",
231
+ " 'context': [Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[0.47, 3.41], [3.81, 9.13], [9.309999, 15.01], [15.299999, 20.57], [20.88, 23.3], [23.83, 27.93], [29.38, 32.79], [32.96, 33.92], [34.43, 40.21], [41.91, 45.37], [45.88, 49.01], [49.54, 55.130001], [55.72, 58.49], [58.72, 62.14]], 'start': 0.47, 'stop': 62.14, '_id': 21, '_collection_name': 'local_test'}, page_content=\"Layers are the building blocks of any image in Photoshop CC. So, it's important to understand, what layers are and why to use them - which we'll cover in this video. If you're following along, open this layered image from the downloadable practice files for this tutorial. You might think of layers like separate flat pints of glass, stacked one on top of the other. Each layer contains separate pieces of content. To get a sense of how layers are constructed, let's take a look at this Layers panel. I've closed my other panels, so that we can focus on the Layers panel. But you can skip that. By the way: If your Layers panel isn't showing, go up to the Window menu and choose Layers from there. The Layers panel is where you go to select and work with layers. In this image there are 4 layers, each with separate content. If you click the Eye icon to the left of a layer, you can toggle the visibility of that layer off and on. So, I'm going to turn off the visibility of the tailor layer. And keep your eye on the image, so you can see what's on that layer.\"),\n",
232
+ " Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[85.75, 88.659999], [89.42, 100.11], [101.469999, 108.64], [109.09, 117.459999], [117.75, 129.45], [129.97, 133.37], [133.73, 143.98], [144.76, 152.97]], 'start': 85.75, 'stop': 152.97, '_id': 23, '_collection_name': 'local_test'}, page_content=\"Now let's take a look at just one layer, the tailor layer. A quick way to turn off all the layers except the tailor layer, is to hold down the Option key on the Mac, or the ALT key on the PC, and click on the Eye icon to the left of the tailor layer. In the Document window, you can see that this layer contains just the one small photo surrounded by a gray and white checkerboard pattern. That pattern represents transparent pixels, which allow us to see down through the corresponding part of this layer to the content of the layers below. So, let's turn that content back on by going back to the Layers panel, again holding the Option key on the Mac or the ALT key on the PC and clicking on the Eye icon to the left of the tailor layer. And all the other layers and their Eye icons come back into view. So again: You might think of layers like a stack of pints of glass, each with its own artwork and in some cases transparent areas that let you see down through to the layers below. The biggest benefit of having items on separate layers like this, is that you'll be able to edit pieces of an image independently without affecting the rest of the image.\")],\n",
233
  " 'question': 'What are layers'}"
234
  ]
235
  },
236
+ "execution_count": 48,
237
  "metadata": {},
238
  "output_type": "execute_result"
239
  }
 
244
  },
245
  {
246
  "cell_type": "code",
247
+ "execution_count": 49,
248
  "metadata": {},
249
  "outputs": [
250
  {
 
253
  "text": [
254
  "==================================\u001b[1m Ai Message \u001b[0m==================================\n",
255
  "\n",
256
+ "Layers are the building blocks of any image in Photoshop CC. You can think of layers like separate flat panes of glass stacked on top of each other. Each layer contains separate pieces of content. Some parts of a layer can be transparent, allowing you to see through to the layers below. This setup lets you edit parts of an image independently without affecting the rest of the image. You manage and work with layers in the Layers panel, where you can toggle their visibility on and off using the Eye icon. (See explanation around 0:281:00 and 1:252:32) 🎨🖼️\n",
257
  "**References**:\n",
258
  "[\n",
259
  " {\n",
 
278
  },
279
  {
280
  "cell_type": "code",
281
+ "execution_count": 50,
282
  "metadata": {},
283
  "outputs": [
284
  {
285
  "data": {
286
  "text/plain": [
287
+ "'Layers are the building blocks of any image in Photoshop CC. You can think of layers like separate flat panes of glass stacked on top of each other. Each layer contains separate pieces of content. Some parts of a layer can be transparent, allowing you to see through to the layers below. This setup lets you edit parts of an image independently without affecting the rest of the image. You manage and work with layers in the Layers panel, where you can toggle their visibility on and off using the Eye icon. (See explanation around 0:281:00 and 1:252:32) 🎨🖼️\\n**References**:\\n[\\n {\\n \"title\": \"Understand layers\",\\n \"source\": \"https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4\",\\n \"start\": 0.47,\\n \"stop\": 62.14\\n },\\n {\\n \"title\": \"Understand layers\",\\n \"source\": \"https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4\",\\n \"start\": 85.75,\\n \"stop\": 152.97\\n }\\n]'"
288
  ]
289
  },
290
+ "execution_count": 50,
291
  "metadata": {},
292
  "output_type": "execute_result"
293
  }
pstuts_rag/pstuts_rag/datastore.py CHANGED
@@ -159,7 +159,7 @@ class DatastoreManager:

         self.docs = []

-    async def populate_database(self, raw_docs: List[Dict[str, Any]]):
+    async def populate_database(self, raw_docs: List[Dict[str, Any]]) -> int:

         # perform chunking
         self.docs: List[Document] = await chunk_transcripts(
@@ -200,6 +200,8 @@ class DatastoreManager:
             points=points,
         )

+        return len(points)
+
     def count_docs(self) -> int:
         try:
             count = self.qdrant_client.get_collection(self.name).points_count
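`populate_database` now reports how many points were upserted, which `app.py` uses for its load-confirmation message. A hedged usage sketch; the `manager` and `raw_docs` names are illustrative, not from the commit:

```python
# Hedged sketch: consuming the new int return value of populate_database().
# 'manager' is any DatastoreManager instance, 'raw_docs' the parsed JSON payload.
async def load_and_report(manager, raw_docs) -> int:
    n_points = 0
    if manager.count_docs() == 0:
        n_points = await manager.populate_database(raw_docs=raw_docs)
        print(f"Upserted {n_points} points into the collection")
    return n_points
```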
pstuts_rag/pstuts_rag/loader.py CHANGED
@@ -1,7 +1,34 @@
+import glob
+import json
 from langchain_core.document_loaders import BaseLoader
 from typing import List, Dict, Iterator
 from langchain_core.documents import Document

+import aiofiles
+import asyncio
+from pathlib import Path
+
+
+async def load_single_json(filepath):
+    my_path = Path(filepath)
+
+    async with aiofiles.open(my_path, mode="r", encoding="utf-8") as f:
+        content = await f.read()
+    payload: List[Dict] = json.loads(content)
+    [video.update({"group": my_path.name}) for video in payload]
+
+    return payload
+
+
+async def load_json_files(path_pattern: List[str]):
+    files = []
+    for f in path_pattern:
+        (files.extend(glob.glob(f, recursive=True)))
+
+    tasks = [load_single_json(f) for f in files]
+    results = await asyncio.gather(*tasks)
+    return [item for sublist in results for item in sublist]  # flatten
+

 class VideoTranscriptBulkLoader(BaseLoader):
     """Loads video transcripts as a bulk into documents"""
@@ -21,7 +48,9 @@ class VideoTranscriptBulkLoader(BaseLoader):
         if "url" in metadata:
             metadata["source"] = metadata.pop("url")
         yield Document(
-            page_content="\n".join(t["sent"] for t in video["transcripts"]),
+            page_content="\n".join(
+                t["sent"] for t in video["transcripts"]
+            ),
             metadata=metadata,
         )
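The new loader helpers glob each pattern, read the matched JSON files concurrently with `aiofiles`, tag every video record with a `group` key naming its source file, and flatten the results. A usage sketch mirroring the notebook cell; the file paths are assumed to exist:

```python
# Usage sketch (paths are assumptions): concurrent transcript loading.
import asyncio
from pstuts_rag.loader import load_json_files

async def demo():
    data = await load_json_files(["data/test.json", "data/dev.json"])
    # Each record carries the name of the file it came from.
    print(len(data), sorted({d["group"] for d in data}))

asyncio.run(demo())
```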
 
pstuts_rag/pstuts_rag/rag.py CHANGED
@@ -7,9 +7,10 @@ This module provides the core RAG functionality, including:

 import json
 from multiprocessing import Value
+import re
 import uuid
 from operator import itemgetter
-from typing import Dict, List, Any
+from typing import Dict, List, Any, Tuple

 from langchain_core.documents import Document
 from langchain_core.runnables import (
@@ -97,7 +98,7 @@ class RAGChainFactory:
         )

         text_w_references = "\n".join(
-            [answer.content, "**References**:", references]
+            [answer.content, "**REFERENCES**", references]
         )

         output: AIMessage = answer.model_copy(
@@ -113,6 +114,20 @@ class RAGChainFactory:

         return output

+    @staticmethod
+    def unpack_references(content: str) -> Tuple[str, str]:
+        parts = re.split(r"\*\*REFERENCES\*\*\s*", content, maxsplit=1)
+
+        if len(parts) == 2:
+            text = parts[0].rstrip()
+            references = parts[1].lstrip()
+            return text, references
+
+        else:
+            raise ValueError(
+                f"No '**REFERENCES**' section found in input:\n{content}"
+            )
+
     def __init__(
         self,
         retriever: VectorStoreRetriever,
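`unpack_references` splits on the `**REFERENCES**` marker that the chain now inserts between the answer text and the JSON reference block. A round-trip sketch with made-up strings:

```python
# Round-trip sketch (sample strings are illustrative, not from the commit).
from pstuts_rag.rag import RAGChainFactory

composed = "\n".join(
    [
        "Layers are the building blocks of any image.",
        "**REFERENCES**",
        '[{"title": "Understand layers", "start": 0.47, "stop": 62.14}]',
    ]
)

text, references = RAGChainFactory.unpack_references(composed)
assert text == "Layers are the building blocks of any image."
assert references.startswith("[")
```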