mbudisic committed on
Commit 3b978ee · 1 Parent(s): 9e054dd

dev: simplifying datastore functions

app_simple_rag.py CHANGED
@@ -59,7 +59,7 @@ params = ApplicationParameters()
59
  async def fill_the_db():
60
  if state.datastore_manager.count_docs() == 0:
61
  data: List[Dict[str, Any]] = await load_json_files(params.filename)
62
- state.pointsLoaded = await state.datastore_manager.populate_database(
63
  raw_docs=data
64
  )
65
  await cl.Message(
 
59
  async def fill_the_db():
60
  if state.datastore_manager.count_docs() == 0:
61
  data: List[Dict[str, Any]] = await load_json_files(params.filename)
62
+ state.pointsLoaded = await state.datastore_manager.embed_chunks(
63
  raw_docs=data
64
  )
65
  await cl.Message(
notebooks/transcript_rag.ipynb CHANGED
@@ -86,7 +86,7 @@
86
  "name": "stderr",
87
  "output_type": "stream",
88
  "text": [
89
- "2025-05-30 20:08:35,183 - INFO - <module> - Loaded .env file\n"
90
  ]
91
  }
92
  ],
@@ -103,16 +103,22 @@
103
  "name": "stderr",
104
  "output_type": "stream",
105
  "text": [
106
- "2025-05-30 20:08:36,978 - INFO - print - Configuration parameters:\n",
107
- "2025-05-30 20:08:36,980 - INFO - print - eva_workflow_name: EVA_workflow\n",
108
- "2025-05-30 20:08:36,980 - INFO - print - eva_log_level: INFO\n",
109
- "2025-05-30 20:08:36,981 - INFO - print - transcript_glob: ./data/dev.json:./data/test.json\n",
110
- "2025-05-30 20:08:36,982 - INFO - print - embedding_model: mxbai-embed-large\n",
111
- "2025-05-30 20:08:36,983 - INFO - print - embedding_api: ModelAPI.OLLAMA\n",
112
- "2025-05-30 20:08:36,984 - INFO - print - llm_api: ModelAPI.OLLAMA\n",
113
- "2025-05-30 20:08:36,985 - INFO - print - max_research_loops: 2\n",
114
- "2025-05-30 20:08:36,986 - INFO - print - llm_tool_model: deepseek-r1:8b\n",
115
- "2025-05-30 20:08:36,987 - INFO - print - n_context_docs: 3\n"
116
  ]
117
  }
118
  ],
@@ -129,13 +135,25 @@
129
  "name": "stderr",
130
  "output_type": "stream",
131
  "text": [
132
- "2025-05-30 20:08:37,093 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
133
- "2025-05-30 20:08:37,118 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n"
134
  ]
135
  }
136
  ],
137
  "source": [
138
- "datastore:DatastoreManager = startup(callback_on_loading_complete=lambda _: logging.warning(\"Loading complete.\")) "
139
  ]
140
  },
141
  {
@@ -147,39 +165,39 @@
147
  "name": "stderr",
148
  "output_type": "stream",
149
  "text": [
150
- "2025-05-30 20:08:38,120 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
151
- "2025-05-30 20:08:39,173 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
152
- "2025-05-30 20:08:39,862 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
153
- "2025-05-30 20:08:40,765 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
154
- "2025-05-30 20:08:41,275 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
155
- "2025-05-30 20:08:41,539 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
156
- "2025-05-30 20:08:42,447 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
157
- "2025-05-30 20:08:43,415 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
158
- "2025-05-30 20:08:44,236 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
159
- "2025-05-30 20:08:45,746 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
160
- "2025-05-30 20:08:45,770 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
161
- "2025-05-30 20:08:46,832 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
162
- "2025-05-30 20:08:47,754 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
163
- "2025-05-30 20:08:48,859 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
164
- "2025-05-30 20:08:49,732 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
165
- "2025-05-30 20:08:50,740 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
166
- "2025-05-30 20:08:51,604 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
167
- "2025-05-30 20:08:52,113 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
168
- "2025-05-30 20:08:53,060 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
169
- "2025-05-30 20:08:53,895 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
170
- "2025-05-30 20:08:54,734 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
171
- "2025-05-30 20:08:55,707 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
172
- "2025-05-30 20:08:56,114 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
173
- "2025-05-30 20:08:56,447 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
174
- "2025-05-30 20:08:56,765 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
175
- "2025-05-30 20:08:56,878 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
176
- "2025-05-30 20:08:57,200 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
177
- "2025-05-30 20:08:57,438 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
178
- "2025-05-30 20:08:57,750 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
179
- "2025-05-30 20:08:58,116 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
180
- "2025-05-30 20:08:58,713 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
181
- "2025-05-30 20:08:59,059 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
182
- "2025-05-30 20:08:59,110 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n"
183
  ]
184
  },
185
  {
@@ -203,7 +221,7 @@
203
  "metadata": {},
204
  "outputs": [],
205
  "source": [
206
- "chain = retrieve_videos(datastore)"
207
  ]
208
  },
209
  {
@@ -215,8 +233,8 @@
215
  "name": "stderr",
216
  "output_type": "stream",
217
  "text": [
218
- "2025-05-30 20:08:59,268 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
219
- "2025-05-30 20:09:11,924 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/chat \"HTTP/1.1 200 OK\"\n"
220
  ]
221
  }
222
  ],
@@ -236,29 +254,20 @@
236
  "==================================\u001b[1m Ai Message \u001b[0m==================================\n",
237
  "\n",
238
  "<think>\n",
239
- "Okay, I need to figure out what a layer is based on the provided transcript. Let me go through the context step by step.\n",
240
- "\n",
241
- "First, looking at the first document with video ID 19172. The title says \"Understand layers\" and the description mentions that layers are the building blocks of any image in Photoshop CC. So, layers must be fundamental components.\n",
242
  "\n",
243
- "In the page content, it compares layers to separate flat prints of glass stacked on top of each other. Each layer has different content. That makes me think of layers as separate elements or parts of an image that can be edited individually.\n",
244
  "\n",
245
- "There's also a mention of the Layers panel where you select and work with layers. It shows 4 layers, each with distinct content. The Eye icon can toggle visibility, so layers can be shown or hidden. This suggests that layers are like different layers of content that can be managed separately.\n",
246
  "\n",
247
- "Looking at another document with video ID 4103, it says layers allow isolation of individual pieces of content within a composition. For example, text or brush strokes can be separated into their own layers for detailed editing without affecting other parts.\n",
248
  "\n",
249
- "Another mention from the same video talks about using layers for adding highlights or shadows by isolating small elements, which means each part can be worked on independently.\n",
250
- "\n",
251
- "Putting this together, a layer is like a separate sheet in an image that holds different elements. Each layer allows you to edit, move, or manipulate specific parts without affecting others. The Layers panel helps manage and control these layers for better organization and editing flexibility.\n",
252
  "</think>\n",
253
  "\n",
254
- "🎨 **What is a Layer?** \n",
255
- "Layers are like separate sheets in an image, each holding distinct content. Think of them as individual elements stacked on top of each other, allowing you to edit or manipulate specific parts without affecting others. \n",
256
- "\n",
257
- "For example: \n",
258
- "- Each layer can contain text, images, or design elements. \n",
259
- "- You can toggle their visibility using the Eye icon. \n",
260
  "\n",
261
- "📌 **Timestamp**: 0.47 - 3.41 minutes (video ID 19172)\n",
262
  "**REFERENCES**\n",
263
  "[\n",
264
  " {\n",
@@ -295,9 +304,9 @@
295
  {
296
  "data": {
297
  "text/plain": [
298
- "[Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'data/test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[0.47, 3.41], [3.81, 9.13], [9.309999, 15.01], [15.299999, 20.57], [20.88, 23.3], [23.83, 27.93], [29.38, 32.79], [32.96, 33.92], [34.43, 40.21], [41.91, 45.37], [45.88, 49.01], [49.54, 55.130001], [55.72, 58.49], [58.72, 62.14]], 'start': 0.47, 'stop': 62.14, '_id': 63, '_collection_name': 'dc0cf104-0069-4983-8a12-8d3de4132745'}, page_content=\"Layers are the building blocks of any image in Photoshop CC. So, it's important to understand, what layers are and why to use them - which we'll cover in this video. If you're following along, open this layered image from the downloadable practice files for this tutorial. You might think of layers like separate flat pints of glass, stacked one on top of the other. Each layer contains separate pieces of content. To get a sense of how layers are constructed, let's take a look at this Layers panel. I've closed my other panels, so that we can focus on the Layers panel. But you can skip that. By the way: If your Layers panel isn't showing, go up to the Window menu and choose Layers from there. The Layers panel is where you go to select and work with layers. In this image there are 4 layers, each with separate content. If you click the Eye icon to the left of a layer, you can toggle the visibility of that layer off and on. So, I'm going to turn off the visibility of the tailor layer. And keep your eye on the image, so you can see what's on that layer.\"),\n",
299
- " Document(metadata={'video_id': 4103, 'title': 'Use layers for ultimate flexibility and control', 'desc': 'Learn how to use layers to create designs, fix photos, or build collages.', 'length': '00:05:06.55', 'group': 'data/dev.json', 'source': 'https://videos-tv.adobe.com/2014-09-04/96f51d8958ae31b37cb5a15cbdc21744.mp4', 'speech_start_stop_times': [[0.82, 5.88], [6.51, 18.389999], [19.219999, 30.13]], 'start': 0.82, 'stop': 30.13, '_id': 0, '_collection_name': 'dc0cf104-0069-4983-8a12-8d3de4132745'}, page_content=\"As a new Photoshop user, you're going to find that the layers panel is an incredibly powerful tool. The layers panel gives you the ability to isolate individual pieces of content away from the rest of the composition giving you the ability to work on individual elements within the overall document. Now, this can be used for something as literal as some type in this case, or something as subtle as a small brush stroke to add a highlight or shadow to an image.\"),\n",
300
- " Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'data/test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[280.4, 284.58]], 'start': 280.4, 'stop': 284.58, '_id': 66, '_collection_name': 'dc0cf104-0069-4983-8a12-8d3de4132745'}, page_content=\"There's lots more to learn about layers, so stay tuned for the rest of this tutorial.\")]"
301
  ]
302
  },
303
  "execution_count": 13,
 
86
  "name": "stderr",
87
  "output_type": "stream",
88
  "text": [
89
+ "2025-05-30 23:53:26,776 - INFO - <module> - Loaded .env file\n"
90
  ]
91
  }
92
  ],
 
103
  "name": "stderr",
104
  "output_type": "stream",
105
  "text": [
106
+ "2025-05-30 23:53:28,613 - INFO - print - Configuration parameters:\n"
107
+ ]
108
+ },
109
+ {
110
+ "name": "stderr",
111
+ "output_type": "stream",
112
+ "text": [
113
+ "2025-05-30 23:53:28,614 - INFO - print - eva_workflow_name: EVA_workflow\n",
114
+ "2025-05-30 23:53:28,615 - INFO - print - eva_log_level: INFO\n",
115
+ "2025-05-30 23:53:28,615 - INFO - print - transcript_glob: ./data/dev.json:./data/test.json\n",
116
+ "2025-05-30 23:53:28,616 - INFO - print - embedding_model: mxbai-embed-large\n",
117
+ "2025-05-30 23:53:28,617 - INFO - print - embedding_api: ModelAPI.OLLAMA\n",
118
+ "2025-05-30 23:53:28,617 - INFO - print - llm_api: ModelAPI.OLLAMA\n",
119
+ "2025-05-30 23:53:28,618 - INFO - print - max_research_loops: 2\n",
120
+ "2025-05-30 23:53:28,619 - INFO - print - llm_tool_model: qwen3:4b\n",
121
+ "2025-05-30 23:53:28,620 - INFO - print - n_context_docs: 3\n"
122
  ]
123
  }
124
  ],
 
135
  "name": "stderr",
136
  "output_type": "stream",
137
  "text": [
138
+ "2025-05-30 23:53:29,748 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
139
+ "2025-05-30 23:53:29,781 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n"
140
  ]
141
+ },
142
+ {
143
+ "data": {
144
+ "text/plain": [
145
+ "<Task pending name='Task-1' coro=<DatastoreManager.from_json_globs() running at /home/mbudisic/Documents/PsTuts-RAG/pstuts_rag/pstuts_rag/datastore.py:105>>"
146
+ ]
147
+ },
148
+ "execution_count": 8,
149
+ "metadata": {},
150
+ "output_type": "execute_result"
151
  }
152
  ],
153
  "source": [
154
+ "datastore = DatastoreManager()\n",
155
+ "datastore.add_completion_callback(lambda _: logging.warning(\"Loading complete.\"))\n",
156
+ "asyncio.create_task(datastore.from_json_globs(Configuration().transcript_glob))"
157
  ]
158
  },
159
  {
 
165
  "name": "stderr",
166
  "output_type": "stream",
167
  "text": [
168
+ "2025-05-30 23:53:30,993 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
169
+ "2025-05-30 23:53:31,531 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
170
+ "2025-05-30 23:53:32,425 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
171
+ "2025-05-30 23:53:33,118 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
172
+ "2025-05-30 23:53:34,012 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
173
+ "2025-05-30 23:53:34,277 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
174
+ "2025-05-30 23:53:35,156 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
175
+ "2025-05-30 23:53:36,044 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
176
+ "2025-05-30 23:53:36,712 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
177
+ "2025-05-30 23:53:37,534 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
178
+ "2025-05-30 23:53:38,511 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
179
+ "2025-05-30 23:53:39,309 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
180
+ "2025-05-30 23:53:40,211 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
181
+ "2025-05-30 23:53:41,136 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
182
+ "2025-05-30 23:53:42,084 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
183
+ "2025-05-30 23:53:42,790 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
184
+ "2025-05-30 23:53:43,608 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
185
+ "2025-05-30 23:53:44,584 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
186
+ "2025-05-30 23:53:45,088 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
187
+ "2025-05-30 23:53:45,929 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
188
+ "2025-05-30 23:53:46,773 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
189
+ "2025-05-30 23:53:47,723 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
190
+ "2025-05-30 23:53:48,154 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
191
+ "2025-05-30 23:53:48,468 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
192
+ "2025-05-30 23:53:48,700 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
193
+ "2025-05-30 23:53:48,812 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
194
+ "2025-05-30 23:53:49,119 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
195
+ "2025-05-30 23:53:49,471 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
196
+ "2025-05-30 23:53:49,803 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
197
+ "2025-05-30 23:53:50,123 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
198
+ "2025-05-30 23:53:50,446 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
199
+ "2025-05-30 23:53:50,773 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
200
+ "2025-05-30 23:53:51,117 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n"
201
  ]
202
  },
203
  {
 
221
  "metadata": {},
222
  "outputs": [],
223
  "source": [
224
+ "chain = create_transcript_rag_chain(datastore)"
225
  ]
226
  },
227
  {
 
233
  "name": "stderr",
234
  "output_type": "stream",
235
  "text": [
236
+ "2025-05-30 23:53:51,268 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/embed \"HTTP/1.1 200 OK\"\n",
237
+ "2025-05-30 23:54:00,302 - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/chat \"HTTP/1.1 200 OK\"\n"
238
  ]
239
  }
240
  ],
 
254
  "==================================\u001b[1m Ai Message \u001b[0m==================================\n",
255
  "\n",
256
  "<think>\n",
257
+ "Okay, the user is asking, \"What is a layer?\" and I need to answer based only on the provided transcript excerpts.\n",
258
  "\n",
259
+ "First, I'll look through the context to find relevant information. The first document's page_content mentions that layers are the building blocks of any image in Photoshop CC. It also compares layers to separate flat pints of glass stacked one on top of another. Each layer has separate content. There's also mention of the Layers panel where you can select and work with layers, and that clicking the Eye icon toggles the visibility of a layer.\n",
260
  "\n",
261
+ "Another document talks about the layers panel being a powerful tool to isolate individual elements, allowing work on individual parts of the document. But that's more about using layers rather than defining what a layer is.\n",
262
  "\n",
263
+ "So, the key points are: layers are building blocks, they're like stacked glass with separate content, and the Layers panel is used to manage them. The timestamp for the first document's explanation is from 3.81 to 9.13, which is 0:03:41 to 0:09:13 in minute:seconds format.\n",
264
  "\n",
265
+ "I need to present this information clearly, using the timestamp. Make sure not to add any extra info not in the transcript. The answer should be concise and directly answer the question without assumptions.\n",
266
  "</think>\n",
267
  "\n",
268
+ "A **layer** in Photoshop is the building block of any image, allowing you to isolate and manage separate elements within a document. The transcript explains layers as \"separate flat pints of glass, stacked one on top of the other,\" where each layer contains its own content. You can toggle a layer’s visibility using the Eye icon in the **Layers panel**, which lets you control what’s visible in the image. \n",
269
  "\n",
270
+ "**Timestamp**: 0:03:41–0:09:13 🎨✨\n",
271
  "**REFERENCES**\n",
272
  "[\n",
273
  " {\n",
 
304
  {
305
  "data": {
306
  "text/plain": [
307
+ "[Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'data/test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[0.47, 3.41], [3.81, 9.13], [9.309999, 15.01], [15.299999, 20.57], [20.88, 23.3], [23.83, 27.93], [29.38, 32.79], [32.96, 33.92], [34.43, 40.21], [41.91, 45.37], [45.88, 49.01], [49.54, 55.130001], [55.72, 58.49], [58.72, 62.14]], 'start': 0.47, 'stop': 62.14, '_id': 63, '_collection_name': '30065fde-3368-41c8-9092-79b8646d229f'}, page_content=\"Layers are the building blocks of any image in Photoshop CC. So, it's important to understand, what layers are and why to use them - which we'll cover in this video. If you're following along, open this layered image from the downloadable practice files for this tutorial. You might think of layers like separate flat pints of glass, stacked one on top of the other. Each layer contains separate pieces of content. To get a sense of how layers are constructed, let's take a look at this Layers panel. I've closed my other panels, so that we can focus on the Layers panel. But you can skip that. By the way: If your Layers panel isn't showing, go up to the Window menu and choose Layers from there. The Layers panel is where you go to select and work with layers. In this image there are 4 layers, each with separate content. If you click the Eye icon to the left of a layer, you can toggle the visibility of that layer off and on. So, I'm going to turn off the visibility of the tailor layer. And keep your eye on the image, so you can see what's on that layer.\"),\n",
308
+ " Document(metadata={'video_id': 4103, 'title': 'Use layers for ultimate flexibility and control', 'desc': 'Learn how to use layers to create designs, fix photos, or build collages.', 'length': '00:05:06.55', 'group': 'data/dev.json', 'source': 'https://videos-tv.adobe.com/2014-09-04/96f51d8958ae31b37cb5a15cbdc21744.mp4', 'speech_start_stop_times': [[0.82, 5.88], [6.51, 18.389999], [19.219999, 30.13]], 'start': 0.82, 'stop': 30.13, '_id': 0, '_collection_name': '30065fde-3368-41c8-9092-79b8646d229f'}, page_content=\"As a new Photoshop user, you're going to find that the layers panel is an incredibly powerful tool. The layers panel gives you the ability to isolate individual pieces of content away from the rest of the composition giving you the ability to work on individual elements within the overall document. Now, this can be used for something as literal as some type in this case, or something as subtle as a small brush stroke to add a highlight or shadow to an image.\"),\n",
309
+ " Document(metadata={'video_id': 19172, 'title': 'Understand layers', 'desc': 'Learn what layers are and why they are so useful.', 'length': '00:04:44.75', 'group': 'data/test.json', 'source': 'https://images-tv.adobe.com/avp/vr/b758b4c4-2a74-41f4-8e67-e2f2eab83c6a/f810fc5b-2b04-4e23-8fa4-5c532e7de6f8/e268fe4d-e5c7-415c-9f5c-d34d024b14d8_20170727011753.1280x720at2400_h264.mp4', 'speech_start_stop_times': [[280.4, 284.58]], 'start': 280.4, 'stop': 284.58, '_id': 66, '_collection_name': '30065fde-3368-41c8-9092-79b8646d229f'}, page_content=\"There's lots more to learn about layers, so stay tuned for the rest of this tutorial.\")]"
310
  ]
311
  },
312
  "execution_count": 13,
pstuts_rag/pstuts_rag/configuration.py CHANGED
@@ -70,6 +70,8 @@ class Configuration:
70
  if config and "configurable" in config
71
  else {}
72
  )
73
  values: dict[str, Any] = {
74
  f.name: os.environ.get(f.name.upper(), configurable.get(f.name))
75
  for f in fields(cls)
 
70
  if config and "configurable" in config
71
  else {}
72
  )
73
+ # Map each dataclass field to environment variables or configurable values
74
+ # Priority: environment variables > configurable dict values > field defaults
75
  values: dict[str, Any] = {
76
  f.name: os.environ.get(f.name.upper(), configurable.get(f.name))
77
  for f in fields(cls)
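The added comment documents the resolution order used in `from_runnable_config`: an environment variable named after the field (upper-cased) wins, then the value from the `configurable` dict, then the dataclass default. A standalone sketch of that pattern, using an illustrative two-field config rather than the project's full `Configuration` class:

```python
import os
from dataclasses import dataclass, fields
from typing import Any, Optional


@dataclass
class MiniConfig:
    # Illustrative subset; the real Configuration defines many more fields.
    transcript_glob: str = "./data/dev.json:./data/test.json"
    eva_log_level: str = "INFO"

    @classmethod
    def from_configurable(cls, configurable: Optional[dict] = None) -> "MiniConfig":
        configurable = configurable or {}
        # Priority: environment variables > configurable dict values > field defaults
        values: dict[str, Any] = {
            f.name: os.environ.get(f.name.upper(), configurable.get(f.name))
            for f in fields(cls)
        }
        # Keep only populated entries so dataclass defaults fill the rest.
        return cls(**{k: v for k, v in values.items() if v is not None})


# EVA_LOG_LEVEL=DEBUG in the environment would override both the dict and the default.
cfg = MiniConfig.from_configurable({"eva_log_level": "WARNING"})
print(cfg)
```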
pstuts_rag/pstuts_rag/datastore.py CHANGED
@@ -24,220 +24,7 @@ from qdrant_client import QdrantClient
24
  from qdrant_client.http.models import Distance, VectorParams
25
  from qdrant_client.models import PointStruct
26
 
27
- from pstuts_rag.utils import EmbeddingsAPISelector
28
-
29
-
30
- def batch(iterable: List[Any], size: int = 16) -> Iterator[List[Any]]:
31
- """
32
- Batch an iterable into chunks of specified size.
33
-
34
- Yields successive chunks from the input iterable, each containing
35
- at most 'size' elements. Useful for processing large collections
36
- in manageable batches to avoid memory issues or API rate limits.
37
-
38
- Args:
39
- iterable (List[Any]): The input list to be batched
40
- size (int, optional): Maximum size of each batch. Defaults to 16.
41
-
42
- Yields:
43
- List[Any]: Successive batches of the input iterable
44
-
45
- Example:
46
- >>> list(batch([1, 2, 3, 4, 5], 2))
47
- [[1, 2], [3, 4], [5]]
48
- """
49
- for i in range(0, len(iterable), size):
50
- yield iterable[i : i + size]
51
-
52
-
53
- class VideoTranscriptBulkLoader(BaseLoader):
54
- """
55
- Loads video transcripts as bulk documents for document processing pipelines.
56
-
57
- Each video becomes a single document with all transcript sentences concatenated.
58
- Useful for semantic search across entire video content.
59
-
60
- Inherits from LangChain's BaseLoader for compatibility with document processing chains.
61
-
62
- Attributes:
63
- json_payload (List[Dict]): List of video dictionaries containing transcript data
64
- """
65
-
66
- def __init__(self, json_payload: List[Dict]):
67
- """
68
- Initialize the bulk loader with video transcript data.
69
-
70
- Args:
71
- json_payload (List[Dict]): List of video dictionaries, each containing:
72
- - transcripts: List of transcript segments
73
- - qa: Q&A data (optional)
74
- - url: Video URL
75
- - other metadata fields
76
- """
77
-
78
- self.json_payload = json_payload
79
-
80
- def lazy_load(self) -> Iterator[Document]:
81
- """
82
- Lazy loader that yields Document objects with concatenated transcripts.
83
-
84
- Creates one Document per video with all transcript sentences joined by newlines.
85
- Metadata includes all video fields except 'transcripts' and 'qa'.
86
- The 'url' field is renamed to 'source' for LangChain compatibility.
87
-
88
- Yields:
89
- Document: LangChain Document with page_content as concatenated transcript
90
- and metadata containing video information
91
- """
92
-
93
- for video in self.json_payload:
94
- metadata = dict(video)
95
- metadata.pop("transcripts", None)
96
- metadata.pop("qa", None)
97
- # Rename 'url' key to 'source' in metadata if it exists
98
- if "url" in metadata:
99
- metadata["source"] = metadata.pop("url")
100
- yield Document(
101
- page_content="\n".join(
102
- t["sent"] for t in video["transcripts"]
103
- ),
104
- metadata=metadata,
105
- )
106
-
107
-
108
- class VideoTranscriptChunkLoader(BaseLoader):
109
- """
110
- Loads video transcripts as individual chunk documents for fine-grained processing.
111
-
112
- Each transcript segment becomes a separate document with timing information.
113
- Useful for precise timestamp-based retrieval and time-sensitive queries.
114
-
115
- Inherits from LangChain's BaseLoader for compatibility with document processing chains.
116
-
117
- Attributes:
118
- json_payload (List[Dict]): List of video dictionaries containing transcript data
119
- """
120
-
121
- def __init__(self, json_payload: List[Dict]):
122
- """
123
- Initialize the chunk loader with video transcript data.
124
-
125
- Args:
126
- json_payload (List[Dict]): List of video dictionaries, each containing:
127
- - transcripts: List of transcript segments with timing
128
- - qa: Q&A data (optional)
129
- - url: Video URL
130
- - other metadata fields
131
- """
132
-
133
- self.json_payload = json_payload
134
-
135
- def lazy_load(self) -> Iterator[Document]:
136
- """
137
- Lazy loader that yields individual Document objects for each transcript segment.
138
-
139
- Creates one Document per transcript segment with timing metadata.
140
- Each document contains a single transcript sentence with precise start/end times.
141
- The 'url' field is renamed to 'source' for LangChain compatibility.
142
-
143
- Yields:
144
- Document: LangChain Document with page_content as single transcript sentence
145
- and metadata containing video info plus time_start and time_end
146
- """
147
-
148
- for video in self.json_payload:
149
- metadata = dict(video)
150
- transcripts = metadata.pop("transcripts", None)
151
- metadata.pop("qa", None)
152
- # Rename 'url' key to 'source' in metadata if it exists
153
- if "url" in metadata:
154
- metadata["source"] = metadata.pop("url")
155
- for transcript in transcripts:
156
- yield Document(
157
- page_content=transcript["sent"],
158
- metadata=metadata
159
- | {
160
- "time_start": transcript["begin"],
161
- "time_end": transcript["end"],
162
- },
163
- )
164
-
165
-
166
- async def chunk_transcripts(
167
- json_transcripts: List[Dict[str, Any]],
168
- semantic_chunker_embedding_model: Embeddings = OpenAIEmbeddings(
169
- model="text-embedding-3-small"
170
- ),
171
- ) -> List[Document]:
172
- """
173
- Load and process video transcripts into semantically chunked documents.
174
-
175
- This function takes a list of transcript dictionaries, loads them as both full
176
- transcripts and individual chunks, then applies semantic chunking. It also
177
- enriches each semantic chunk with timestamp metadata from the original verbatim chunks.
178
-
179
- Args:
180
- json_transcripts: List of dictionaries containing video transcript data
181
- embeddings: OpenAI embeddings model to use for semantic chunking
182
-
183
- Returns:
184
- List of semantically chunked Document objects with enhanced metadata
185
- """
186
-
187
- docs_full_transcript: List[Document] = VideoTranscriptBulkLoader(
188
- json_payload=json_transcripts
189
- ).load()
190
- docs_chunks_verbatim: List[Document] = VideoTranscriptChunkLoader(
191
- json_payload=json_transcripts
192
- ).load()
193
-
194
- # semantically split the combined transcript
195
- text_splitter = SemanticChunker(semantic_chunker_embedding_model)
196
- docs_group = await asyncio.gather(
197
- *[
198
- text_splitter.atransform_documents(d)
199
- for d in batch(docs_full_transcript, size=2)
200
- ]
201
- )
202
- # Flatten the nested list of documents
203
- docs_chunks_semantic: List[Document] = []
204
- for group in docs_group:
205
- docs_chunks_semantic.extend(group)
206
-
207
- # Create a lookup dictionary for faster access
208
- video_id_to_chunks: Dict[int, List[Document]] = {}
209
- for chunk in docs_chunks_verbatim:
210
- video_id: int = chunk.metadata["video_id"]
211
- if video_id not in video_id_to_chunks:
212
- video_id_to_chunks[video_id] = []
213
- video_id_to_chunks[video_id].append(chunk)
214
-
215
- for chunk in docs_chunks_semantic:
216
- video_id = chunk.metadata["video_id"]
217
- # Only check chunks from the same video
218
- potential_subchunks = video_id_to_chunks.get(video_id, [])
219
- subchunks = [
220
- c
221
- for c in potential_subchunks
222
- if c.page_content in chunk.page_content
223
- ]
224
-
225
- times = [
226
- (t.metadata["time_start"], t.metadata["time_end"])
227
- for t in subchunks
228
- ]
229
- chunk.metadata["speech_start_stop_times"] = times
230
-
231
- if times: # Avoid IndexError if times is empty
232
- chunk.metadata["start"], chunk.metadata["stop"] = (
233
- times[0][0],
234
- times[-1][-1],
235
- )
236
- else:
237
- chunk.metadata["start"], chunk.metadata["stop"] = None, None
238
-
239
- docs_chunks_semantic[0].metadata.keys()
240
- return docs_chunks_semantic
241
 
242
 
243
  class DatastoreManager:
@@ -315,17 +102,45 @@ class DatastoreManager:
315
 
316
  self.docs = []
317
 
318
- async def from_json_globs(self, globs: List[str]) -> int:
319
320
  logging.debug("Starting to load files.")
321
- data = await load_json_files(globs)
322
- logging.debug("Received %d JSON files.", len(data))
323
- count = await self.populate_database(data)
324
  logging.debug("Uploaded %d records.", count)
325
 
326
- return count
327
 
328
- async def populate_database(self, raw_docs: List[Dict[str, Any]]) -> int:
329
  """
330
  Populate the vector database with processed video transcript documents.
331
 
@@ -345,12 +160,6 @@ class DatastoreManager:
345
  Exception: If embedding generation or database upload fails
346
  """
347
 
348
- # perform chunking
349
- self.docs: List[Document] = await chunk_transcripts(
350
- json_transcripts=raw_docs,
351
- semantic_chunker_embedding_model=self.embeddings,
352
- )
353
-
354
  # perform embedding
355
 
356
  vector_batches = await asyncio.gather(
@@ -358,7 +167,7 @@ class DatastoreManager:
358
  self.embeddings.aembed_documents(
359
  [c.page_content for c in chunk_batch]
360
  )
361
- for chunk_batch in batch(self.docs, 8)
362
  ]
363
  )
364
  vectors = []
@@ -375,7 +184,7 @@ class DatastoreManager:
375
  "metadata": doc.metadata,
376
  },
377
  )
378
- for id, vector, doc in zip(ids, vectors, self.docs)
379
  ]
380
 
381
  # upload qdrant payload
@@ -384,14 +193,6 @@ class DatastoreManager:
384
  points=points,
385
  )
386
 
387
- self.loading_complete.set()
388
- # Execute callbacks (both sync and async)
389
- for callback in self._completion_callbacks:
390
- if asyncio.iscoroutinefunction(callback):
391
- await callback()
392
- else:
393
- callback()
394
-
395
  return len(points)
396
 
397
  def count_docs(self) -> int:
@@ -479,7 +280,7 @@ class DatastoreManager:
479
  return False
480
 
481
 
482
- async def load_single_json(filepath: str):
483
  """
484
  Asynchronously load and parse a single JSON file containing video data.
485
 
@@ -502,53 +303,164 @@ async def load_single_json(filepath: str):
502
  Note:
503
  Uses async file I/O for better performance when loading multiple files
504
  """
505
- my_path = Path(filepath)
506
 
507
- async with aiofiles.open(my_path, mode="r", encoding="utf-8") as f:
508
  content = await f.read()
509
  payload = json.loads(content)
510
  for entry in payload:
511
- entry.update({"group": str(my_path)})
512
  return payload
513
 
514
 
515
- async def load_json_files(glob_list: List[str]):
516
  """
517
- Asynchronously load and parse multiple JSON files matching given patterns.
518
 
519
- Uses glob patterns to find files and loads them concurrently for optimal performance.
520
- All results are flattened into a single list for unified processing. This function
521
- is designed to handle large datasets efficiently by leveraging async I/O.
522
 
523
  Args:
524
- glob_list (List[str]): List of glob patterns to match JSON files.
525
- Supports standard glob syntax including recursive
526
- patterns with ** for subdirectory traversal.
527
 
528
  Returns:
529
- List[Dict]: Flattened list of all video dictionaries from matched files,
530
- with each video containing its source group information
531
-
532
- Raises:
533
- FileNotFoundError: If any matched file doesn't exist during loading
534
- json.JSONDecodeError: If any file content is not valid JSON format
535
- PermissionError: If any file cannot be read due to permissions
536
-
537
- Example:
538
- >>> patterns = ["data/*.json", "archive/**/*.json"]
539
- >>> videos = await load_json_files(patterns)
540
- >>> len(videos) # Total videos from all matched files
541
  """
542
- logging.debug("Loading from %d globs:", len(glob_list))
543
 
544
- files = []
545
- for globstring in glob_list:
546
- logging.debug("Loading glob: %s", globstring)
547
- new_files = glob.glob(globstring, recursive=True)
548
- logging.debug("New files: %d", len(new_files))
549
- files.extend(new_files)
550
- logging.debug("Total files: %d", len(files))
551
 
552
- tasks = [load_single_json(f) for f in files]
553
- results = await asyncio.gather(*tasks)
554
- return [item for sublist in results for item in sublist] # flatten
24
  from qdrant_client.http.models import Distance, VectorParams
25
  from qdrant_client.models import PointStruct
26
 
27
+ from pstuts_rag.utils import EmbeddingsAPISelector, flatten, batch
28
 
29
 
30
  class DatastoreManager:
 
102
 
103
  self.docs = []
104
 
105
+ async def from_json_globs(self, globs: List[str] | str) -> int:
106
+ """
107
+ Populate the vector database with processed video transcript documents,
108
+ retrieved from JSON file paths.
109
+
110
+ This method performs the complete pipeline:
111
+ - loading JSON transcripts
112
+ - semantic chunking with timestamp preservation
113
+ - embedding and uploading the chunks, then firing completion callbacks
114
 
115
+
116
+ """
117
  logging.debug("Starting to load files.")
118
+ files = globs_to_paths(globs)
119
+
120
+ tasks = [load_json_file(f) for f in files]
121
+ results = await asyncio.gather(*tasks)
122
+
123
+ json_transcripts = list(flatten(results))
124
+ logging.debug("Received %d JSON files.", len(json_transcripts))
125
+
126
+ # perform chunking
127
+ self.docs: List[Document] = await chunk_transcripts(
128
+ json_transcripts=json_transcripts,
129
+ semantic_chunker_embedding_model=self.embeddings,
130
+ )
131
+
132
+ count = await self.embed_chunks(self.docs)
133
  logging.debug("Uploaded %d records.", count)
134
 
135
+ self.loading_complete.set()
136
+ # Execute callbacks (both sync and async)
137
+ for callback in self._completion_callbacks:
138
+ if asyncio.iscoroutinefunction(callback):
139
+ await callback()
140
+ else:
141
+ callback()
142
 
143
+ async def embed_chunks(self, chunked_documents: List[Document]) -> int:
144
  """
145
  Populate the vector database with processed video transcript documents.
146
 
 
160
  Exception: If embedding generation or database upload fails
161
  """
162
 
163
  # perform embedding
164
 
165
  vector_batches = await asyncio.gather(
 
167
  self.embeddings.aembed_documents(
168
  [c.page_content for c in chunk_batch]
169
  )
170
+ for chunk_batch in batch(chunked_documents, 8)
171
  ]
172
  )
173
  vectors = []
 
184
  "metadata": doc.metadata,
185
  },
186
  )
187
+ for id, vector, doc in zip(ids, vectors, chunked_documents)
188
  ]
189
 
190
  # upload qdrant payload
 
193
  points=points,
194
  )
195
 
196
  return len(points)
197
 
198
  def count_docs(self) -> int:
 
280
  return False
281
 
282
 
283
+ async def load_json_file(filepath: Path):
284
  """
285
  Asynchronously load and parse a single JSON file containing video data.
286
 
 
303
  Note:
304
  Uses async file I/O for better performance when loading multiple files
305
  """
 
306
 
307
+ async with aiofiles.open(filepath, mode="r", encoding="utf-8") as f:
308
  content = await f.read()
309
  payload = json.loads(content)
310
  for entry in payload:
311
+ entry.update({"group": str(filepath)})
312
  return payload
313
 
314
 
315
+ def globs_to_paths(glob_list: List[str] | str) -> List[Path]:
316
+
317
+ if isinstance(glob_list, str):
318
+ glob_list = glob_list.split(":")
319
+
320
+ logging.debug("Loading from %d globs:", len(glob_list))
321
+
322
+ files: List[Path] = []
323
+ for globstring in glob_list:
324
+ logging.debug("Loading glob: %s", globstring)
325
+ new_files = [Path(f) for f in glob.glob(globstring, recursive=True)]
326
+ files.extend(filter(lambda f: f.exists(), new_files))
327
+
328
+ logging.debug("Total files: %d", len(files))
329
+
330
+ return files
331
+
332
+
333
+ def load_transcripts_whole(json_payload: List[Dict]) -> Iterator[Document]:
334
+ """
335
+ Lazy loader that yields Document objects with concatenated transcripts.
336
+
337
+ Creates one Document per video with all transcript sentences joined by newlines.
338
+ Metadata includes all video fields except 'transcripts' and 'qa'.
339
+ The 'url' field is renamed to 'source' for LangChain compatibility.
340
+
341
+ Yields:
342
+ Document: LangChain Document with page_content as concatenated transcript
343
+ and metadata containing video information
344
+ """
345
+
346
+ for video in json_payload:
347
+ metadata = dict(video)
348
+ metadata.pop("transcripts", None)
349
+ metadata.pop("qa", None)
350
+ # Rename 'url' key to 'source' in metadata if it exists
351
+ if "url" in metadata:
352
+ metadata["source"] = metadata.pop("url")
353
+ yield Document(
354
+ page_content="\n".join(t["sent"] for t in video["transcripts"]),
355
+ metadata=metadata,
356
+ )
357
+
358
+
359
+ def load_transcripts_segments(
360
+ json_payload: List[Dict],
361
+ ) -> Iterator[Document]:
362
  """
363
+ Lazy loader that yields individual Document objects for each transcript segment.
364
 
365
+ Creates one Document per transcript segment with timing metadata.
366
+ Each document contains a single transcript sentence with precise start/end times.
367
+ The 'url' field is renamed to 'source' for LangChain compatibility.
368
+
369
+ Yields:
370
+ Document: LangChain Document with page_content as single transcript sentence
371
+ and metadata containing video info plus time_start and time_end
372
+ """
373
+
374
+ for video in json_payload:
375
+ metadata = dict(video)
376
+ transcripts = metadata.pop("transcripts", None)
377
+ metadata.pop("qa", None)
378
+ # Rename 'url' key to 'source' in metadata if it exists
379
+ if "url" in metadata:
380
+ metadata["source"] = metadata.pop("url")
381
+ for transcript in transcripts:
382
+ yield Document(
383
+ page_content=transcript["sent"],
384
+ metadata=metadata
385
+ | {
386
+ "time_start": transcript["begin"],
387
+ "time_end": transcript["end"],
388
+ },
389
+ )
390
+
391
+
392
+ async def chunk_transcripts(
393
+ json_transcripts: List[Dict[str, Any]],
394
+ semantic_chunker_embedding_model: Embeddings = OpenAIEmbeddings(
395
+ model="text-embedding-3-small"
396
+ ),
397
+ ) -> List[Document]:
398
+ """
399
+ Load and process video transcripts into semantically chunked documents.
400
+
401
+ This function takes a list of transcript dictionaries, loads them as both full
402
+ transcripts and individual chunks, then applies semantic chunking. It also
403
+ enriches each semantic chunk with timestamp metadata from the original verbatim chunks.
404
 
405
  Args:
406
+ json_transcripts: List of dictionaries containing video transcript data
407
+ embeddings: OpenAI embeddings model to use for semantic chunking
 
408
 
409
  Returns:
410
+ List of semantically chunked Document objects with enhanced metadata
411
  """
 
412
 
413
+ docs_full_transcript: List[Document] = list(
414
+ load_transcripts_whole(json_transcripts)
415
+ )
416
+ docs_chunks_verbatim: List[Document] = list(
417
+ load_transcripts_segments(json_transcripts)
418
+ )
 
419
 
420
+ # semantically split the combined transcript
421
+ text_splitter = SemanticChunker(semantic_chunker_embedding_model)
422
+ docs_group = await asyncio.gather(
423
+ *[
424
+ text_splitter.atransform_documents(d)
425
+ for d in batch(docs_full_transcript, size=2)
426
+ ]
427
+ )
428
+ # Flatten the nested list of documents
429
+ docs_chunks_semantic: List[Document] = []
430
+ for group in docs_group:
431
+ docs_chunks_semantic.extend(group)
432
+
433
+ # Create a lookup dictionary for faster access
434
+ video_id_to_chunks: Dict[int, List[Document]] = {}
435
+ for chunk in docs_chunks_verbatim:
436
+ video_id: int = chunk.metadata["video_id"]
437
+ if video_id not in video_id_to_chunks:
438
+ video_id_to_chunks[video_id] = []
439
+ video_id_to_chunks[video_id].append(chunk)
440
+
441
+ for chunk in docs_chunks_semantic:
442
+ video_id = chunk.metadata["video_id"]
443
+ # Only check chunks from the same video
444
+ potential_subchunks = video_id_to_chunks.get(video_id, [])
445
+ subchunks = [
446
+ c
447
+ for c in potential_subchunks
448
+ if c.page_content in chunk.page_content
449
+ ]
450
+
451
+ times = [
452
+ (t.metadata["time_start"], t.metadata["time_end"])
453
+ for t in subchunks
454
+ ]
455
+ chunk.metadata["speech_start_stop_times"] = times
456
+
457
+ if times: # Avoid IndexError if times is empty
458
+ chunk.metadata["start"], chunk.metadata["stop"] = (
459
+ times[0][0],
460
+ times[-1][-1],
461
+ )
462
+ else:
463
+ chunk.metadata["start"], chunk.metadata["stop"] = None, None
464
+
465
+ docs_chunks_semantic[0].metadata.keys()
466
+ return docs_chunks_semantic
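After this refactor the public surface splits in two: `from_json_globs` drives the whole pipeline (resolve `:`-separated globs or a list, load JSON, semantically chunk, embed, fire completion callbacks), while `embed_chunks` only embeds and uploads documents that are already chunked. A hedged sketch of both entry points, assuming `globs_to_paths`, `load_json_file`, and `chunk_transcripts` stay importable from `pstuts_rag.datastore` as the module-level functions shown above:

```python
import asyncio

from pstuts_rag.datastore import (
    DatastoreManager,
    chunk_transcripts,
    globs_to_paths,
    load_json_file,
)


async def ingest_everything() -> None:
    # One-call path: globs -> JSON -> semantic chunks -> embeddings -> Qdrant,
    # with completion callbacks fired at the end.
    datastore = DatastoreManager()
    await datastore.from_json_globs("./data/dev.json:./data/test.json")


async def ingest_prechunked() -> int:
    # Manual path: load and chunk yourself, then hand the chunks to
    # embed_chunks, which only embeds and uploads them.
    datastore = DatastoreManager()
    files = globs_to_paths("./data/dev.json")
    payloads = await asyncio.gather(*[load_json_file(f) for f in files])
    raw_transcripts = [entry for payload in payloads for entry in payload]
    chunks = await chunk_transcripts(
        json_transcripts=raw_transcripts,
        semantic_chunker_embedding_model=datastore.embeddings,
    )
    return await datastore.embed_chunks(chunks)
```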
pstuts_rag/pstuts_rag/graph.py CHANGED
@@ -30,7 +30,7 @@ from app import (
30
  enter_chain,
31
  )
32
 
33
- from pstuts_rag.rag_for_transcripts import retrieve_videos
34
 
35
 
36
  def search_agent(state: PsTutsTeamState, chain: Runnable) -> Dict:
@@ -244,7 +244,7 @@ async def build_the_graph(current_state: ApplicationState):
244
  )
245
 
246
  rag_node, _ = create_rag_node(
247
- rag_chain=retrieve_videos(),
248
  name=VIDEOARCHIVE,
249
  )
250
 
 
30
  enter_chain,
31
  )
32
 
33
+ from pstuts_rag.rag_for_transcripts import create_transcript_rag_chain
34
 
35
 
36
  def search_agent(state: PsTutsTeamState, chain: Runnable) -> Dict:
 
244
  )
245
 
246
  rag_node, _ = create_rag_node(
247
+ rag_chain=create_transcript_rag_chain(),
248
  name=VIDEOARCHIVE,
249
  )
250
 
pstuts_rag/pstuts_rag/rag.py CHANGED
@@ -248,7 +248,7 @@ class RAGChainInstance:
248
  qdrant_client=self.qdrant_client, name=self.name
249
  )
250
  if self.datastore_manager.count_docs() == 0:
251
- self.pointsLoaded = await self.datastore_manager.populate_database(
252
  raw_docs=json_payload
253
  )
254
  logging.info(
 
248
  qdrant_client=self.qdrant_client, name=self.name
249
  )
250
  if self.datastore_manager.count_docs() == 0:
251
+ self.pointsLoaded = await self.datastore_manager.embed_chunks(
252
  raw_docs=json_payload
253
  )
254
  logging.info(
pstuts_rag/pstuts_rag/rag_for_transcripts.py CHANGED
@@ -18,7 +18,7 @@ from langchain_ollama import ChatOllama
18
 
19
  from .datastore import DatastoreManager
20
  from .prompts import RAG_PROMPT_TEMPLATES
21
-
22
  from pstuts_rag.configuration import Configuration, ModelAPI
23
 
24
 
@@ -37,6 +37,7 @@ def pack_references(msg_dict: Dict[str, Any]) -> AIMessage:
37
  answer: AIMessage = msg_dict["answer"]
38
  input = msg_dict["input"]
39
 
 
40
  reference_dicts = [
41
  {k: doc.metadata[k] for k in ("title", "source", "start", "stop")}
42
  for doc in input["context"]
@@ -44,11 +45,13 @@ def pack_references(msg_dict: Dict[str, Any]) -> AIMessage:
44
  references = str(json.dumps(reference_dicts, indent=2))
45
 
46
  text_w_references = answer.content
 
47
  if "I don't know" not in answer.content:
48
  text_w_references = "\n".join(
49
  [str(text_w_references), "**REFERENCES**", references]
50
  )
51
 
 
52
  output: AIMessage = answer.model_copy(
53
  update={
54
  "content": text_w_references,
@@ -63,88 +66,60 @@ def pack_references(msg_dict: Dict[str, Any]) -> AIMessage:
63
  return output
64
 
65
 
66
- def retrieve_videos(
67
  datastore: DatastoreManager,
68
  config: Union[RunnableConfig, Configuration] = Configuration(),
69
  ) -> Runnable:
70
 
 
71
  configurable = (
72
  config
73
  if isinstance(config, Configuration)
74
  else Configuration.from_runnable_config(config)
75
  )
76
 
77
- cls = {
78
- ModelAPI.HUGGINGFACE: ChatHuggingFace,
79
- ModelAPI.OPENAI: ChatOpenAI,
80
- ModelAPI.OLLAMA: ChatOllama,
81
- }.get(configurable.llm_api, ChatOpenAI)
82
 
83
  llm = cls(model=configurable.llm_tool_model)
84
 
 
85
  answer_chain = (
86
  ChatPromptTemplate.from_messages(list(RAG_PROMPT_TEMPLATES.items()))
87
  | llm
88
  )
89
 
 
 
90
  rag_chain = (
91
- itemgetter("question")
92
- | RunnableParallel(
93
  context=datastore.get_retriever(
94
  n_context_docs=configurable.n_context_docs
95
  ),
96
- question=RunnablePassthrough(),
97
  )
98
- | {
99
- "input": RunnablePassthrough(),
100
- "answer": answer_chain,
101
  }
102
- | pack_references
103
  )
104
 
105
  return rag_chain
106
-
107
-
108
- def startup(
109
- config=Configuration(),
110
- callback_on_loading_complete: Optional[Callable] = None,
111
- ):
112
- """
113
- Initialize the application with optional loading completion callback.
114
-
115
- Args:
116
- config: Configuration object with application settings
117
- on_loading_complete: Optional callback (sync or async) to call when
118
- datastore loading completes
119
-
120
- Returns:
121
- DatastoreManager: The initialized datastore manager
122
- """
123
-
124
- ### PROCESS THE CONFIGURATION
125
- log_level = getattr(logging, config.eva_log_level, logging.INFO)
126
- logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s")
127
-
128
- ### CREATE THE DATABASE
129
-
130
- datastore = DatastoreManager()
131
- if callback_on_loading_complete:
132
- datastore.add_completion_callback(callback_on_loading_complete)
133
-
134
- ### START DATABASE POPULATION
135
-
136
- globs = [str(g) for g in config.transcript_glob.split(":")]
137
-
138
- # # Add custom callback if provided, otherwise use default logging
139
- # if on_loading_complete:
140
- # datastore.add_completion_callback(on_loading_complete)
141
- # else:
142
- # # Default callback for logging
143
- # def default_logging_callback():
144
- # logging.info("🎉 Datastore loading completed!")
145
-
146
- # datastore.add_completion_callback(default_logging_callback)
147
-
148
- asyncio.create_task(datastore.from_json_globs(globs))
149
-
150
- return datastore
 
18
 
19
  from .datastore import DatastoreManager
20
  from .prompts import RAG_PROMPT_TEMPLATES
21
+ from pstuts_rag.utils import ChatAPISelector
22
  from pstuts_rag.configuration import Configuration, ModelAPI
23
 
24
 
 
37
  answer: AIMessage = msg_dict["answer"]
38
  input = msg_dict["input"]
39
 
40
+ # Extract relevant metadata from each document in the context
41
  reference_dicts = [
42
  {k: doc.metadata[k] for k in ("title", "source", "start", "stop")}
43
  for doc in input["context"]
 
45
  references = str(json.dumps(reference_dicts, indent=2))
46
 
47
  text_w_references = answer.content
48
+ # Only append references if the model provided a substantive answer
49
  if "I don't know" not in answer.content:
50
  text_w_references = "\n".join(
51
  [str(text_w_references), "**REFERENCES**", references]
52
  )
53
 
54
+ # Create new message with references and preserve original context metadata
55
  output: AIMessage = answer.model_copy(
56
  update={
57
  "content": text_w_references,
 
66
  return output
67
 
68
 
69
+ def create_transcript_rag_chain(
70
  datastore: DatastoreManager,
71
  config: Union[RunnableConfig, Configuration] = Configuration(),
72
  ) -> Runnable:
73
+ """Create a Retrieval-Augmented Generation (RAG) chain for video transcript search.
74
+
75
+ This function constructs a complete RAG pipeline that:
76
+ 1. Takes a user question as input
77
+ 2. Retrieves relevant video transcript chunks from the datastore
78
+ 3. Generates an answer using an LLM with the retrieved context
79
+ 4. Packages the response with reference information
80
+
81
+ Args:
82
+ datastore: The DatastoreManager containing video transcript embeddings
83
+ config: Configuration object or RunnableConfig with model and retrieval settings
84
+
85
+ Returns:
86
+ Runnable: A LangChain runnable that processes questions and returns
87
+ answers with embedded references to source video segments
88
+ """
89
 
90
+ # Handle both Configuration objects and RunnableConfig dictionaries
91
  configurable = (
92
  config
93
  if isinstance(config, Configuration)
94
  else Configuration.from_runnable_config(config)
95
  )
96
 
97
+ # Select the appropriate chat model class based on configuration
98
+ cls = ChatAPISelector.get(configurable.llm_api, ChatOpenAI)
99
 
100
  llm = cls(model=configurable.llm_tool_model)
101
 
102
+ # Create the answer generation chain using prompt templates
103
  answer_chain = (
104
  ChatPromptTemplate.from_messages(list(RAG_PROMPT_TEMPLATES.items()))
105
  | llm
106
  )
107
 
108
+ # Build the complete RAG chain with the following flow:
109
+ # question -> parallel(context_retrieval, question_passthrough) -> llm_answer -> pack_references
110
  rag_chain = (
111
+ itemgetter("question") # Extract question from input dict
112
+ | RunnableParallel( # Run context retrieval and question passing in parallel
113
  context=datastore.get_retriever(
114
  n_context_docs=configurable.n_context_docs
115
  ),
116
+ question=RunnablePassthrough(), # Pass question unchanged
117
  )
118
+ | { # Prepare input dict for final processing
119
+ "input": RunnablePassthrough(), # Contains both context and question
120
+ "answer": answer_chain, # Generate answer using retrieved context
121
  }
122
+ | pack_references # Add reference metadata to the final response
123
  )
124
 
125
  return rag_chain
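Given the renamed `create_transcript_rag_chain` and the input contract established by `itemgetter("question")`, here is a minimal end-to-end sketch (assuming a running embedding/LLM backend, such as the Ollama endpoint used in the notebook, and the import paths shown in this commit):

```python
import asyncio

from pstuts_rag.configuration import Configuration
from pstuts_rag.datastore import DatastoreManager
from pstuts_rag.rag_for_transcripts import create_transcript_rag_chain


async def ask(question: str) -> str:
    # Populate the datastore, then build the transcript RAG chain over it.
    datastore = DatastoreManager()
    await datastore.from_json_globs(Configuration().transcript_glob)
    chain = create_transcript_rag_chain(datastore)

    # The chain begins with itemgetter("question"), so the input must be a
    # dict carrying a "question" key; pack_references returns an AIMessage
    # whose content ends with a **REFERENCES** block unless the model
    # answered "I don't know".
    answer = await chain.ainvoke({"question": question})
    return answer.content


if __name__ == "__main__":
    print(asyncio.run(ask("What is a layer?")))
```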