|
import chromadb |
|
from chromadb.config import Settings |
|
import json |
|
import csv |
|
from langchain_community.document_loaders import PyPDFLoader |
|
from langchain.text_splitter import CharacterTextSplitter |
|
from langchain_community.embeddings import OpenAIEmbeddings |
|
|
|
|
|
class MyDatabase:
    """Chroma collection of Bhagavad Gita verse translations.

    On construction the verse CSV is read, a JSON snapshot of its rows is
    written, and every row's translation is added to the collection so it
    can be searched with :meth:`get_data`.

    The class attributes below hold the collection name and file paths;
    override them on a subclass or instance to point at other data.
    """

    COLLECTION_NAME = "bhagavat_gita"
    CSV_PATH = "./data/gita_data.csv"
    JSON_PATH = "./data/gita_data_new.json"

    def __init__(self):
        """Create the Chroma client and immediately load the verse data."""
        self.chroma_client = chromadb.Client()
        self.initialize()

    def get_collection(self):
        """Return the verse collection, creating it if it does not exist."""
        return self.chroma_client.get_or_create_collection(
            name=self.COLLECTION_NAME
        )

    def initialize(self) -> None:
        """Load the verse CSV, snapshot it as JSON, and index it.

        Each CSV row must provide the ``translation``, ``chapter_number``
        and ``chapter_verse`` columns. Rows are added with ids ``doc0``,
        ``doc1``, ... in file order.

        Raises:
            OSError: If the CSV cannot be read or the JSON cannot be written.
            KeyError: If a row lacks one of the required columns.
        """
        print("Adding Data ...")
        collection = self.get_collection()

        print("Loading Bhagavat Gita ...")
        with open(
            self.CSV_PATH, mode="r", newline="", encoding="utf-8"
        ) as csvfile:
            documents = list(csv.DictReader(csvfile))

        # Keep a JSON copy of the parsed rows next to the source data.
        with open(self.JSON_PATH, "w", encoding="utf-8") as f:
            json.dump(documents, f, indent=1)

        if not documents:
            # Nothing to index. Skip the add call: Chroma appears to reject
            # empty id lists -- TODO confirm for the installed version.
            print("No rows found; nothing added.")
            return

        collection.add(
            documents=[document["translation"] for document in documents],
            metadatas=[
                {
                    "source": "bhagavat_gita",
                    "chapter_number": document["chapter_number"],
                    "verse_number": document["chapter_verse"],
                }
                for document in documents
            ],
            ids=[f"doc{i}" for i in range(len(documents))],
        )

        print("Added data ...")

    def get_data(
        self,
        query: str = "is knowledge superior to action?",
        n_results: int = 5,
    ):
        """Query the collection and return the matches for ``query``.

        Args:
            query: Free-text question to search the collection with.
            n_results: Maximum number of matches to request.

        Returns:
            The result object returned by the collection's ``query`` call;
            it is also pretty-printed to stdout as JSON.
        """
        print("Querying data ...")
        collection = self.get_collection()
        results = collection.query(
            query_texts=[query],
            n_results=n_results,
        )
        print(json.dumps(results, indent=2))
        return results
|
|
|
|
|
|
|
|
|
|
|
|