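# NOTE(review): "Spaces: Sleeping" was status text captured along with this file (presumably from the hosting page); it is not part of the program.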
import chromadb | |
from chromadb.config import Settings | |
import json | |
import csv | |
from langchain_community.document_loaders import PyPDFLoader | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain_community.embeddings import OpenAIEmbeddings | |
class MyDatabase:
    """Chroma-backed store of Bhagavad Gita verse translations.

    Creating an instance builds a Chroma client with default settings (the
    persistent-directory configuration is intentionally left commented out)
    and immediately ingests the verse CSV via :meth:`initialize`.
    """

    # Name of the Chroma collection every method operates on.
    COLLECTION_NAME = "bhagavat_gita"
    # Verse data read by initialize(); resolved against the current directory.
    CSV_PATH = "./data/gita_data.csv"
    # Where initialize() writes a JSON copy of the parsed CSV rows.
    JSON_DUMP_PATH = "./data/gita_data_new.json"

    def __init__(self):
        # Settings(persist_directory="./chroma_db")
        self.chroma_client = chromadb.Client()
        self.initialize()

    def get_collection(self):
        """Return the verse collection, creating it on first use."""
        return self.chroma_client.get_or_create_collection(
            name=self.COLLECTION_NAME
        )

    def initialize(self):
        """Load the verse CSV and store one record per row in the collection.

        Steps:
          1. Parse ``CSV_PATH`` with ``csv.DictReader``.
          2. Write the parsed rows to ``JSON_DUMP_PATH``.
          3. Upsert each row's ``translation`` text under the id
             ``doc<row index>``, with ``chapter_number`` and ``chapter_verse``
             stored as metadata (the latter under the key ``verse_number``).

        The method is safe to call more than once: ids are deterministic and
        records are upserted, so a repeated call rewrites the same records
        instead of re-adding ids that already exist.

        Raises:
            OSError: if the CSV cannot be read or the JSON copy cannot be
                written.
            KeyError: if a row lacks the ``translation``, ``chapter_number``
                or ``chapter_verse`` column.
        """
        print("Adding Data ...")
        collection = self.get_collection()

        print("Loading Bhagavat Gita ...")
        with open(
            self.CSV_PATH, mode="r", newline="", encoding="utf-8"
        ) as csvfile:
            rows = list(csv.DictReader(csvfile))

        # Keep a JSON copy of exactly what was parsed from the CSV.
        with open(self.JSON_DUMP_PATH, "w", encoding="utf-8") as f:
            json.dump(rows, f, indent=1)

        if not rows:
            # Nothing to store; avoid handing empty lists to the collection.
            print("No verses found; nothing added.")
            return

        collection.upsert(
            documents=[row["translation"] for row in rows],
            metadatas=[
                {
                    "source": "bhagavat_gita",
                    "chapter_number": row["chapter_number"],
                    "verse_number": row["chapter_verse"],
                }
                for row in rows
            ],
            ids=[f"doc{i}" for i in range(len(rows))],
        )

        # TODO: Vishnu Puranam ingestion from ./data/vishnu_puranam.pdf
        # (load the PDF, split it into chunks, store them) is not implemented.
        print("Added data ...")

    def get_data(
        self,
        query: str = "is knowledge superior to action?",
        n_results: int = 5,
    ):
        """Query the verse collection and return the raw query result.

        Args:
            query: Free-text question; passed to the collection as the single
                query text (Chroma embeds it).
            n_results: How many results to request. Defaults to 5.

        Returns:
            Whatever ``collection.query`` returns. The same object is also
            printed as indented JSON.
        """
        print("Querying data ...")
        collection = self.get_collection()
        results = collection.query(
            query_texts=[query],  # Chroma will embed this for you
            n_results=n_results,  # how many results to return
        )
        print(json.dumps(results, indent=2))
        return results
# Example usage:
# mydb = MyDatabase()
# mydb.initialize()  # redundant: __init__ already calls initialize()
# mydb.get_data("What is karma?")