vikramvasudevan's picture
Upload folder using huggingface_hub
5f051a0 verified
raw
history blame
2.92 kB
import chromadb
from chromadb.config import Settings
import json
import csv
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
class MyDatabase:
    """Chroma-backed store of Bhagavat Gita verse translations.

    Constructing an instance creates a Chroma client and immediately loads
    the verses from ``CSV_PATH`` into the ``COLLECTION_NAME`` collection.
    """

    COLLECTION_NAME = "bhagavat_gita"
    CSV_PATH = "./data/gita_data.csv"
    JSON_DUMP_PATH = "./data/gita_data_new.json"

    def __init__(self):
        """Create the Chroma client and load the verse data into it."""
        # No persist directory is configured; a
        # Settings(persist_directory="./chroma_db") variant was left disabled.
        self.chroma_client = chromadb.Client()
        self.initialize()

    def get_collection(self):
        """Return the verse collection, creating it if it does not exist."""
        return self.chroma_client.get_or_create_collection(
            name=self.COLLECTION_NAME
        )

    @staticmethod
    def _build_records(rows):
        """Turn parsed CSV rows into the parallel lists Chroma expects.

        Args:
            rows: Sequence of dicts, each providing the ``translation``,
                ``chapter_number`` and ``chapter_verse`` keys.

        Returns:
            A ``(documents, metadatas, ids)`` tuple of equally long lists.
            Ids are positional: ``doc0``, ``doc1``, ...

        Raises:
            KeyError: If a row lacks one of the required keys.
        """
        texts = []
        metadatas = []
        ids = []
        for index, row in enumerate(rows):
            texts.append(row["translation"])
            metadatas.append(
                {
                    "source": "bhagavat_gita",
                    "chapter_number": row["chapter_number"],
                    "verse_number": row["chapter_verse"],
                }
            )
            ids.append(f"doc{index}")
        return texts, metadatas, ids

    def initialize(self):
        """Load the verse CSV and write its rows into the collection.

        The parsed rows are also dumped as JSON to ``JSON_DUMP_PATH``.
        Rows are written with ``upsert`` so that calling this method more
        than once on the same client (``__init__`` already calls it) rewrites
        the same ids instead of re-adding ids that already exist.
        """
        print("Adding Data ...")
        collection = self.get_collection()

        print("Loading Bhagavat Gita ...")
        with open(
            self.CSV_PATH, mode="r", newline="", encoding="utf-8"
        ) as csvfile:
            rows = list(csv.DictReader(csvfile))

        # Keep a JSON copy of exactly what was parsed from the CSV.
        with open(self.JSON_DUMP_PATH, "w", encoding="utf-8") as f:
            json.dump(rows, f, indent=1)

        if not rows:
            # Nothing to store; avoid handing Chroma empty id/document lists.
            print("No data to add ...")
            return

        texts, metadatas, ids = self._build_records(rows)
        collection.upsert(documents=texts, metadatas=metadatas, ids=ids)

        # NOTE: ingestion of ./data/vishnu_puranam.pdf (PyPDFLoader +
        # CharacterTextSplitter) was drafted here earlier and is disabled.
        print("Added data ...")

    def get_data(
        self,
        query: str = "is knowledge superior to action?",
        n_results: int = 5,
    ):
        """Query the verse collection and return Chroma's result object.

        Args:
            query: Free-text question to search for.
            n_results: How many results to request from Chroma.

        Returns:
            Whatever ``collection.query`` returns; it is also printed as
            indented JSON before being returned.
        """
        print("Querying data ...")
        collection = self.get_collection()
        results = collection.query(
            query_texts=[query],  # Chroma will embed this for you
            n_results=n_results,  # how many results to return
        )
        print(json.dumps(results, indent=2))
        return results
# mydb = MyDatabase()
# mydb.initialize()
# mydb.get_data("What is karma?")