import streamlit as st | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain_chroma import Chroma | |
from langchain_community.vectorstores import InMemoryVectorStore | |
from langchain_community.document_loaders import PyPDFLoader | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
def load_embedding_model(model):
    """Build a Hugging Face embedding wrapper for the given model.

    Args:
        model: Model identifier forwarded to ``HuggingFaceEmbeddings`` as
            ``model_name`` (for example
            ``"sentence-transformers/all-MiniLM-L12-v2"``).

    Returns:
        The ``HuggingFaceEmbeddings`` instance created for ``model``.
    """
    return HuggingFaceEmbeddings(model_name=model)
def load_vector_store(
    *,
    model_name: str = "sentence-transformers/all-MiniLM-L12-v2",
    collection_name: str = "main_store",
    persist_directory: str = "./chroma",
) -> Chroma:
    """Create the Chroma vector store used by the application.

    Called with no arguments, this builds the same store as before: the
    ``main_store`` collection under ``./chroma``, embedded with
    ``sentence-transformers/all-MiniLM-L12-v2``. The keyword-only parameters
    exist so that a different model, collection, or location can be used
    without editing this function.

    Args:
        model_name: Embedding model identifier handed to
            ``load_embedding_model``.
        collection_name: Value passed to ``Chroma`` as ``collection_name``.
        persist_directory: Value passed to ``Chroma`` as
            ``persist_directory``.

    Returns:
        The configured ``Chroma`` instance.
    """
    embeddings = load_embedding_model(model_name)
    return Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        persist_directory=persist_directory,
    )
def process_pdf(pdf, vector_store, chunk_size=1000, chunk_overlap=200):
    """Load a PDF, split it into chunks, and add the chunks to a vector store.

    Args:
        pdf: Location of the PDF, passed unchanged to ``PyPDFLoader``.
        vector_store: Object exposing ``add_documents``; it receives the
            chunks produced from the PDF.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            Defaults to 1000, the previously hard-coded value.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            200, the previously hard-coded value.

    Returns:
        None. The only effect is the documents added to ``vector_store``.
    """
    pages = PyPDFLoader(pdf).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(pages)
    if not chunks:
        # Nothing was produced by the splitter (e.g. a PDF without
        # extractable text). Skip the write rather than hand the store an
        # empty batch, which some backends may reject.
        return
    vector_store.add_documents(chunks)