"""
ZamAI Embeddings Model Setup
This script sets up the Multilingual ZamAI Embeddings model and vector database.
"""
import os

import chromadb
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore


def setup_embedding_model(corpus_path="data/text_corpus/",
                          db_path="./models/embeddings/chroma_db",
                          model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
"""
Set up the embedding model and vector database for multilingual document retrieval.
Args:
corpus_path: Path to the text corpus directory
db_path: Path where the ChromaDB database will be stored
model_name: Name of the HuggingFace embedding model to use
Returns:
query_engine: A query engine for searching the indexed documents
"""
    # Ensure directories exist
    os.makedirs(corpus_path, exist_ok=True)
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    # Load documents if the corpus directory has files
    if os.listdir(corpus_path):
        text_docs = SimpleDirectoryReader(corpus_path).load_data()
    else:
        print(f"Warning: No documents found in {corpus_path}")
        text_docs = []
    # Initialize the embedding model
    embed_model = HuggingFaceEmbedding(model_name=model_name)

    # Initialize ChromaDB; PersistentClient keeps the collection on disk at
    # db_path, so embeddings survive across runs
    chroma_client = chromadb.PersistentClient(path=db_path)
    collection = chroma_client.get_or_create_collection("zamAI_collection")
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # Build the index from documents, or attach to the existing vector store
    if text_docs:
        index = VectorStoreIndex.from_documents(
            text_docs, storage_context=storage_context, embed_model=embed_model
        )
    else:
        # No documents yet: attach an index to the (possibly empty) vector
        # store; from_vector_store derives its own storage context from the
        # vector store, so none is passed here
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            embed_model=embed_model,
        )
    # Create a query engine over the index
    query_engine = index.as_query_engine()

    return {
        "index": index,
        "query_engine": query_engine,
        "embed_model": embed_model,
        "vector_store": vector_store,
    }


if __name__ == "__main__":
    # Example usage
    embedding_components = setup_embedding_model()
    print("Embedding model and vector store setup complete!")
    print("You can now use the embedding_components['query_engine'] to search your documents.")