"""
ZamAI Embeddings Model Setup
This script sets up the Multilingual ZamAI Embeddings model and vector database.
"""
import os

import chromadb
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore


def setup_embedding_model(corpus_path="data/text_corpus/",
                          db_path="./models/embeddings/chroma_db",
                          model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
"""
Set up the embedding model and vector database for multilingual document retrieval.
Args:
corpus_path: Path to the text corpus directory
db_path: Path where the ChromaDB database will be stored
model_name: Name of the HuggingFace embedding model to use
Returns:
query_engine: A query engine for searching the indexed documents
"""
    # Ensure directories exist
    os.makedirs(corpus_path, exist_ok=True)
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    # Load documents if the corpus directory has files
    if os.listdir(corpus_path):
        text_docs = SimpleDirectoryReader(corpus_path).load_data()
    else:
        print(f"Warning: No documents found in {corpus_path}")
        text_docs = []
    # Initialize the embedding model
    embed_model = HuggingFaceEmbedding(model_name=model_name)

    # Initialize ChromaDB; PersistentClient keeps the collection on disk at
    # db_path, so embeddings survive across runs
    chroma_client = chromadb.PersistentClient(path=db_path)
    collection = chroma_client.get_or_create_collection("zamAI_collection")
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # Build the index from documents, or attach to the existing vector store
    if text_docs:
        index = VectorStoreIndex.from_documents(
            text_docs, storage_context=storage_context, embed_model=embed_model
        )
    else:
        # No documents yet: attach an index to the (possibly empty) vector
        # store; from_vector_store derives its own storage context from the
        # vector store, so none is passed here
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            embed_model=embed_model,
        )
    # Create a query engine over the index
    query_engine = index.as_query_engine()

    return {
        "index": index,
        "query_engine": query_engine,
        "embed_model": embed_model,
        "vector_store": vector_store,
    }


if __name__ == "__main__":
    # Example usage
    embedding_components = setup_embedding_model()
    print("Embedding model and vector store setup complete!")
    print("You can now use the embedding_components['query_engine'] to search your documents.")