Spaces:

Priyadharshana
/

Your_legal_chatbot

Running

App Files Files Community

Priyadharshana committed on Mar 23

Commit

55097f7

verified ·

1 Parent(s): d0b0b99

Upload Ingest.py

Browse files

Files changed (1) hide show

Ingest.py +61 -0

Ingest.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import ray
+import logging
+from langchain_community.document_loaders import DirectoryLoader
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import FAISS
+from faiss import IndexFlatL2  # Assuming using L2 distance for simplicity
+# Initialize Ray
+ray.init()
+# Set up basic configuration for logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# Load documents with logging
+logging.info("Loading documents...")
+loader = DirectoryLoader('data', glob="./*.txt")
+documents = loader.load()
+# Extract text from documents and split into manageable texts with logging
+logging.info("Extracting and splitting texts from documents...")
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
+texts = []
+for document in documents:
+    if hasattr(document, 'get_text'):
+        text_content = document.get_text()  # Adjust according to actual method
+    else:
+        text_content = ""  # Default to empty string if no text method is available
+    texts.extend(text_splitter.split_text(text_content))
+# Define embedding function
+def embedding_function(text):
+    embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
+    return embeddings_model.embed_query(text)
+# Create FAISS index for embeddings
+index = IndexFlatL2(768)  # Dimension of embeddings, adjust as needed
+# Assuming docstore as a simple dictionary to store document texts
+docstore = {i: text for i, text in enumerate(texts)}
+index_to_docstore_id = {i: i for i in range(len(texts))}
+# Initialize FAISS
+faiss_db = FAISS(embedding_function, index, docstore, index_to_docstore_id)
+# Process and store embeddings
+logging.info("Storing embeddings in FAISS...")
+for i, text in enumerate(texts):
+    embedding = embedding_function(text)
+    faiss_db.add_documents([embedding])
+# Exporting the vector embeddings database with logging
+logging.info("Exporting the vector embeddings database...")
+faiss_db.save_local("ipc_embed_db")
+# Log a message to indicate the completion of the process
+logging.info("Process completed successfully.")
+# Shutdown Ray after the process
+ray.shutdown()