Priyadharshana committed on
Commit
55097f7
·
verified ·
1 Parent(s): d0b0b99

Upload Ingest.py

Browse files
Files changed (1) hide show
  1. Ingest.py +61 -0
Ingest.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build a FAISS vector index from the ``.txt`` files in ``data``.

The script loads the text files, splits them into overlapping chunks,
embeds every chunk with the InLegalBERT sentence-embedding model and saves
the resulting FAISS store to the ``ipc_embed_db`` directory.
"""

import ray
import logging
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from faiss import IndexFlatL2  # Assuming using L2 distance for simplicity

EMBEDDING_MODEL_NAME = "law-ai/InLegalBERT"

# Lazily created, shared embeddings model. Loading the transformer is
# expensive, so it must happen once rather than once per embedded text.
_embeddings_model = None


def get_embeddings_model():
    """Return the shared ``HuggingFaceEmbeddings`` instance, creating it on first use."""
    global _embeddings_model
    if _embeddings_model is None:
        _embeddings_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    return _embeddings_model


def embedding_function(text):
    """Return the embedding vector for ``text``.

    Kept for backward compatibility; it reuses the shared model instead of
    re-instantiating it on every call.
    """
    return get_embeddings_model().embed_query(text)


# Initialize Ray
# NOTE(review): no Ray tasks or actors are submitted anywhere in this script;
# the init/shutdown pair is preserved as-is - confirm whether it is needed.
ray.init()

try:
    # Set up basic configuration for logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # Load documents with logging
    logging.info("Loading documents...")
    loader = DirectoryLoader('data', glob="./*.txt")
    documents = loader.load()

    # Split the loaded documents into manageable chunks. split_documents reads
    # each Document's page_content and carries its metadata over to the chunks.
    logging.info("Extracting and splitting texts from documents...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
    chunks = text_splitter.split_documents(documents)
    logging.info("Loaded %d document(s), produced %d chunk(s).", len(documents), len(chunks))

    # An index cannot be built from zero vectors; fail loudly instead of
    # exporting an empty (and useless) database.
    if not chunks:
        raise RuntimeError("No text chunks were produced from 'data'; nothing to index.")

    # Embed every chunk and build the FAISS store in one step. from_documents
    # creates the index, the docstore and the id mapping consistently.
    logging.info("Storing embeddings in FAISS...")
    faiss_db = FAISS.from_documents(chunks, get_embeddings_model())

    # Exporting the vector embeddings database with logging
    logging.info("Exporting the vector embeddings database...")
    faiss_db.save_local("ipc_embed_db")

    # Log a message to indicate the completion of the process
    logging.info("Process completed successfully.")
finally:
    # Shutdown Ray after the process, even when a step above failed
    ray.shutdown()