Spaces:

gufett0
/

chatbot-llamaindex

Sleeping

App Files Community

gufett0 committed on Sep 18, 2024

Commit

3968a50

1 Parent(s): bf1cde5

switched back to langchain

Browse files

Files changed (1) hide show

backend2.py +24 -11

backend2.py CHANGED Viewed

@@ -59,38 +59,51 @@ def prepare_documents(documents):
     logger.debug("Preparing documents for embedding.")
     start_time = time.time()
     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-    # It splits text into chunks of 1000 characters each with a 150-character overlap.
-    #text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
     texts = text_splitter.create_documents([doc["content"] for doc in documents], metadatas=[{"source": os.path.basename(doc["source"])} for doc in documents])
     if not texts:
-        logger.error("No texts to embed.")
         return None
     modelPath = "sentence-transformers/all-MiniLM-l6-v2"
     model_kwargs = {'device': device}
     encode_kwargs = {'normalize_embeddings': False}
-    embeddings = HuggingFaceEmbeddings(model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs )
     try:
         db = FAISS.from_documents(texts, embeddings)
     except Exception as e:
-        logger.error("Error creating FAISS index: %s", e)
         return None
     end_time = time.time()
-    logger.debug("Documents prepared in %.2f seconds.", end_time - start_time)
     return db
 def get_context_sources(question, db):
     start_time = time.time()
-    docs = db.similarity_search(question, k=3)
-    context = " ".join([doc.page_content for doc in docs])
-    sources = ", ".join(set([doc.metadata['source'] for doc in docs]))
     end_time = time.time()
-    logger.debug("Similarity search done in %.2f seconds.", end_time - start_time)
     return context, sources

     logger.debug("Preparing documents for embedding.")
     start_time = time.time()
+    if not documents:
+        logger.error("No documents to prepare.")
+        return None
     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
     texts = text_splitter.create_documents([doc["content"] for doc in documents], metadatas=[{"source": os.path.basename(doc["source"])} for doc in documents])
     if not texts:
+        logger.error("No texts to embed after splitting.")
         return None
+    logger.debug(f"Created {len(texts)} text chunks.")
     modelPath = "sentence-transformers/all-MiniLM-l6-v2"
     model_kwargs = {'device': device}
     encode_kwargs = {'normalize_embeddings': False}
+    embeddings = HuggingFaceEmbeddings(model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
     try:
         db = FAISS.from_documents(texts, embeddings)
+        logger.debug("FAISS index created successfully.")
     except Exception as e:
+        logger.error(f"Error creating FAISS index: {e}")
         return None
     end_time = time.time()
+    logger.debug(f"Documents prepared in {end_time - start_time:.2f} seconds.")
     return db
 def get_context_sources(question, db):
     start_time = time.time()
+    if db is None:
+        logger.error("Database is None. Cannot perform similarity search.")
+        return "", ""
+    try:
+        docs = db.similarity_search(question, k=3)
+        context = " ".join([doc.page_content for doc in docs])
+        sources = ", ".join(set([doc.metadata['source'] for doc in docs]))
+    except Exception as e:
+        logger.error(f"Error during similarity search: {e}")
+        return "", ""
     end_time = time.time()
+    logger.debug(f"Similarity search done in {end_time - start_time:.2f} seconds.")
     return context, sources