gufett0 committed on
Commit 3968a50 · 1 Parent(s): bf1cde5

switched back to langchain

Files changed (1)
  1. backend2.py +24 -11
backend2.py CHANGED
@@ -59,38 +59,51 @@ def prepare_documents(documents):
     logger.debug("Preparing documents for embedding.")
     start_time = time.time()
 
+    if not documents:
+        logger.error("No documents to prepare.")
+        return None
+
     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-    # It splits text into chunks of 1000 characters each with a 150-character overlap.
-    #text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
     texts = text_splitter.create_documents([doc["content"] for doc in documents], metadatas=[{"source": os.path.basename(doc["source"])} for doc in documents])
+
     if not texts:
-        logger.error("No texts to embed.")
+        logger.error("No texts to embed after splitting.")
         return None
 
+    logger.debug(f"Created {len(texts)} text chunks.")
+
     modelPath = "sentence-transformers/all-MiniLM-l6-v2"
     model_kwargs = {'device': device}
     encode_kwargs = {'normalize_embeddings': False}
-    embeddings = HuggingFaceEmbeddings(model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs )
+    embeddings = HuggingFaceEmbeddings(model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
 
     try:
         db = FAISS.from_documents(texts, embeddings)
+        logger.debug("FAISS index created successfully.")
     except Exception as e:
-        logger.error("Error creating FAISS index: %s", e)
+        logger.error(f"Error creating FAISS index: {e}")
         return None
 
     end_time = time.time()
-    logger.debug("Documents prepared in %.2f seconds.", end_time - start_time)
+    logger.debug(f"Documents prepared in {end_time - start_time:.2f} seconds.")
     return db
 
-
 def get_context_sources(question, db):
     start_time = time.time()
 
-    docs = db.similarity_search(question, k=3)
-    context = " ".join([doc.page_content for doc in docs])
-    sources = ", ".join(set([doc.metadata['source'] for doc in docs]))
+    if db is None:
+        logger.error("Database is None. Cannot perform similarity search.")
+        return "", ""
+
+    try:
+        docs = db.similarity_search(question, k=3)
+        context = " ".join([doc.page_content for doc in docs])
+        sources = ", ".join(set([doc.metadata['source'] for doc in docs]))
+    except Exception as e:
+        logger.error(f"Error during similarity search: {e}")
+        return "", ""
 
     end_time = time.time()
-    logger.debug("Similarity search done in %.2f seconds.", end_time - start_time)
+    logger.debug(f"Similarity search done in {end_time - start_time:.2f} seconds.")
 
     return context, sources
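
For orientation, the two functions touched by this commit form a small retrieval pipeline: prepare_documents splits the raw documents, embeds the chunks with sentence-transformers/all-MiniLM-l6-v2, and builds a FAISS index, while get_context_sources runs a top-3 similarity search against that index. Below is a minimal usage sketch, not code from this commit: the backend2 module path, the shape of the documents payload, and the example strings are assumptions inferred from the identifiers visible in the diff.

# Usage sketch (assumed, not part of this commit): wiring the two changed
# functions together. Assumes backend2.py is importable and already defines
# logger, device, and the langchain imports used in the diff.
import logging

logging.basicConfig(level=logging.DEBUG)

from backend2 import prepare_documents, get_context_sources

# Each entry mirrors the shape prepare_documents indexes: a "content" string
# to chunk and embed, plus a "source" path that becomes the chunk metadata.
documents = [
    {
        "content": "LangChain chains a text splitter, an embedding model, "
                   "and a FAISS vector store into one indexing pipeline.",
        "source": "notes/langchain.txt",
    },
]

db = prepare_documents(documents)  # returns None on empty input or FAISS failure
if db is not None:
    context, sources = get_context_sources("What does LangChain do?", db)
    print("Context:", context)
    print("Sources:", sources)

Returning ("", "") instead of raising keeps a caller's unpacking (context, sources = ...) safe when the index is missing or the search fails, which is what the new guard clauses in this diff provide.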