samim2024 committed on
Commit
9bb73f5
·
verified ·
1 Parent(s): c66d3b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -8
app.py CHANGED
@@ -5,11 +5,14 @@
5
  import streamlit as st
6
  import requests
7
  from bs4 import BeautifulSoup
8
- from langchain.document_loaders import TextLoader #reads in a file as text and places it all into one document.
9
- from langchain.indexes import VectorstoreIndexCreator #Logic for creating indexes.
10
- from langchain.vectorstores import DocArrayInMemorySearch #document index provided by Docarray that stores documents in memory.
11
  from sentence_transformers import SentenceTransformer
12
  from langchain_community.llms import HuggingFaceEndpoint
 
 
 
 
13
 
14
  #import vertexai
15
  #from langchain.llms import VertexAI
@@ -52,11 +55,21 @@ def create_langchain_index(input_text):
52
  print("--indexing---")
53
  get_text(input_text)
54
  loader = TextLoader("text\\temp.txt", encoding='utf-8')
55
- data = loader.load()
56
- data = str(data)
57
- embeddings = model.encode(data)
58
- index = VectorstoreIndexCreator(vectorstore_cls=DocArrayInMemorySearch,embedding=embeddings).from_loaders([loader])
59
- return index
 
 
 
 
 
 
 
 
 
 
60
 
61
  # @st.cache_resource
62
  # def get_basic_page_details(input_text,summary_query,tweet_query,ln_query):
 
5
  import streamlit as st
6
  import requests
7
  from bs4 import BeautifulSoup
8
+ #from langchain.indexes import VectorstoreIndexCreator #Logic for creating indexes.
9
+ #from langchain.vectorstores import DocArrayInMemorySearch #document index provided by Docarray that stores documents in memory.
 
10
  from sentence_transformers import SentenceTransformer
11
  from langchain_community.llms import HuggingFaceEndpoint
12
+ from langchain_chroma import Chroma
13
+ from langchain_community.document_loaders import TextLoader
14
+ from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings,)
15
+ from langchain_text_splitters import CharacterTextSplitter
16
 
17
  #import vertexai
18
  #from langchain.llms import VertexAI
 
55
  print("--indexing---")
56
  get_text(input_text)
57
  loader = TextLoader("text\\temp.txt", encoding='utf-8')
58
+ documents = loader.load()
59
+ # split it into chunks
60
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
61
+ docs = text_splitter.split_documents(documents)
62
+ print(docs)
63
+ # create the open-source embedding function
64
+ embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
65
+ # load it into Chroma
66
+ db = Chroma.from_documents(docs, embeddings)
67
+ persist_directory = "chroma_db"
68
+ vectordb = Chroma.from_documents(
69
+ documents=docs, embedding=embeddings, persist_directory=persist_directory
70
+ )
71
+ new_db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
72
+ return new_db
73
 
74
  # @st.cache_resource
75
  # def get_basic_page_details(input_text,summary_query,tweet_query,ln_query):