rockerritesh committed on
Commit
55743e6
·
verified ·
1 Parent(s): 819112e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -5
app.py CHANGED
@@ -1,19 +1,22 @@
1
  import streamlit as st
2
  from sklearn.feature_extraction.text import TfidfVectorizer
3
  from sklearn.metrics.pairwise import cosine_similarity
 
4
 
5
- # Streamlit sidebar for file upload
6
  st.sidebar.title("Upload your text file")
7
  uploaded_file = st.sidebar.file_uploader("Choose a text file", type=["txt"])
8
 
 
 
9
 
10
  if uploaded_file:
11
  # Read the text file content
12
  text_data = uploaded_file.read().decode("utf-8")
13
 
14
- # Split the text into sentences
15
- sentences = text_data.split('\n')
16
-
17
  # Initialize the TF-IDF Vectorizer
18
  vectorizer = TfidfVectorizer().fit(sentences)
19
  vectors = vectorizer.transform(sentences) # Keep it sparse
@@ -28,7 +31,7 @@ if uploaded_file:
28
  # Get indices of top N similar sentences
29
  top_indices = similarities.argsort()[-top_n:][::-1]
30
 
31
- # Return top N most similar sentences
32
  return [sentences[i] for i in top_indices]
33
 
34
  # Streamlit chat elements
 
1
  import streamlit as st
2
  from sklearn.feature_extraction.text import TfidfVectorizer
3
  from sklearn.metrics.pairwise import cosine_similarity
4
+ import textwrap
5
 
6
+ # Streamlit sidebar for file upload and chunk size slider
7
  st.sidebar.title("Upload your text file")
8
  uploaded_file = st.sidebar.file_uploader("Choose a text file", type=["txt"])
9
 
10
+ # Slider for chunk size selection
11
+ chunk_size = st.sidebar.slider("Select chunk size", min_value=100, max_value=500, step=100, value=300)
12
 
13
  if uploaded_file:
14
  # Read the text file content
15
  text_data = uploaded_file.read().decode("utf-8")
16
 
17
+ # Split the text into chunks based on the selected chunk size
18
+ sentences = textwrap.wrap(text_data, chunk_size)
19
+
20
  # Initialize the TF-IDF Vectorizer
21
  vectorizer = TfidfVectorizer().fit(sentences)
22
  vectors = vectorizer.transform(sentences) # Keep it sparse
 
31
  # Get indices of top N similar sentences
32
  top_indices = similarities.argsort()[-top_n:][::-1]
33
 
34
+ # Return top N most similar chunks
35
  return [sentences[i] for i in top_indices]
36
 
37
  # Streamlit chat elements