Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,19 +1,22 @@
|
|
1 |
import streamlit as st
|
2 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
3 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
4 |
|
5 |
-
# Streamlit sidebar for file upload
|
6 |
st.sidebar.title("Upload your text file")
|
7 |
uploaded_file = st.sidebar.file_uploader("Choose a text file", type=["txt"])
|
8 |
|
|
|
|
|
9 |
|
10 |
if uploaded_file:
|
11 |
# Read the text file content
|
12 |
text_data = uploaded_file.read().decode("utf-8")
|
13 |
|
14 |
-
# Split the text into sentences
|
15 |
-
sentences =
|
16 |
-
|
17 |
# Initialize the TF-IDF Vectorizer
|
18 |
vectorizer = TfidfVectorizer().fit(sentences)
|
19 |
vectors = vectorizer.transform(sentences) # Keep it sparse
|
@@ -28,7 +31,7 @@ if uploaded_file:
|
|
28 |
# Get indices of top N similar sentences
|
29 |
top_indices = similarities.argsort()[-top_n:][::-1]
|
30 |
|
31 |
-
# Return top N most similar sentences
|
32 |
return [sentences[i] for i in top_indices]
|
33 |
|
34 |
# Streamlit chat elements
|
|
|
1 |
import streamlit as st
|
2 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
3 |
from sklearn.metrics.pairwise import cosine_similarity
|
4 |
+
import textwrap
|
5 |
|
6 |
+
# Streamlit sidebar for file upload and chunk size slider
|
7 |
st.sidebar.title("Upload your text file")
|
8 |
uploaded_file = st.sidebar.file_uploader("Choose a text file", type=["txt"])
|
9 |
|
10 |
+
# Slider for chunk size selection
|
11 |
+
chunk_size = st.sidebar.slider("Select chunk size", min_value=100, max_value=500, step=100, value=300)
|
12 |
|
13 |
if uploaded_file:
|
14 |
# Read the text file content
|
15 |
text_data = uploaded_file.read().decode("utf-8")
|
16 |
|
17 |
+
# Split the text into chunks based on the selected chunk size
|
18 |
+
sentences = textwrap.wrap(text_data, chunk_size)
|
19 |
+
|
20 |
# Initialize the TF-IDF Vectorizer
|
21 |
vectorizer = TfidfVectorizer().fit(sentences)
|
22 |
vectors = vectorizer.transform(sentences) # Keep it sparse
|
|
|
31 |
# Get indices of top N similar sentences
|
32 |
top_indices = similarities.argsort()[-top_n:][::-1]
|
33 |
|
34 |
+
# Return top N most similar chunks
|
35 |
return [sentences[i] for i in top_indices]
|
36 |
|
37 |
# Streamlit chat elements
|