Spaces:

rockerritesh
/

Chat

Sleeping

rockerritesh committed on Sep 22, 2024

Commit

55743e6

verified ·

1 Parent(s): 819112e

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,19 +1,22 @@
 import streamlit as st
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-# Streamlit sidebar for file upload
 st.sidebar.title("Upload your text file")
 uploaded_file = st.sidebar.file_uploader("Choose a text file", type=["txt"])
 if uploaded_file:
     # Read the text file content
     text_data = uploaded_file.read().decode("utf-8")
-    # Split the text into sentences
-    sentences = text_data.split('\n')
     # Initialize the TF-IDF Vectorizer
     vectorizer = TfidfVectorizer().fit(sentences)
     vectors = vectorizer.transform(sentences)  # Keep it sparse
@@ -28,7 +31,7 @@ if uploaded_file:
         # Get indices of top N similar sentences
         top_indices = similarities.argsort()[-top_n:][::-1]
-        # Return top N most similar sentences
         return [sentences[i] for i in top_indices]
     # Streamlit chat elements

 import streamlit as st
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+import textwrap
+# Streamlit sidebar for file upload and chunk size slider
 st.sidebar.title("Upload your text file")
 uploaded_file = st.sidebar.file_uploader("Choose a text file", type=["txt"])
+# Slider for chunk size selection
+chunk_size = st.sidebar.slider("Select chunk size", min_value=100, max_value=500, step=100, value=300)
 if uploaded_file:
     # Read the text file content
     text_data = uploaded_file.read().decode("utf-8")
+    # Split the text into chunks based on the selected chunk size
+    sentences = textwrap.wrap(text_data, chunk_size)
     # Initialize the TF-IDF Vectorizer
     vectorizer = TfidfVectorizer().fit(sentences)
     vectors = vectorizer.transform(sentences)  # Keep it sparse
         # Get indices of top N similar sentences
         top_indices = similarities.argsort()[-top_n:][::-1]
+        # Return top N most similar chunks
         return [sentences[i] for i in top_indices]
     # Streamlit chat elements