Spaces:
Sleeping
Sleeping
added app.py
Browse files
app.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import csv
|
4 |
+
import chardet
|
5 |
+
from io import StringIO, BytesIO
|
6 |
+
import base64
|
7 |
+
|
8 |
+
# Page header: the app title, then the usage instructions rendered as one
# st.text element per line, in this order.
st.title('ASCII characters checker')
for intro_line in (
    'This app will first check the csv file format.',
    'It will then check the CSV file for any ASCII characters and convert them into searchable strings.',
    'If found, download the file and search for <0x in your text editor.',
):
    st.text(intro_line)

# Upload widget restricted to .csv files. The rest of the script only runs
# once this is no longer None.
uploaded_file = st.file_uploader(label="Choose a CSV file", type=['csv'])

# Toggle for re-encoding the upload as UTF-8; ticked by default.
convert_to_utf8 = st.checkbox(label="Convert file to UTF-8", value=True)

if uploaded_file is not None:
    # Raw bytes exactly as uploaded.
    bytes_data = uploaded_file.getvalue()

    # Detect the encoding from a small sample first, for speed.
    result = chardet.detect(bytes_data[:1024])
    # chardet reports None when it cannot decide (e.g. an empty upload);
    # fall back to UTF-8 rather than passing None to bytes.decode().
    original_encoding = result['encoding'] or 'utf-8'

    try:
        string_data = bytes_data.decode(original_encoding)
    except (UnicodeDecodeError, LookupError):
        # The sample was misleading (typically: plain ASCII in the first 1 KiB
        # and other bytes further down) or the codec name is unknown to
        # Python. Re-detect over the whole payload and decode leniently so a
        # stray byte cannot crash the app.
        original_encoding = chardet.detect(bytes_data)['encoding'] or 'utf-8'
        try:
            string_data = bytes_data.decode(original_encoding, errors='replace')
        except LookupError:
            original_encoding = 'utf-8'
            string_data = bytes_data.decode(original_encoding, errors='replace')

    # Show the encoding that was actually used to decode the file.
    st.write(f"Detected encoding: {original_encoding}")

    if not convert_to_utf8:
        # The user opted out of UTF-8: force the text down to pure ASCII,
        # substituting '?' for every character that cannot be represented.
        string_data = string_data.encode('ascii', errors='replace').decode('ascii')

    # Text buffer consumed by pandas.read_csv further down.
    string_io_data = StringIO(string_data)

# Translation table: code point -> visible "<0xNN>" marker for every
# non-printable ASCII character (the control characters, which include tab,
# newline and carriage return, plus DEL). Keyed by ordinal so it can be
# passed straight to str.translate.
ascii_replacements = {
    i: f'<0x{i:02X}>' for i in range(128) if not chr(i).isprintable()
}


def replace_ascii_chars(s):
    """Make non-printable ASCII characters in a cell value visible.

    Args:
        s: A single cell value (any scalar).

    Returns:
        For a ``str``: the same text with each non-printable ASCII character
        replaced by its ``<0xNN>`` marker. For a null value (per
        ``pd.isnull``): the value unchanged. For anything else: ``str(s)``.
    """
    if isinstance(s, str):
        # One C-level pass over the string instead of a per-character loop.
        return s.translate(ascii_replacements)
    if pd.isnull(s):
        return s  # Leave NaN/None untouched so missing cells stay missing
    return str(s)  # Non-string, non-null values are stringified

if uploaded_file is not None:
    # Parse every field as verbatim text. Letting pandas infer types would
    # rewrite the data this tool is only supposed to inspect: "007" -> 7,
    # ints next to a blank -> "1.0", literal "NA" -> empty.
    try:
        df = pd.read_csv(string_io_data, sep=',', dtype=str, keep_default_na=False)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        st.error(f'Could not parse the uploaded file as CSV: {exc}')
        st.stop()

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; check the class so a column named "map" cannot confuse
    # the lookup on older pandas versions.
    apply_to_cells = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
    cleaned_df = apply_to_cells(replace_ascii_chars)

    # Header names can carry control characters too.
    cleaned_df.columns = [
        replace_ascii_chars(name) if isinstance(name, str) else name
        for name in df.columns
    ]

    # Decide by comparing values before and after cleaning. Searching the
    # cleaned output for '<0x' would also match files that already contained
    # that literal text.
    values_before = list(df.columns) + [cell for row in df.values for cell in row]
    values_after = list(cleaned_df.columns) + [
        cell for row in cleaned_df.values for cell in row
    ]
    ascii_found = any(
        isinstance(before, str) and before != after
        for before, after in zip(values_before, values_after)
    )
    if ascii_found:
        # Display a message indicating that ASCII characters were found and replaced
        st.info('ASCII characters were found and have been replaced with searchable strings.')
    else:
        # Otherwise tell the user nothing was found
        st.write('No ASCII characters were found in the file.')

    # Serialise to CSV text. to_csv ignores `encoding` when it returns a
    # string, so the encoding is applied explicitly on the next line.
    output_csv = cleaned_df.to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC)
    output_bytes = output_csv.encode(
        'utf-8' if convert_to_utf8 else 'ascii', errors='replace'
    )

    # Native download widget: no raw HTML, no data-URI size limits, and a
    # valid CSV MIME type.
    st.download_button(
        label='Download cleaned CSV file',
        data=output_bytes,
        file_name='cleaned_file.csv',
        mime='text/csv',
    )

    st.write('File cleaned and ready for download.')