arshadvayani committed on
Commit
0b78c35
·
verified ·
1 Parent(s): 327d53b

added app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -0
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import csv
4
+ import chardet
5
+ from io import StringIO, BytesIO
6
+ import base64
7
+
8
# Page heading, followed by a short description of what the tool does.
st.title('ASCII characters checker')
for description_line in (
    'This app will first check the csv file format.',
    'It will then check the CSV file for any ASCII characters and convert them into searchable strings.',
    'If found, download the file and search for <0x in your text editor.',
):
    st.text(description_line)

# Upload widget restricted to .csv files; the rest of the script only runs
# once this holds an uploaded file instead of None.
uploaded_file = st.file_uploader("Choose a CSV file", type=['csv'])

# Ticked by default: emit the cleaned file as UTF-8. When unticked, the
# content is forced down to plain ASCII instead.
convert_to_utf8 = st.checkbox("Convert file to UTF-8", value=True)
19
+
20
# Nothing to process until the user has picked a file: end this script run.
if uploaded_file is None:
    st.stop()

# Raw bytes of the uploaded file.
bytes_data = uploaded_file.getvalue()

# Detect the encoding from a small sample for speed, and show it to the user.
result = chardet.detect(bytes_data[:1024])
original_encoding = result['encoding']
st.write(f"Detected encoding: {original_encoding}")  # Display the detected encoding

# chardet reports None when it cannot decide (e.g. an empty sample); decoding
# with None would raise TypeError, so fall back to UTF-8.
source_encoding = original_encoding or 'utf-8'

try:
    string_data = bytes_data.decode(source_encoding)
except (UnicodeDecodeError, LookupError):
    # The 1 KiB sample was not representative (for instance it looked like
    # pure ASCII but later bytes are not), or the codec name is unknown to
    # Python. Re-detect on the whole payload and decode leniently so one bad
    # byte does not abort the run.
    full_guess = chardet.detect(bytes_data)['encoding'] or 'utf-8'
    try:
        string_data = bytes_data.decode(full_guess, errors='replace')
    except LookupError:
        string_data = bytes_data.decode('utf-8', errors='replace')

if not convert_to_utf8:
    # The user opted out of UTF-8: squash everything outside ASCII to '?'.
    string_data = string_data.encode('ascii', errors='replace').decode('ascii')

# Text buffer handed to pandas for parsing.
string_io_data = StringIO(string_data)
40
+
41
# Translation table: every non-printable ASCII code point (the control
# characters 0x00-0x1F and DEL 0x7F) maps to a visible, searchable token such
# as "<0x0A>". Keys are integer code points, which is the form str.translate
# expects.
ascii_replacements = {
    code: f'<0x{code:02X}>'
    for code in range(128)
    if not chr(code).isprintable()
}


def replace_ascii_chars(s):
    """Make non-printable ASCII characters in a single cell value visible.

    Args:
        s: A cell value taken from the DataFrame; may be a string, a missing
            value (None/NaN), or any other scalar.

    Returns:
        For a string, a copy in which each non-printable ASCII character is
        replaced by its "<0xNN>" token; all other characters are untouched.
        A missing value is returned unchanged. Any other value is returned
        as its ``str()`` representation.
    """
    if isinstance(s, str):
        # One C-level pass over the string; code points that are absent from
        # the table are left as they are.
        return s.translate(ascii_replacements)
    if pd.isnull(s):
        return s  # Keep missing values as missing
    return str(s)  # Normalise remaining scalars to text
54
+
55
# Parse every field as text. Letting pandas infer types would silently alter
# the data this tool is supposed to leave alone ("007" -> 7, "1.50" -> 1.5,
# "NA" -> blank); only control characters should change.
try:
    df = pd.read_csv(string_io_data, sep=',', dtype=str, keep_default_na=False)
except (pd.errors.EmptyDataError, pd.errors.ParserError) as parse_error:
    # Report an empty or malformed upload to the user instead of a traceback.
    st.error(f'Could not parse the CSV file: {parse_error}')
    st.stop()

# Apply the replacement to every cell. DataFrame.applymap is deprecated since
# pandas 2.1 in favour of DataFrame.map. The check is made on the class so a
# column that happens to be called "map" cannot shadow the method lookup.
if hasattr(pd.DataFrame, 'map'):
    cleaned_df = df.map(replace_ascii_chars)
else:
    cleaned_df = df.applymap(replace_ascii_chars)

# A replacement happened exactly when some text cell changed. Comparing
# before/after avoids a false positive on files that already contain the
# literal text "<0x".
ascii_found = any(
    isinstance(before, str) and before != after
    for before, after in zip(df.values.ravel(), cleaned_df.values.ravel())
)
if ascii_found:
    # Tell the user that characters were found and replaced
    st.info('ASCII characters were found and have been replaced with searchable strings.')
else:
    st.write('No ASCII characters were found in the file.')

# Serialise to CSV text. to_csv ignores its `encoding` argument when no path
# is given, so the chosen encoding is applied explicitly below.
output_csv = cleaned_df.to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC)
output_encoding = 'utf-8' if convert_to_utf8 else 'ascii'
csv_bytes = output_csv.encode(output_encoding, errors='replace')

# Embed the file in a data-URI download link ("text/csv" is the registered
# MIME type for CSV).
b64 = base64.b64encode(csv_bytes).decode()
href = f'<a href="data:text/csv;base64,{b64}" download="cleaned_file.csv">Download cleaned CSV file</a>'
st.markdown(href, unsafe_allow_html=True)

st.write('File cleaned and ready for download.')