import streamlit as st
import numpy as np
import pandas as pd
import re
from streamlit_extras.dataframe_explorer import dataframe_explorer
import warnings
from sdv.metadata import SingleTableMetadata
from streamlit_extras.stateful_button import button
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Bidirectional, Conv1D, MaxPooling1D, Flatten, Concatenate, Reshape, RepeatVector
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from streamlit_extras.stylable_container import stylable_container
from ydata_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report
import base64
from sdv.datasets.local import load_csvs
import pyodbc
import pymssql
from streamlit_app import sidebar

warnings.filterwarnings('ignore')
st.set_page_config(
    page_title='Profilify: Your AI Assisted Data Profiling App',
    layout='wide',
    initial_sidebar_state='collapsed'
)
| st.markdown(""" | |
| <style> | |
| /* Remove blank space at top and bottom */ | |
| .block-container { | |
| padding-top: 2.8rem; | |
| /*padding-bottom: 1rem;*/ | |
| } | |
| /* Remove blank space at the center canvas */ | |
| .st-emotion-cache-z5fcl4 { | |
| position: relative; | |
| top: -62px; | |
| } | |
| /* Make the toolbar transparent and the content below it clickable */ | |
| .st-emotion-cache-18ni7ap { | |
| pointer-events: none; | |
| background: rgb(255 255 255 / 0%) | |
| } | |
| .st-emotion-cache-zq5wmm { | |
| pointer-events: auto; | |
| background: rgb(255 255 255); | |
| border-radius: 5px; | |
| } | |
| </style> | |
| """, unsafe_allow_html=True) | |
def load_dataframe_to_sqlserver(df, table_name, connection_string):
    # Establish a connection to the database
    conn = pyodbc.connect(connection_string)
    cursor = conn.cursor()
    # Drop the table if it already exists
    drop_table_sql = f"IF OBJECT_ID('{table_name}', 'U') IS NOT NULL DROP TABLE {table_name}"
    try:
        cursor.execute(drop_table_sql)
        conn.commit()
    except Exception as e:
        st.error(f"Error dropping table. Please try with a different name. Details: {e}")
    # Build a CREATE TABLE statement from the DataFrame columns. Every column
    # is stored as NVARCHAR(MAX), since the data is cast to strings before saving.
    create_table_sql = f"CREATE TABLE {table_name} ("
    for column in df.columns:
        sql_dtype = 'NVARCHAR(MAX)'
        create_table_sql += f"{column} {sql_dtype}, "
    create_table_sql = create_table_sql.rstrip(', ') + ')'
    try:
        # Execute table creation
        cursor.execute(create_table_sql)
        conn.commit()
    except Exception as e:
        st.error(f"Error creating table. Please try with a different name. Details: {e}")
    # Insert DataFrame rows into the table as one parameterized bulk insert
    insert_sql = f"INSERT INTO {table_name} ({', '.join(df.columns)}) VALUES ({', '.join(['?' for _ in df.columns])})"
    try:
        # `fast_executemany` lets pyodbc batch the parameterized inserts
        cursor.fast_executemany = True
        cursor.executemany(insert_sql, df.values.tolist())
        conn.commit()
        st.success(f"Data imported with table name: '{table_name}' successfully.")
    except Exception as e:
        st.error(f"Error inserting data. Please try with a different name. Details: {e}")
    cursor.close()
    conn.close()
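# Example usage (a sketch, not part of the app flow; assumes a SQL Server ODBC
# driver such as "ODBC Driver 17 for SQL Server" is installed and that the
# placeholder credentials are replaced):
#
#   conn_str = ('DRIVER={ODBC Driver 17 for SQL Server};'
#               'SERVER=<server>.database.windows.net;'
#               'DATABASE=<database>;UID=<user>;PWD=<password>')
#   load_dataframe_to_sqlserver(df.astype(str), '[dbo].[PROFILED_DEMO]', conn_str)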
def clear_cache():
    # Drop every key from Streamlit's session state so widgets and cached
    # dataframes are rebuilt on the next rerun
    keys = list(st.session_state.keys())
    for key in keys:
        st.session_state.pop(key)
def set_bg_hack(main_bg):
    '''
    Unpack an image from the root folder and set it as the app background.

    Parameters
    ----------
    main_bg : str
        Path to the background image file.
    '''
    # set bg name
    main_bg_ext = "png"
    st.markdown(
        f"""
        <style>
        .stApp {{
            background: url(data:image/{main_bg_ext};base64,{base64.b64encode(open(main_bg, "rb").read()).decode()});
            background-size: cover
        }}
        </style>
        """,
        unsafe_allow_html=True
    )
#set_bg_hack("bg2.png")
header_style = """
<style>
.header {
    color: black; /* Black text for readability */
    width: 103%;
    font-size: 60px; /* Large font size */
    font-weight: bold; /* Bold text */
    line-height: 1.2; /* Improved readability */
    margin-bottom: 30px; /* Add some space below the header */
    padding: 20px; /* Add padding for better spacing */
    background-image:
        linear-gradient(to right, rgba(255, 140, 0, 0.3) 25%, transparent 75%), /* Darker orange with higher opacity */
        linear-gradient(to bottom, rgba(255, 140, 0, 0.3) 15%, transparent 75%),
        linear-gradient(to left, rgba(255, 140, 0, 0.3) 25%, transparent 55%),
        linear-gradient(to top, rgba(255, 140, 0, 0.3) 25%, transparent 95%);
    background-blend-mode: overlay;
    background-size: 250px 350px;
    border-radius: 10px; /* Add border radius for rounded corners */
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* Add shadow for depth */
}
</style>
"""
content_style = """
<style>
.content {
    font-size: 40px; /* Larger font size for content */
    line-height: 1.6; /* Improved readability */
    width: 103%;
    padding: 10px; /* Add padding for better spacing */
    margin-bottom: 20px;
    background-color: skyblue; /* Background color for the content block */
    border-radius: 10px; /* Add border radius for rounded corners */
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* Add shadow for depth */
}
</style>
"""
small_style = """
<style>
.small {
    color: black;
    font-size: 30px; /* Larger font size for content */
    line-height: 1.6; /* Improved readability */
    width: 100%;
    padding: 10px; /* Add padding for better spacing */
    margin-bottom: 10px;
    background-color: white; /* Background color for the block */
    border-radius: 10px; /* Add border radius for rounded corners */
}
</style>
"""
def update_column_dtype(df, column_name, dtype):
    '''Cast a column to the requested dtype. Returns the updated dataframe,
    the rows that failed conversion, and a flag for email/phone columns.'''
    error_entries = pd.DataFrame()
    flag = None
    if dtype == 'System Detected':
        pass
    elif dtype == 'int64':
        try:
            df[column_name] = df[column_name].astype('int64')
        except ValueError:
            error_entries = df[~df[column_name].apply(lambda x: str(x).isdigit())]
            st.error('Unable to convert some entries to integer. Please clean the column.')
    elif dtype == 'float64/numeric':
        try:
            df[column_name] = df[column_name].astype('float64')
        except ValueError:
            error_entries = df[pd.to_numeric(df[column_name], errors='coerce').isna()]
            st.error('Unable to convert some entries to float. Please clean the column.')
    elif dtype == 'id':
        try:
            df[column_name] = df[column_name].astype('int64')
        except ValueError:
            error_entries = df[~df[column_name].apply(lambda x: str(x).isdigit())]
            st.error('Unable to convert some entries to id. Please clean the column.')
    elif dtype == 'categorical/string':
        df[column_name] = df[column_name].astype('category')
    elif dtype == 'datetime':
        # Format inference is pandas' default behaviour; the deprecated
        # infer_datetime_format argument has been dropped.
        try:
            df[column_name] = pd.to_datetime(df[column_name], errors='raise')
        except ValueError:
            error_entries = df[pd.to_datetime(df[column_name], errors='coerce').isna()]
            custom_format = st.text_input("Please provide the datetime format (e.g., %Y-%m-%d):")
            if custom_format:
                try:
                    df[column_name] = pd.to_datetime(df[column_name], errors='raise', format=custom_format)
                except ValueError:
                    error_entries = df[pd.to_datetime(df[column_name], errors='coerce', format=custom_format).isna()]
                    st.error('Unable to parse datetime with the provided format. Please clean the column.')
    elif dtype == 'email':
        df[column_name] = df[column_name].astype('category')
        flag = 'email'
    elif dtype == 'phone_number':
        df[column_name] = df[column_name].astype('category')
        flag = 'phone_number'
    return df, error_entries, flag
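# Illustrative call (hypothetical column name "age"):
#
#   df, bad_rows, flag = update_column_dtype(df, "age", "int64")
#   # bad_rows holds the entries that failed the cast; flag is only set for
#   # the 'email' and 'phone_number' datatypes so later tabs can special-case them.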
def convert_to_special_representation(value):
    '''Encode a value's character classes: digits -> N, letters -> A,
    special characters kept as-is.'''
    value = str(value)
    special_chars = set("!@#$%^&*()_+-=[]{}|;:,.<>?`~")
    result = ''
    for char in value:
        if char.isdigit():
            result += 'N'
        elif char.isalpha():
            result += 'A'
        elif char in special_chars:
            result += char
        else:
            # Handle other characters (e.g. whitespace) as-is
            result += char
    return result
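# A couple of worked examples of the pattern encoding:
#
#   convert_to_special_representation("AB-1234")  -> 'AA-NNNN'
#   convert_to_special_representation("a1@b.com") -> 'AN@A.AAA'
#
# Rows that share a pattern string can be grouped in the Pattern Exploration
# tab, and rare patterns stand out as candidate anomalies.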
######
def main():
    # st.title('PAGE TITLE') # Change this for each page
    sidebar()
    ########
    with st.container(border=True):
        st.subheader('SELECT TABLE')
        metadata = SingleTableMetadata()
        # conn = pymssql.connect("Server=sql-ext-dev-uks-001.database.windows.net;"
        #                        "Database=sqldb-ext-dev-uks-001;"
        #                        "UID=dbadmin;"
        #                        "PWD=mYpa$$w0rD")
        # NOTE: credentials are hard-coded here; in practice they belong in
        # st.secrets or environment variables.
        conn = pymssql.connect(server="sql-ext-dev-uks-001.database.windows.net",
                               database="sqldb-ext-dev-uks-001",
                               user="dbadmin",
                               password="mYpa$$w0rD")
        query1_1 = "select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' and TABLE_NAME in ('TCM', 'TCVM', 'TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM') ORDER BY TABLE_NAME ASC"
        query1_2 = "select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' and TABLE_NAME LIKE 'PROFILED%' ORDER BY TABLE_NAME ASC"
        tab_names = list(pd.read_sql_query(query1_1, con=conn)['TABLE_NAME'])
        tab_names_edited = list(pd.read_sql_query(query1_2, con=conn)['TABLE_NAME'])
        sample_selector = st.selectbox('SELECT SAMPLE SIZE', ['100', '10K', '100K', '1M', 'Full Table'], index=None, placeholder='Select sample size for the table(s)', on_change=clear_cache)
        mode_selector = st.selectbox("Select How you want to Proceed", ["Start Profiling with Source Data", "Load Previously Profiled Data For Further Processing"], on_change=clear_cache, placeholder='Show Options')
        if mode_selector == "Start Profiling with Source Data":
            table_selector = st.selectbox('SELECT TABLE NAME', tab_names, index=None, on_change=clear_cache, placeholder='Select table name')
        if mode_selector == "Load Previously Profiled Data For Further Processing":
            table_selector = st.selectbox('SELECT TABLE NAME', tab_names_edited, index=None, on_change=clear_cache, placeholder='Select table name')
    if table_selector is not None and sample_selector is not None:
        # Map the UI sample-size choice to a SQL TOP clause
        if sample_selector == '100':
            count = "top 100"
        elif sample_selector == '10K':
            count = "top 10000"
        elif sample_selector == '100K':
            count = "top 100000"
        elif sample_selector == '1M':
            count = "top 1000000"
        else:
            count = ""
        query2 = "select " + count + " * from [dbo].[" + table_selector + "]"
        df = pd.read_sql_query(query2, con=conn)
        main_list = df.columns.to_list()
        sub_list = ['ID', 'LOADID', 'FILE_NAME']
        # Drop the ETL bookkeeping columns if they appear as a contiguous block
        if any(main_list[i:i + len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
            df = df.drop(['ID', 'LOADID', 'FILE_NAME'], axis=1)
        conn.close()
        if 'data' not in st.session_state:
            st.session_state.data = df
        metadata.detect_from_dataframe(st.session_state.data)
        st.sidebar.header("DataFrame Live Preview")
        st.sidebar.markdown("*This window keeps the live status of the dataframe under processing. You can review this dataframe after all the changes.*")
        df_preview = st.sidebar.empty()
        df_preview.write(st.session_state.data)
        st.markdown(content_style, unsafe_allow_html=True)
        with st.container(border=True):
            cols = df.columns.to_list()
            primary_key = metadata.primary_key
            # Columns that are unique, non-float and fully populated are
            # candidate primary keys
            sugg_primary_keys = [col for col in cols if df[col].is_unique and df[col].dtype != 'float' and not df[col].isnull().any()]
            prob_key = sugg_primary_keys
            if primary_key in sugg_primary_keys:
                default_index = sugg_primary_keys.index(primary_key)
            else:
                # Fall back to listing the (possibly None) detected key
                sugg_primary_keys.append(primary_key)
                default_index = sugg_primary_keys.index(primary_key)
            no_y_data = []
            email_cols = []
            phone_cols = []
            # cols_select = st.multiselect('Please select column(s) for Profiling and Cleansing', cols, default=cols[:5])
            tabs3 = st.tabs(cols)
            for i, tab in enumerate(tabs3):
                with tab:
                    col = cols[i]
                    scol1, scol2 = st.columns([4, 1])
                    with scol1:
                        taba, tabb, tabc, tabd, tabe = st.tabs(["📝 DataType Validation", "🧹 Missing Value Handling", "📈 Statistical Profiling", " ✨ Pattern Exploration", "🤖 AI Assisted Data Cleansing"])
                        with taba:
                            if st.session_state.data[col].dtype.name == 'category':
                                st.session_state.data[col] = st.session_state.data[col].astype('str')
                            dtypes = ['System Detected', 'int64', 'float64/numeric', 'id', 'categorical/string', 'datetime', 'email', 'phone_number']
                            no_dtypes = ['int64', 'float64/numeric', 'id', 'categorical/string', 'datetime', 'email', 'phone_number']
                            no_dtype = False
                            if metadata.columns[col]['sdtype'] != "unknown":
                                datatype = metadata.columns[col]['sdtype']
                                st.info(f"System Identified DataType: {datatype}")
                            elif str(df[col].dtype) != 'object' and metadata.columns[col]['sdtype'] == "unknown":
                                datatype = str(df[col].dtype)
                                st.info(f"System Identified DataType: {datatype}")
                            else:
                                datatype = 'NA'
                                # st.warning("System Could Not Understand Datatype. Please Specify the Datatype")
                                no_dtype = True
                            # Map the detected datatype onto the selectbox options;
                            # fall back to 'System Detected' when there is no match
                            def_index = 0
                            if datatype in ['int64']:
                                def_index = 1
                            if datatype in ['float64', 'numerical']:
                                def_index = 2
                            if datatype in ['id']:
                                def_index = 3
                            if datatype in ['categorical', 'string']:
                                def_index = 4
                            if datatype in ['datetime']:
                                def_index = 5
                            if datatype in ['email']:
                                def_index = 6
                            if datatype in ['phone_number']:
                                def_index = 7
                            if col == primary_key:
                                st.success("This is the system-identified primary key")
                            elif col in prob_key:
                                st.warning("This is a system-suggested potential primary key")
                            if f'dtype_{col}' not in st.session_state:
                                st.session_state[f'dtype_{col}'] = 'initiate'
                            if st.session_state[f'dtype_{col}'] not in ['email', 'phone_number']:
                                st.session_state.flag = None
                            if no_dtype:
                                fin_datatype = st.selectbox(f"Please Change/Define the Datatype of column: {col}:", no_dtypes, index=3, key=f'datatype_{col}')
                            else:
                                fin_datatype = st.selectbox(f"Please Change/Define the Datatype of column: {col}:", dtypes, index=def_index, key=f'datatype_{col}')
                            st.session_state[f'dtype_{col}'] = st.session_state[f'datatype_{col}']
                            st.session_state.data, error_df, st.session_state.flag = update_column_dtype(st.session_state.data, col, fin_datatype)
                            if error_df.empty:
                                st.success("No Datatype Validation Errors For Current Datatype")
                                try:
                                    df_preview.write(st.session_state.data)
                                except Exception:
                                    st.warning("DataFrame Updated. But Could Not Load Preview")
                            else:
                                st.subheader("Prepare the Column for Conversion:")
                                # TextColumn works for string-like data; fall back to a
                                # generic Column for types it cannot handle
                                try:
                                    edited_error_df = st.data_editor(error_df, num_rows="dynamic", column_config={
                                        col: st.column_config.TextColumn(
                                            col,
                                            width="medium",
                                        )
                                    }, key=f'dtype_error_{col}')
                                except Exception:
                                    edited_error_df = st.data_editor(error_df, num_rows="dynamic", column_config={
                                        col: st.column_config.Column(
                                            col,
                                            width="medium",
                                        )
                                    }, key=f'dtype_error_{col}')
                                check = st.button("Fix Error", key=f"Fix{col}")
                                if check:
                                    st.session_state.data = st.session_state.data.drop(error_df.index)
                                    st.session_state.data = pd.concat([st.session_state.data, edited_error_df])
                                    df_preview.write(st.session_state.data)
                            if fin_datatype in ['id', 'email', 'phone_number']:
                                no_y_data.append(col)
                            if fin_datatype in ['email']:
                                email_cols.append(col)
                            if fin_datatype in ['phone_number']:
                                phone_cols.append(col)
                        # Ensure the helper validity columns are never profiled
                        no_y_data.extend(['Validity', 'Validity_phone', 'Validity_email'])
                        total_records = len(st.session_state.data)
                        with tabc:
                            if col not in no_y_data:
                                y_data_col = st.session_state.data[[col]]
                                pr = ProfileReport(y_data_col, dark_mode=True, explorative=False, config_file=r"ydata_config.yml")
                                pr.config.html.style.primary_colors = ['#e41a1c']
                                with st.container(border=True):
                                    st_profile_report(pr, navbar=False, key=f'profile{col}')
                            elif col in email_cols:
                                unique_emails = st.session_state.data[col].nunique()
                                duplicate_emails = total_records - unique_emails
                                # Extract email domains and count occurrences of each
                                email_domains = st.session_state.data[col].str.extract(r'@(.+)$')[0]
                                email_domain_counts = email_domains.value_counts()
                                # Format the top 5 email domains for display
                                top_email_domains = email_domain_counts.head(5)
                                top_email_domains_str = '\n|\n'.join([f"{domain}: {count}" for domain, count in top_email_domains.items()])
                                if f'invalid_em_{col}' in st.session_state:
                                    invalid_emails = len(st.session_state[f'invalid_em_{col}'])
                                    valid_emails = total_records - invalid_emails
                                    percent_invalid_emails = invalid_emails / total_records * 100
                                    email_message = f"""
## Email Column: {col}\n\n**Valid Emails:** {valid_emails} ({100 - percent_invalid_emails:.2f}%)\n\n---\n\n**Invalid Emails:** {invalid_emails} ({percent_invalid_emails:.2f}%)\n\n---\n\n**Unique Emails:** {unique_emails}\n\n---\n\n**Duplicate Emails:** {duplicate_emails}\n\n---\n\n**Top 5 Email Domains:** {top_email_domains_str}
"""
                                else:
                                    note = "Please execute AI Assisted Data Validation on email columns for a profiling report of them."
                                    email_message = f"""
## Email Column: {col}\n\n**Valid Emails:** {note}\n\n---\n\n**Invalid Emails:** {note}\n\n---\n\n**Unique Emails:** {unique_emails}\n\n---\n\n**Duplicate Emails:** {duplicate_emails}\n\n---\n\n**Top 5 Email Domains:** {top_email_domains_str}
"""
                                with st.container(border=True):
                                    st.markdown(str(email_message))
                                ref_em = st.button('Refresh', key=f'email{col}')
                                if ref_em:
                                    pass
                            elif col in phone_cols:
                                unique_phones = st.session_state.data[col].nunique()
                                duplicate_phones = total_records - unique_phones
                                # Count leading "+<digits>" country codes; the top-5 summary
                                # is kept as a character list with newlines/spaces stripped
                                phone_country_codes = st.session_state.data[col].str.extract(r'^\+(\d+)')[0].value_counts()
                                top_phone_country_codes = list(phone_country_codes.head(5).to_string())
                                to_remove = ['\n', ' ']
                                top_phone_country_codes = [item for item in top_phone_country_codes if item not in to_remove]
                                if f'invalid_ph_{col}' in st.session_state:
                                    invalid_phones = len(st.session_state[f'invalid_ph_{col}'])
                                    valid_phones = total_records - invalid_phones
                                    percent_invalid_phones = invalid_phones / total_records * 100
                                    phone_message = f"""
## Phone Number Column: {col}\n\n**Valid Phone Numbers:** {valid_phones} ({100 - percent_invalid_phones:.2f}%)\n\n---\n\n**Invalid Phone Numbers:** {invalid_phones} ({percent_invalid_phones:.2f}%)\n\n---\n\n**Unique Phone Numbers:** {unique_phones}\n\n---\n\n**Duplicate Phone Numbers:** {duplicate_phones}\n\n---\n\n**Top 5 Phone Country Codes:** {top_phone_country_codes}
"""
                                else:
                                    note = "Please execute AI Assisted Data Validation on phone number columns for a profiling report of them."
                                    phone_message = f"""
## Phone Number Column: {col}\n\n**Valid Phone Numbers:** {note}\n\n---\n\n**Invalid Phone Numbers:** {note}\n\n---\n\n**Unique Phone Numbers:** {unique_phones}\n\n---\n\n**Duplicate Phone Numbers:** {duplicate_phones}\n\n---\n\n**Top 5 Phone Country Codes:** {top_phone_country_codes}
"""
                                with st.container(border=True):
                                    st.markdown(str(phone_message))
                                ref_ph = st.button('Refresh', key=f'phone{col}')
                                if ref_ph:
                                    pass
                        with tabd:
                            st.session_state.data_encoded = st.session_state.data.copy()
                            st.session_state.data_encoded[f'Pattern_{col}'] = st.session_state.data_encoded[col].apply(convert_to_special_representation)
                            patterns = list(st.session_state.data_encoded[f'Pattern_{col}'].unique())
                            patt_col1, patt_col2 = st.columns([1, 4])
                            with patt_col1:
                                st.session_state.pattern_list = pd.DataFrame(patterns, columns=['Pattern Name'])
                                event = st.dataframe(
                                    st.session_state.pattern_list,
                                    key=f"pattern_list_data{col}",
                                    on_select="rerun",
                                    selection_mode=["multi-row"],
                                    hide_index=True,
                                    width=10000,
                                    height=450
                                )
                                # `filter` shadows the builtin, so the selection is
                                # kept in pattern_filter instead
                                if len(event.selection.rows) > 0:
                                    pattern_filter = list(st.session_state.pattern_list.loc[event.selection.rows]['Pattern Name'].values)
                                else:
                                    pattern_filter = None
                            if pattern_filter is not None:
                                with patt_col2:
                                    with st.container(border=True, height=450):
                                        st.write("#####")
                                        if not st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(pattern_filter)].empty:
                                            st.session_state.data_encoded[col] = st.session_state.data_encoded[col].astype('str')
                                            try:
                                                edited_pattern_df = st.data_editor(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(pattern_filter)], num_rows="dynamic", column_config={
                                                    col: st.column_config.TextColumn(
                                                        col,
                                                        width="medium",
                                                    )
                                                }, height=300, key=f'Valid_pattern_{col}')
                                            except Exception:
                                                edited_pattern_df = st.data_editor(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(pattern_filter)], num_rows="dynamic", column_config={
                                                    col: st.column_config.Column(
                                                        col,
                                                        width="medium",
                                                    )
                                                }, height=300, key=f'Valid_pattern_{col}')
                                            valid_pattern = st.button("Confirm", key=f"Fix_valid_pattern_{col}")
                                            if valid_pattern:
                                                st.session_state.data = st.session_state.data.drop(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(pattern_filter)].index)
                                                st.session_state.data = pd.concat([st.session_state.data, edited_pattern_df])
                                                st.session_state.data = st.session_state.data.drop([f'Pattern_{col}'], axis=1)
                                                st.session_state.data = st.session_state.data.sort_index()
                                                df_preview.write(st.session_state.data)
                            else:
                                with patt_col2:
                                    with stylable_container(
                                        key=f"container_select_pattern_none{col}",
                                        css_styles="""
                                        {
                                            border: 1px solid white;
                                            border-radius: 0.5rem;
                                            padding: calc(1em - 1px);
                                            width: 100%;
                                            color: orange;
                                            size: 100px;
                                        }
                                        """
                                    ):
                                        st.write('##\n\n##\n\n')
                                        st.markdown("""
                                        <style>
                                        .big-font {
                                            font-size: 15px;
                                            width: 100%;
                                            text-align: center;
                                        }
                                        </style>
                                        """, unsafe_allow_html=True)
                                        st.markdown(f'<p class="big-font">🛈 There are a total of {len(st.session_state.pattern_list)} patterns available. Please select pattern(s) to see matching records</p>', unsafe_allow_html=True)
                                        st.write('##\n\n##\n\n')
                        with tabb:
                            # Rows where the column is missing or empty
                            missing_mask = (st.session_state.data[col].isna()) | (st.session_state.data[col] == '')
                            try:
                                edited_df = st.data_editor(st.session_state.data[missing_mask], num_rows="dynamic", column_config={
                                    col: st.column_config.TextColumn(
                                        col,
                                        width="medium",
                                    )
                                }, key=f'miss_{col}')
                            except Exception:
                                edited_df = st.data_editor(st.session_state.data[missing_mask], num_rows="dynamic", column_config={
                                    col: st.column_config.Column(
                                        col,
                                        width="medium",
                                    )
                                }, key=f'miss_{col}')
                            incol1, incol2, extra = st.columns([1.1, 1.5, 8])
                            with incol1:
                                # st.write(st.session_state[f'dtype_{col}'])
                                if st.session_state[f'dtype_{col}'] not in ['int64', 'float64/numeric']:
                                    def_fill = st.text_input("Default Autofill Value", key=f"def_fill_{col}")
                                autofill = st.button("Autofill", key=f"autofill_{col}")
                                if autofill:
                                    if st.session_state[f'dtype_{col}'] not in ['int64', 'float64/numeric']:
                                        st.session_state.data[col] = st.session_state.data[col].astype('str').replace('', pd.NA).replace({None: pd.NA}).fillna(def_fill)
                                    else:
                                        # Numeric columns are forward-filled instead of
                                        # taking a typed default value
                                        st.session_state.data[col] = st.session_state.data[col].replace({None: pd.NA}).ffill()
                                    st.success("Column Autofilled. Please Review the Sidebar for updated status of the Dataframe.")
                                    df_preview.write(st.session_state.data)
                            with incol2:
                                confirm = st.button("Confirm", key=f"Confirm_{col}")
                                if confirm:
                                    st.session_state.data[col] = st.session_state.data[col].replace('', np.nan).replace({None: np.nan})
                                    st.session_state.data = st.session_state.data.dropna(subset=[col])
                                    st.session_state.data.update(edited_df)
                                    st.session_state.data = pd.concat([st.session_state.data, edited_df[~edited_df.index.isin(st.session_state.data.index)]])
                                    st.session_state.data = st.session_state.data.sort_index()
                                    st.success("State Saved. Please Review the Sidebar for updated status of the Dataframe.")
                                    df_preview.write(st.session_state.data)
                        with tabe:
                            if "overall_invalid_df" not in st.session_state:
                                st.session_state.overall_invalid_df = pd.DataFrame()
                            if (st.session_state[f'dtype_{col}'] not in ['email', 'phone_number'] and st.session_state.flag not in ['email', 'phone_number']):
                                st.dataframe(st.session_state.data)
                                AI_check = st.button("Check For Anomalies", key=f'AI_CHECK_{col}')
                                if AI_check:
                                    with st.spinner("Running Anomaly Detection AI"):
                                        # my_bar = st.progress(0, text="Progress")
                                        if st.session_state[f'dtype_{col}'] in ['categorical/string']:
                                            # Temporarily fill missing values with a sentinel
                                            # so they can be vectorized
                                            if 'missing@123' not in st.session_state.data[col].cat.categories:
                                                st.session_state.data[col] = st.session_state.data[col].cat.add_categories(['missing@123'])
                                            st.session_state.data[col] = st.session_state.data[col].fillna('missing@123').astype(str)
                                        # Encode each value as a character-class pattern, then
                                        # vectorize the patterns by character counts
                                        st.session_state.data_encoded = st.session_state.data[col].apply(convert_to_special_representation)
                                        mixed_transformer = Pipeline(steps=[
                                            ('vectorizer', CountVectorizer(analyzer='char', lowercase=False))
                                        ])
                                        df_transformed = mixed_transformer.fit_transform(st.session_state.data_encoded)
                                        input_dim = df_transformed.shape[1]
                                        encoding_dim = (input_dim // 2) + 1
                                        # Convolutional bidirectional-LSTM autoencoder; rows the
                                        # model reconstructs poorly are treated as anomalies
                                        input_layer = Input(shape=(None, input_dim))
                                        conv1d_layer = Conv1D(64, 3, activation='relu', padding='same')(input_layer)
                                        maxpooling_layer = MaxPooling1D(pool_size=2, padding='same')(conv1d_layer)
                                        encoder_lstm = Bidirectional(LSTM(encoding_dim, activation='relu', return_sequences=False))(maxpooling_layer)
                                        repeat_vector = RepeatVector(input_dim)(encoder_lstm)
                                        decoder_lstm = Bidirectional(LSTM(encoding_dim, activation='relu', return_sequences=True))(repeat_vector)
                                        conv1d_layer_decoder = Conv1D(64, 3, activation='relu', padding='same')(decoder_lstm)
                                        upsampling_layer = Conv1D(input_dim, 2, activation='relu', padding='same')(conv1d_layer_decoder)
                                        autoencoder = Model(inputs=input_layer, outputs=upsampling_layer)
                                        autoencoder.compile(optimizer=Adam(), loss=MeanSquaredError())
                                        # my_bar.progress(40, text='Progress')
                                        autoencoder.fit(np.expand_dims(df_transformed.toarray(), axis=1), np.expand_dims(df_transformed.toarray(), axis=1),
                                                        epochs=100, batch_size=2, shuffle=True, validation_split=0.2, verbose=1)
                                        reconstructions = autoencoder.predict(np.expand_dims(df_transformed.toarray(), axis=1))
                                        reconstruction_error = np.mean(np.abs(reconstructions - np.expand_dims(df_transformed.toarray(), axis=1)), axis=(1, 2))
                                        # Flag the worst-reconstructed ~5% of rows; adjust the
                                        # percentile for the desired sensitivity
                                        threshold = np.percentile(reconstruction_error, 95)
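                                        # Sketch of the cutoff: if reconstruction_error were
                                        # [0.01, 0.02, 0.02, 0.90], the 95th percentile lands
                                        # just below 0.90, so only that row is flagged below.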
                                        # my_bar.progress(90, text='Progress')
                                        st.session_state.data['Validity'] = ['Invalid' if error > threshold else 'Valid' for error in reconstruction_error]
                                        st.session_state.data[col] = st.session_state.data[col].replace('missing@123', '')
                                        st.session_state[f"invalid_ai_data_{col}"] = st.session_state.data[st.session_state.data['Validity'] == 'Invalid']
                                        # my_bar.progress(100, text='Progress')
                                if f"invalid_ai_data_{col}" in st.session_state:
                                    st.session_state[f"invalid_ai_data_{col}"]["Invalid Field"] = col
                                    if 'Validity' in st.session_state[f"invalid_ai_data_{col}"].columns:
                                        st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f"invalid_ai_data_{col}"].drop(['Validity'], axis=1)], ignore_index=True)
                                    else:
                                        st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f"invalid_ai_data_{col}"]], ignore_index=True)
                                    try:
                                        edited_valid_df = st.data_editor(st.session_state[f"invalid_ai_data_{col}"], num_rows="dynamic", column_config={
                                            col: st.column_config.TextColumn(
                                                col,
                                                width="medium",
                                            )
                                        }, key=f'Valid_{col}')
                                    except Exception:
                                        edited_valid_df = st.data_editor(st.session_state[f"invalid_ai_data_{col}"], num_rows="dynamic", column_config={
                                            col: st.column_config.Column(
                                                col,
                                                width="medium",
                                            )
                                        }, key=f'Valid_{col}')
                                    valid = st.button("Confirm", key=f"Fix_valid_{col}")
                                    # my_bar.empty()
                                    if valid:
                                        st.session_state.data = st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity'] == 'Invalid'].index)
                                        st.session_state.data = pd.concat([st.session_state.data, edited_valid_df])
                                        st.session_state.data = st.session_state.data.sort_index()
                                        df_preview.write(st.session_state.data)
                            elif (st.session_state[f'dtype_{col}'] in ['phone_number'] or st.session_state.flag in ['phone_number']):
                                # st.dataframe(st.session_state.data)
                                # Accept digits, spaces, dashes, parentheses and an optional leading '+'
                                phone_regex = r'^\+?[0-9\s\-\(\)]+$'
                                # st.write(phone_regex)
                                st.session_state.data['Validity_phone'] = st.session_state.data[col].apply(lambda xy: 'phone_is_valid' if re.match(phone_regex, str(xy)) else 'phone_is_invalid')
                                st.session_state[f'invalid_phone_{col}'] = st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].drop(['Validity_phone'], axis=1)
                                if f'invalid_phone_{col}_check' not in st.session_state:
                                    st.session_state[f'invalid_phone_{col}']["Invalid Field"] = col
                                    st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f'invalid_phone_{col}']], ignore_index=True, axis=0)
                                    st.session_state[f'invalid_phone_{col}_check'] = 'yes'
                                try:
                                    edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'], column_config={
                                        col: st.column_config.TextColumn(
                                            col,
                                            width="medium",
                                        )
                                    }, num_rows="dynamic", key=f'Valid_phone_{col}')
                                except Exception:
                                    edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'], column_config={
                                        col: st.column_config.Column(
                                            col,
                                            width="medium",
                                        )
                                    }, num_rows="dynamic", key=f'Valid_phone_{col}')
                                valid_phone = st.button("Confirm", key=f"Fix_valid_phone_{col}")
                                if valid_phone:
                                    st.session_state.data = st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].index)
                                    st.session_state.data = pd.concat([st.session_state.data, edited_valid_df])
                                    st.session_state[f'invalid_ph_{col}'] = st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].drop(['Validity_phone'], axis=1)
                                    st.session_state.data = st.session_state.data.drop(['Validity_phone'], axis=1)
                                    df_preview.write(st.session_state.data)
                            elif (st.session_state[f'dtype_{col}'] in ['email'] or st.session_state.flag in ['email']):
                                email_regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
                                # str() guards against NaN/None values reaching re.match
                                st.session_state.data['Validity_email'] = st.session_state.data[col].apply(lambda x: 'email_is_valid' if re.match(email_regex, str(x)) else 'email_is_invalid')
                                if st.session_state.data[col].dtype.name == 'category':
                                    st.session_state.data[col] = st.session_state.data[col].astype('str')
                                st.session_state[f'invalid_email_{col}'] = st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].drop(['Validity_email'], axis=1)
                                if f'invalid_email_{col}_check' not in st.session_state:
                                    st.session_state[f'invalid_email_{col}']["Invalid Field"] = col
                                    st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f'invalid_email_{col}']], ignore_index=True, axis=0)
                                    st.session_state[f'invalid_email_{col}_check'] = 'yes'
                                try:
                                    edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'], num_rows="dynamic", column_config={
                                        col: st.column_config.TextColumn(
                                            col,
                                            width="medium",
                                        )
                                    }, key=f'Valid_email_{col}')
                                except Exception:
                                    edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'], num_rows="dynamic", column_config={
                                        col: st.column_config.Column(
                                            col,
                                            width="medium",
                                        )
                                    }, key=f'Valid_email_{col}')
                                valid_email = st.button("Confirm", key=f"Fix_valid_email_{col}")
                                if valid_email:
                                    st.session_state.data = st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].index)
                                    st.session_state.data = pd.concat([st.session_state.data, edited_valid_df])
                                    st.session_state[f'invalid_em_{col}'] = st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].drop(['Validity_email'], axis=1)
                                    st.session_state.data = st.session_state.data.drop(['Validity_email'], axis=1)
                                    df_preview.write(st.session_state.data)
                    with scol2:
                        st.markdown("**Column Being Processed**")
                        col_view = st.empty()
                        try:
                            col_view.write(st.session_state.data[col])
                        except Exception:
                            st.warning("DataFrame Updated. But Could Not Load Preview")
            pkcol1, pkcol2 = st.columns(2)
            with pkcol1:
                if primary_key is not None:
                    st.info(f"Primary Key Identified by AI: {primary_key}")
                else:
                    st.warning("Could Not Finalize the Primary Key Automatically. Please go through the suggestions and Finalize one.")
            with pkcol2:
                st.selectbox("Please Finalize the Primary Key:", sugg_primary_keys, index=default_index)
        with st.expander("Save and Download Data"):
            name_data = st.text_input("Please Specify Name of the saved/downloaded data")
            csv = st.session_state.data.to_csv(index=False).encode('utf-8')
            # Strip the helper validity columns before exporting the anomaly report
            for helper_col in ['Validity', 'Validity_email', 'Validity_phone']:
                if helper_col in st.session_state.overall_invalid_df:
                    st.session_state.overall_invalid_df = st.session_state.overall_invalid_df.drop([helper_col], axis=1)
            csv2 = st.session_state.overall_invalid_df.to_csv(index=False).encode('utf-8')
            # st.write(st.session_state.overall_invalid_df)
            # Create the download buttons
            dldcol1, dldcol2 = st.columns([1, 4])
            with dldcol1:
                st.download_button(
                    label="Download Cleaned Data as CSV",
                    data=csv,
                    file_name=f'{name_data}.csv',
                    mime='text/csv',
                )
            with dldcol2:
                st.download_button(
                    label="Download Anomalous Data as CSV",
                    data=csv2,
                    file_name=f'Anomaly_{name_data}.csv',
                    mime='text/csv',
                )
            save = st.button("Save Data For Further Processing")
            if save:
                # NOTE: pyodbc usually also needs a DRIVER=... entry here, e.g.
                # 'DRIVER={ODBC Driver 17 for SQL Server};' (the exact driver
                # name depends on the environment).
                connection_string = ('SERVER=sql-ext-dev-uks-001.database.windows.net;'
                                     'DATABASE=sqldb-ext-dev-uks-001;'
                                     'UID=dbadmin;'
                                     'PWD=mYpa$$w0rD')
                st.session_state.data = st.session_state.data.astype(str)
                load_dataframe_to_sqlserver(st.session_state.data, f'[dbo].[PROFILED_{name_data}]', connection_string)
######
if __name__ == '__main__':
    main()