Spaces:
Build error
Build error
| from logging import PlaceHolder | |
| import pandas as pd | |
| from fuzzywuzzy import fuzz | |
| import numpy as np | |
| import streamlit as st | |
| import pyodbc | |
| from streamlit_extras.stateful_button import button | |
| import pymssql | |
| ############ | |
| from streamlit_app import sidebar | |
# Must run before any other Streamlit command: sets the browser-tab title and
# switches the page to full-width layout.
st.set_page_config(page_title='DUPLICATE RECORDS DETECTION', layout= 'wide')
| ###### | |
def _highlight_y(val):
    """Return a CSS rule: light-coral background for 'Y' cells, white otherwise."""
    color = 'lightcoral' if val == 'Y' else 'white'
    return f'background-color: {color}'


def _suggest_columns(df, main_col):
    """Suggest columns for duplicate detection and split off the remainder.

    Keeps columns with <20% missing values whose uniqueness ratio falls in the
    25-75% band (mid-cardinality columns discriminate duplicates best).  When
    several context columns are selected, a concatenated ``main_column`` is
    added to *df* (mutated in place) and placed first; otherwise the single
    context column is placed first.

    Args:
        df: source frame; mutated in place (NaN -> '', possibly new column).
        main_col: non-empty list of user-selected context column names.

    Returns:
        (col, rem_col): suggested columns, and the remaining columns the user
        may optionally add.
    """
    missing_pct = df.isnull().mean() * 100
    candidates = missing_pct[missing_pct < 20].index.tolist()
    unique_pct = df[candidates].apply(lambda s: len(s.unique()) / len(s) * 100)
    unique_pct.sort_values(ascending=False, inplace=True)
    col = unique_pct[(unique_pct >= 25) & (unique_pct <= 75)].index.tolist()
    df.replace(np.nan, '', inplace=True)
    if len(main_col) > 1:
        # The context columns are folded into one concatenated key, so drop
        # them from the individual suggestions to avoid double-counting.
        if set(col) & set(main_col):
            col = list(set(col) - set(main_col))
        df['main_column'] = ''
        for name in main_col:
            df[name] = df[name].astype(str)
            df['main_column'] = df['main_column'] + '_' + df[name]
        col.insert(0, 'main_column')
    else:
        if main_col[0] in col:
            col.remove(main_col[0])
        col.insert(0, main_col[0])
    rem_col = list(set(df.columns.values.tolist()) - set(col))
    return col, rem_col


def _flag_duplicate_clusters(df, col, threshold=80):
    """Cluster near-identical rows column-by-column with fuzzy matching.

    For each column in *col* (left to right) the frame is sorted on the
    columns processed so far, adjacent values are compared with
    ``fuzz.ratio``, and a row whose similarity to its predecessor exceeds
    *threshold* inherits the predecessor's group id; otherwise a new group id
    is issued (numbering restarts whenever the parent column's group
    changes).  Per-column group ids are joined into
    ``Potential_Duplicate_Cluster`` and rows sharing a cluster are flagged
    'Y' in ``DUPLICATES?``.

    Args:
        df: frame whose columns in *col* are coerced to upper-case strings.
        col: ordered list of column names to cluster on.
        threshold: similarity cut-off (0-100); default 80 as in the original.

    Returns:
        The subset of *df* flagged as potential duplicates.
    """
    if df.empty:
        # Nothing to cluster; return an empty frame with the output columns.
        df['Potential_Duplicate_Cluster'] = ''
        df['DUPLICATES?'] = ''
        return df
    grp_col = [c + '_based_group' for c in col]
    sort_col = []
    for j, name in enumerate(col):
        df[name] = df[name].astype(str).str.upper()
        sort_col.append(name)
        df.sort_values(sort_col, inplace=True)
        df = df.reset_index(drop=True)

        fuzz_col = name + '_fuzzy_ratio'
        df.at[0, fuzz_col] = 100  # first row has no predecessor to compare with
        # BUG FIX: the original looped only to len(df)-1 and forced the last
        # row's ratio to 100, which always glued the final record to its
        # neighbour's cluster regardless of actual similarity.
        for i in range(1, len(df)):
            df.at[i, fuzz_col] = fuzz.ratio(df[name].iloc[i - 1], df[name].iloc[i])
        df[fuzz_col] = pd.to_numeric(df[fuzz_col], errors='coerce')

        fuzz_group = name + '_based_group'
        group_counter = 1
        df.at[0, fuzz_group] = group_counter
        parent = col[j - 1] + '_based_group' if j >= 1 else None
        parent_group = df.at[0, parent] if parent is not None else None
        for i in range(1, len(df)):
            if df.at[i, fuzz_col] > threshold:
                # Similar enough to the previous row: same group.
                df.at[i, fuzz_group] = df.at[i - 1, fuzz_group]
            else:
                if parent is not None:
                    # BUG FIX: the original compared the parent column's
                    # *_fuzzy_ratio* value against a stored group id; compare
                    # group ids so counters restart inside each parent group.
                    if df.at[i, parent] != parent_group:
                        group_counter = 1
                        parent_group = df.at[i, parent]
                    else:
                        group_counter += 1
                else:
                    group_counter += 1
                df.at[i, fuzz_group] = group_counter
    df['Potential_Duplicate_Cluster'] = (
        df[grp_col].astype(int).astype(str).agg('_'.join, axis=1)
    )
    df['DUPLICATES?'] = df.duplicated(
        subset='Potential_Duplicate_Cluster', keep=False
    ).map({True: 'Y', False: 'N'})
    return df[df['DUPLICATES?'] == 'Y']


def main():
    """Render the duplicate-record detection page.

    Lets the user pick a table from a fixed whitelist, suggests columns for
    fuzzy duplicate detection, clusters similar rows and shows the flagged
    records in an editable grid.
    """
    sidebar()
    st.title('Detect Duplicate Records')
    st.subheader('SELECT TABLE')
    # NOTE(security): credentials are hard-coded; move them to st.secrets or
    # environment variables before this leaves a dev environment.
    conn = pymssql.connect(server="sql-ext-dev-uks-001.database.windows.net",
                           database="sqldb-ext-dev-uks-001",
                           user="dbadmin",
                           password="mYpa$$w0rD")
    query1 = "select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' ORDER BY TABLE_NAME ASC"
    table1 = pd.read_sql_query(query1, con=conn)
    table1['TABLE_NAME'] = table1['TABLE_NAME'].astype('str')
    table_selector = st.selectbox(
        'SOURCE TABLE NAME',
        ['TCM', 'TCVM', 'TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM'],
        index=None,
        placeholder='Select table for automated column mapping')
    btn11 = button('RUN', key='run11')
    if table_selector is not None and btn11:
        st.markdown('---')
        # Concatenation is safe here only because table_selector comes from
        # the fixed whitelist above — never build SQL from free-form input.
        query2 = "select * from [dbo].[" + table_selector + "]"
        df = pd.read_sql_query(query2, con=conn)
        conn.close()
        st.subheader('Data Preview')
        # Drop audit/bookkeeping columns that would defeat duplicate detection.
        if set(['ID', 'LOADID', 'FILE_NAME']).issubset(df.columns):
            df = df.drop(['ID', 'LOADID', 'FILE_NAME'], axis=1)
        df = df.replace(r'^\s*$', np.nan, regex=True)  # blank strings -> NaN
        if 'SORTL' in df.columns.values.tolist():
            df.drop('SORTL', axis=1, inplace=True)
        main_col = st.multiselect(
            'PLEASE PROVIDE CONTEXT FOR DEDUPLICATION',
            df.columns.values.tolist(),
            placeholder='Select entity for deduplication')
        if main_col:
            if len(main_col) > 1:
                st.markdown('---')
                st.write('Note: Main_column comprises of concatenated data of above selected context columns')
            col, rem_col = _suggest_columns(df, main_col)
            st.write('COLUMNS SUGGESTED BY AI FOR DETERMINING DUPLICATES:\n',
                     pd.DataFrame(col, columns=['Column Name']))
            more_col = st.multiselect(
                'DO YOU WANT TO INCLUDE ANY MORE COLUMN(s)', rem_col,
                placeholder='Select optional columns to check for potential duplicates')
            if button('CHECK DUPLICATES', key='btn12'):
                if more_col:
                    col = col + more_col
                result = _flag_duplicate_clusters(df, col)
                # Every surviving row is 'Y' by construction, so the whole
                # column is tinted; the styler keeps the output self-describing.
                styled_df = result.style.applymap(_highlight_y, subset=['DUPLICATES?'])
                st.markdown('---')
                st.subheader('Results')
                st.data_editor(styled_df, num_rows='dynamic')
| ###### | |
# Streamlit re-executes this module as a script on every rerun.
if __name__ == '__main__':
    main()