Spaces:
Build error
Build error
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import networkx as nx | |
| import numpy as np | |
| import streamlit as st | |
| import sdv | |
| from sdv.datasets.local import load_csvs | |
| from sdv.metadata import MultiTableMetadata | |
| from sdv.multi_table import HMASynthesizer | |
| import time | |
| import os | |
| import gc | |
| import warnings | |
| from PIL import Image | |
| from sdv.metadata import SingleTableMetadata | |
| import pyodbc | |
| import google.generativeai as genai | |
| from google.generativeai.types import HarmCategory, HarmBlockThreshold | |
| import textwrap | |
| from streamlit_extras.stylable_container import stylable_container | |
| from streamlit_extras.stateful_button import button | |
| import json | |
| from io import BytesIO | |
| import pymssql | |
| genai.configure(api_key='AIzaSyCeY8jSHKW6t0OSDRjc2VAfBvMunVrff2w') | |
| genai_mod = genai.GenerativeModel( | |
| model_name='models/gemini-pro' | |
| ) | |
| st.set_page_config(page_title='DATA DISCOVERY', layout= 'wide') | |
| st.markdown(""" | |
| <style> | |
| /* Remove blank space at top and bottom */ | |
| .block-container { | |
| padding-top: 2rem; | |
| } | |
| /* Remove blank space at the center canvas */ | |
| .st-emotion-cache-z5fcl4 { | |
| position: relative; | |
| top: -62px; | |
| } | |
| /* Make the toolbar transparent and the content below it clickable */ | |
| .st-emotion-cache-18ni7ap { | |
| pointer-events: none; | |
| background: rgb(255 255 255 / 0%) | |
| } | |
| .st-emotion-cache-zq5wmm { | |
| pointer-events: auto; | |
| background: rgb(255 255 255); | |
| border-radius: 5px; | |
| } | |
| </style> | |
| """, unsafe_allow_html=True) | |
| def clear_cache(): | |
| if 'rdf' in st.session_state: | |
| st.session_state.pop('rdf') | |
| def create_er_diagram(df): | |
| G = nx.DiGraph() # Directed graph | |
| # Dictionary to hold table columns | |
| table_columns = {} | |
| # Add nodes and edges to the graph | |
| for _, row in df.iterrows(): | |
| parent_table = row['PARENT TABLE'] | |
| child_table = row['CHILD TABLE'] | |
| parent_pk = row['PARENT TABLE RELATIONSHIP COLUMN'] | |
| child_fk = row['CHILD TABLE RELATIONSHIP COLUMN'] | |
| cardinality = row.get('CARDINALITY', '1:N') | |
| # Add columns to tables | |
| if parent_table not in table_columns: | |
| table_columns[parent_table] = [] | |
| table_columns[parent_table].append(parent_pk) | |
| if child_table not in table_columns: | |
| table_columns[child_table] = [] | |
| table_columns[child_table].append(child_fk) | |
| # Add nodes and edges | |
| G.add_node(parent_table) | |
| G.add_node(child_table) | |
| G.add_edge(parent_table, child_table, label=f'{parent_pk} -> {child_fk}\n{cardinality}') | |
| return G, table_columns | |
| def draw_er_diagram(G, table_columns): | |
| pos = nx.spring_layout(G, k=1.5, iterations=50) # Use a layout that spreads out nodes | |
| plt.figure(figsize=(8, 8)) | |
| nx.draw(G, pos, with_labels=False, node_size=2500, node_color='lightblue', edge_color='gray', font_size=8, font_weight='bold', arrows=True) | |
| # Draw node labels (table names in bold) | |
| for node, (x, y) in pos.items(): | |
| plt.text(x, y + 0.13, node, fontsize=7, fontweight='bold', ha='center', va='center') | |
| # Draw column names | |
| for node, columns in table_columns.items(): | |
| x, y = pos[node] | |
| column_text = '\n'.join(columns) | |
| plt.text(x, y, column_text, fontsize=6, ha='center', va='center') | |
| # Draw edge labels | |
| edge_labels = nx.get_edge_attributes(G, 'label') | |
| nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6) | |
| st.subheader("Schematic Representation") | |
| with st.container(border=True, height= 350): | |
| st.pyplot(plt) | |
| img_bytes = BytesIO() | |
| plt.savefig(img_bytes, format='png') | |
| img_bytes.seek(0) | |
| return img_bytes | |
| def cardinality(parent_df, child_df, parent_column, child_column): | |
| # Check uniqueness of parent primary key | |
| is_parent_unique = parent_df[parent_column].is_unique | |
| # Check uniqueness of child foreign key | |
| is_child_unique = child_df[child_column].is_unique | |
| # Determine cardinality | |
| if is_parent_unique and is_child_unique: | |
| return '1:1' | |
| elif is_parent_unique and not is_child_unique: | |
| return '1:N' | |
| elif not is_parent_unique and is_child_unique: | |
| return 'N:1' | |
| else: | |
| return 'N:N' | |
| #st.title('AUTOMATED DATA CATALOGUE') | |
| st.subheader('SELECT SOURCE') | |
| selectcol11, selectcol12 = st.columns(2) | |
| with selectcol11: | |
| select1=st.selectbox('SOURCE DB NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name', on_change=clear_cache) | |
| with selectcol12: | |
| select2=st.selectbox('SOURCE SCHEMA NAME',('DBO','CLIENT'),key='SCHname',index=None,placeholder='Select schema name', on_change=clear_cache) | |
| if select1 =='DB_10001' and select2 is not None: | |
| with st.spinner("Loading Tables:"): | |
| conn1 = pymssql.connect("Server=sql-ext-dev-uks-001.database.windows.net;" | |
| "Database=sqldb-ext-dev-uks-001;" | |
| "UID=dbadmin;" | |
| "PWD=mYpa$$w0rD" ) | |
| query0_1=f"select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='{select2}' ORDER BY TABLE_NAME ASC" | |
| st.session_state.tab_names_init=list(pd.read_sql_query(query0_1,con=conn1)['TABLE_NAME']) | |
| table_selector=st.multiselect('SOURCE TABLE NAME',st.session_state.tab_names_init,default=None,placeholder='Select table(s) for automated data cataloging', on_change= clear_cache) | |
| sample_selector=st.selectbox('SELECT SAMPLE SIZE',['100','10K','100K','1M','Full Table'],index=None,placeholder='Select sample size for the table(s)', on_change= clear_cache) | |
| discover= button("Discover", key='discover') | |
| if discover: | |
| if sample_selector=='100': | |
| count="top 100" | |
| elif sample_selector=='10K': | |
| count="top 10000" | |
| elif sample_selector=='100K': | |
| count="top 100000" | |
| elif sample_selector=='1M': | |
| count="top 1000000" | |
| else: | |
| count="" | |
| query1_1=f"select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='{select2}' and TABLE_NAME in ("+(', '.join(f"'{table}'" for table in table_selector))+") ORDER BY TABLE_NAME ASC" | |
| st.session_state.tab_names=list(pd.read_sql_query(query1_1,con=conn1)['TABLE_NAME']) | |
| st.session_state.dataframes = {} | |
| st.session_state.col_names = [] | |
| for tab in st.session_state.tab_names: | |
| query2_2= "select "+count+" * from ["+select2+"].["+tab+"]" | |
| st.session_state.dataframes[f'{tab}'] = pd.read_sql_query(query2_2,con=conn1) | |
| st.session_state.col_names = st.session_state.col_names + list(st.session_state.dataframes[f'{tab}'].columns) | |
| #st.session_state.data_load = "Yes" | |
| tab_names = st.session_state.tab_names | |
| dataframes = st.session_state.dataframes | |
| col_names = st.session_state.col_names | |
| metadata = MultiTableMetadata() | |
| metadata.detect_from_dataframes( | |
| data= st.session_state.dataframes | |
| ) | |
| multi_python_dict = metadata.to_dict() | |
| st.markdown(f"System has ingested :orange[**{str(len(tab_names))} tables**] from the source. Please proceed with the discovery.") | |
| #st.subheader("DATA CATALOGUE") | |
| tab1, tab2= st.tabs(["Explain Tables", "Show Relationships"]) | |
| def view_callback(): | |
| st.session_state.tdet = False | |
| with tab1: | |
| #st.write(python_dict) | |
| st.session_state.table_list= pd.DataFrame(tab_names,columns=['TABLE NAME']) | |
| containter_length = (len(st.session_state.table_list) + 1)*35 | |
| tab_names_shown= list(st.session_state.table_list['TABLE NAME'].values) | |
| tabs2= st.tabs(tab_names_shown) | |
| for i, tab in enumerate(tabs2): | |
| with tab: | |
| with st.container(height= 400, border=True): | |
| cole1,cole2=st.columns([1,1.5]) | |
| with cole1: | |
| conn = pymssql.connect("Driver={ODBC Driver 17 for SQL Server};" | |
| "Server=sql-ext-dev-uks-001.database.windows.net;" | |
| "Database=sqldb-ext-dev-uks-001;" | |
| "UID=dbadmin;" | |
| "PWD=mYpa$$w0rD" ) | |
| table_selector= tab_names_shown[i] | |
| if table_selector is not None: | |
| query2="select "+count+" * from [dbo].["+table_selector+"]" | |
| #df = pd.read_sql_query(query2,con=conn) | |
| df = st.session_state.dataframes[table_selector] | |
| selected_df = pd.DataFrame() | |
| for col in df.columns: | |
| # Filter non-null and non-blank values | |
| non_null_values = df[col][df[col] != ''].dropna().astype(str).str.strip() | |
| # Select up to 10 values (or fewer if less than 10 non-null values) | |
| selected_values = list(non_null_values[:10]) | |
| selected_values = selected_values + [""] * (10 - len(selected_values)) | |
| # Add selected values to the new dataframe | |
| selected_df[col] = selected_values | |
| #st.dataframe(selected_df) | |
| null_columns = [col for col in selected_df.columns if selected_df.apply(lambda x: x == '')[col].nunique() > 1] | |
| null_mes= "**The Following columns have very few records(less than 10). You might exclude them (if they are redundant) for better table discovery:** \n\n" | |
| for col in null_columns[:-1]: | |
| null_mes += f":orange[**{col}**]" + ', ' | |
| for collast in null_columns[-1:]: | |
| if len(null_columns)> 1: | |
| null_mes += '**and** ' + f":orange[**{collast}**]" | |
| else: | |
| null_mes += f":orange[**{collast}**]" | |
| if len(null_columns) != 0: | |
| with st.expander("🛈 Potential redundant Columns Found in Terms of Data Completeness:", expanded= True): | |
| st.markdown(null_mes) | |
| inf_filter= st.multiselect('Select Incomplete and Insignificant Columns to exclude:', list(null_columns)) | |
| run = st.button('Check', key= f"{tab_names_shown[i]}") | |
| else: | |
| st.success("No redundant Columns Found in Terms of Data Completeness") | |
| inf_filter= None | |
| run = False | |
| if inf_filter is not None: | |
| df.drop(columns=inf_filter, inplace=True) | |
| selected_df.drop(columns=inf_filter, inplace=True) | |
| if run or len(null_columns) == 0: | |
| main_list=df.columns.to_list() | |
| sub_list=['ID','LOADID','FILE_NAME'] | |
| if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)): | |
| df=df.drop(['ID','LOADID','FILE_NAME'],axis=1) | |
| conn.close() | |
| sin_metadata = SingleTableMetadata() | |
| sin_metadata.detect_from_dataframe(df) | |
| python_dict = sin_metadata.to_dict() | |
| if f'cont_{table_selector}' not in st.session_state: | |
| with st.spinner("Processing Table"): | |
| # Create a GenerativeModel instance | |
| genai_mod = genai.GenerativeModel( | |
| model_name='models/gemini-pro' | |
| ) | |
| if 'primary_key' in python_dict: | |
| primary_key = python_dict['primary_key'] | |
| else: | |
| primary_key = "Could Not be Identified" | |
| story = f""" Details of the table: | |
| table columns: {str(list(df.columns))} | |
| column datatypes: {str(df.dtypes.to_string())} | |
| table sample data: {selected_df.head(10).to_string()} | |
| """ | |
| response = genai_mod.generate_content(textwrap.dedent(""" | |
| You are a Data Migration expert. You can analyze and understand any table/data/ Please return a narration about the data. The narration should Include primary key name(if any) and a intellectual guess about the table schema. The data can be any kind of generic data. you have to guess the object name/class name/schema name etc. of that data. Don't add unnecessary details. Strictly stick to the informations provided only. | |
| Important: Please consider All fields are mandetorily during your analysis. Explain all fields precisely without unnecessary and irrelevant information. NO NEED TO PROVIDE THE SAMPLE DATA AGAIN. | |
| Here is the table details: | |
| """) + story + f"The Primary Key is:{primary_key}" , | |
| safety_settings={ | |
| HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, | |
| HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, | |
| HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, | |
| HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, | |
| }) | |
| st.session_state[f'cont_{table_selector}'] = response.text | |
| st.markdown(st.session_state[f'cont_{table_selector}']) | |
| with cole2: | |
| st.markdown("**DATA PREVIEW**") | |
| st.dataframe(df, use_container_width= True) | |
| with tab2: | |
| metadata1 = MultiTableMetadata() | |
| metadata1.detect_from_dataframes( | |
| data= st.session_state.dataframes | |
| ) | |
| multi_python_dict1 = metadata1.to_dict() | |
| rlist1=multi_python_dict1['relationships'] | |
| rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT TABLE RELATIONSHIP COLUMN','CHILD TABLE RELATIONSHIP COLUMN','CARDINALITY']) | |
| for i in range(len(rlist1)): | |
| rlist=rlist1[i] | |
| nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT TABLE RELATIONSHIP COLUMN':rlist['parent_primary_key'],'CHILD TABLE RELATIONSHIP COLUMN':rlist['child_foreign_key']},index=[i]) | |
| rdf=pd.concat([rdf,nrow],ignore_index=True) | |
| rdf['CARDINALITY'] = rdf.apply( | |
| lambda row: cardinality( | |
| st.session_state.dataframes[str(row['PARENT TABLE'])], | |
| st.session_state.dataframes[str(row['CHILD TABLE'])], | |
| str(row['PARENT TABLE RELATIONSHIP COLUMN']), | |
| str(row['CHILD TABLE RELATIONSHIP COLUMN'])),axis=1) | |
| if 'rdf' not in st.session_state: | |
| st.session_state.rdf = rdf | |
| edited_map_df = st.data_editor( | |
| st.session_state.rdf, | |
| column_config={ | |
| "PARENT TABLE": st.column_config.SelectboxColumn( | |
| "Available Parent Table", | |
| width="medium", | |
| options=tab_names, | |
| required=True, | |
| ), | |
| "CHILD TABLE": st.column_config.SelectboxColumn( | |
| "Available Child Table", | |
| width="medium", | |
| options=tab_names, | |
| required=True, | |
| ), | |
| "PARENT TABLE RELATIONSHIP COLUMN": st.column_config.SelectboxColumn( | |
| "Available Parent Table Relationship Column", | |
| width="medium", | |
| options=col_names, | |
| required=True, | |
| ), | |
| "CHILD TABLE RELATIONSHIP COLUMN": st.column_config.SelectboxColumn( | |
| "Available Child Table Relationship Column", | |
| width="medium", | |
| options=col_names, | |
| required=True, | |
| ), | |
| "CARDINALITY": st.column_config.SelectboxColumn( | |
| "Cardinality", | |
| width="medium", | |
| options=['1:1','1:N','N:1','N:N'], | |
| required=True, | |
| ) | |
| }, | |
| hide_index=True, | |
| num_rows = 'dynamic', | |
| use_container_width = True | |
| ) | |
| for i,row in edited_map_df.iterrows(): | |
| pcolchecklist = st.session_state.dataframes[str(row['PARENT TABLE'])].columns | |
| ccolchecklist = st.session_state.dataframes[str(row['CHILD TABLE'])].columns | |
| pvals= list(st.session_state.dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values) | |
| cvals= list(st.session_state.dataframes[str(row['CHILD TABLE'])][row['CHILD TABLE RELATIONSHIP COLUMN']].values) | |
| match = [val for val in pvals if val in cvals] | |
| #st.write(match) | |
| if row['PARENT TABLE RELATIONSHIP COLUMN'] not in pcolchecklist: | |
| st.error(f"{row['PARENT TABLE RELATIONSHIP COLUMN']} does not belong to {row['PARENT TABLE']}") | |
| else: | |
| pass | |
| if row['CHILD TABLE RELATIONSHIP COLUMN'] not in ccolchecklist: | |
| st.error(f"{row['CHILD TABLE RELATIONSHIP COLUMN']} does not belong to {row['CHILD TABLE']}") | |
| else: | |
| pass | |
| if (row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist): | |
| pvals= list(st.session_state.dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values) | |
| cvals= list(st.session_state.dataframes[str(row['CHILD TABLE'])][row['CHILD TABLE RELATIONSHIP COLUMN']].values) | |
| match = [val for val in pvals if val in cvals] | |
| if match == []: | |
| st.error(f"The Joining Condition Between column: {row['PARENT TABLE RELATIONSHIP COLUMN']} from Table: {row['PARENT TABLE']} and column: {row['CHILD TABLE RELATIONSHIP COLUMN']} from Table: {row['CHILD TABLE']} does not yield any record. ") | |
| if ((row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist)) and (match != []): | |
| # primary_check = len(list(dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)) == dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].nunique() | |
| # if primary_check: | |
| # pass | |
| # else: | |
| # st.error(f"The Column {row['PARENT TABLE RELATIONSHIP COLUMN']} from Table: {row['PARENT TABLE']} has duplicate records and hence can not be considered as Primary Key.") | |
| pass | |
| add = st.button("Add Relationship", key='add') | |
| if add: | |
| if ((row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist)) and ((match != [])): | |
| add_df = edited_map_df | |
| else: | |
| add_df = st.session_state.rdf | |
| else: | |
| add_df = st.session_state.rdf | |
| add_df['CARDINALITY'] = add_df.apply( | |
| lambda row: cardinality( | |
| st.session_state.dataframes[str(row['PARENT TABLE'])], | |
| st.session_state.dataframes[str(row['CHILD TABLE'])], | |
| str(row['PARENT TABLE RELATIONSHIP COLUMN']), | |
| str(row['CHILD TABLE RELATIONSHIP COLUMN'])),axis=1) | |
| st.session_state.add_df = add_df | |
| edited_map_df = st.session_state.add_df | |
| rel_tabs = list(add_df['PARENT TABLE'].values) + list(add_df['CHILD TABLE'].values) | |
| unrel_tabs = [tab for tab in tab_names if tab not in rel_tabs] | |
| st.info(f"""Unrelated tables due to undetected pattern: {str(unrel_tabs).replace("[","").replace("]","")}""") | |
| G, table_columns = create_er_diagram(st.session_state.add_df) | |
| img_bytes= draw_er_diagram(G, table_columns) | |
| col21, col22= st.columns([1,8]) | |
| with col21: | |
| if st.button("Regenerate"): | |
| st.rerun() | |
| with col22: | |
| st.download_button( | |
| label="Download ER Diagram", | |
| data=img_bytes, | |
| file_name="er_diagram.png", | |
| mime="image/png" | |
| ) | |