# NOTE(review): stray "Spaces:" / "Build error" banner lines from a bad paste were removed here.
"""Streamlit app: automated data catalogue / discovery.

Two sources are supported:
  * DB_10001 — a folder of bike-store CSV extracts, profiled with SDV
    (multi-table relationship detection or per-table column details).
  * Single Table Discovery — an Azure SQL database read via pyodbc, with a
    Gemini-generated narration of the selected table.
"""
import pandas as pd
import numpy as np
import streamlit as st
import sdv
from sdv.datasets.local import load_csvs
from sdv.metadata import MultiTableMetadata
from sdv.multi_table import HMASynthesizer
import time
import os
import gc
import warnings
from PIL import Image
from sdv.metadata import SingleTableMetadata
import pyodbc
import google.generativeai as genai
import textwrap
from streamlit_extras.stylable_container import stylable_container

# Folder holding the bike-store CSV extracts. Raw string: the path contains
# backslash sequences (\A, \M, \D, \B) that are invalid escapes in a plain literal.
CSV_FOLDER = r'C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE'

# SECURITY: hard-coded API key checked into source — move to st.secrets or an
# environment variable, and rotate the exposed key.
genai.configure(api_key='AIzaSyDgS-r-wKmJJ6g2SawaV8ULa-DpTvRjBa0')
genai_mod = genai.GenerativeModel(
    model_name='models/gemini-1.5-pro-latest'
)

st.set_page_config(page_title='DATA DISCOVERY')
st.title('AUTOMATED DATA CATALOGUE')
st.subheader('SELECT SOURCE')

select1 = st.selectbox('SOURCE NAME', ('DB_10001', 'Marcopolo_db'),
                       key='dbname', index=None, placeholder='Select database name')

if select1 == 'DB_10001':
    # Load every CSV in the folder once; dict keys are the table (file) names.
    datasets = load_csvs(
        folder_name=CSV_FOLDER,
        read_csv_parameters={
            'skipinitialspace': True,
            'encoding': 'utf_8'
        })
    st.markdown(f"System has found :orange[**{str(len(datasets))} tables**] in the source. Please proceed with selection of mode of discovery.")
    select_main = st.selectbox('Please Select Mode of Discovery', ('Single Table Discovery', 'Multi Table Discovery'),
                               key='mainname', index=None, placeholder='Select Mode of Discovery')

    if select_main == 'Multi Table Discovery':
        with st.spinner('Performing Data Discovery'):
            time.sleep(2)  # cosmetic delay so the spinner is visible
        st.success('Data cataloguing complete!')
        # NOTE: the CSVs were already loaded above with identical parameters;
        # the original duplicate load_csvs() call was removed.
        metadata = MultiTableMetadata()
        metadata.detect_from_csvs(folder_name=CSV_FOLDER)
        python_dict = metadata.to_dict()
        st.markdown('---')
        st.subheader('DATA CATALOG')
        tables = python_dict['tables']   # per-table metadata dicts keyed by table name
        table_names = [*tables]          # keys in detection order

        col1, col2, col3 = st.columns([2, 2, 2])
        with col1:
            def view_callback():
                # Leave "table details" mode whenever another action is chosen.
                st.session_state.tdet = False
            view = st.button("LIST TABLES", key='view', on_click=view_callback)
        with col2:
            if 'tdet' not in st.session_state:
                st.session_state.tdet = False
            tdet1 = st.button("SHOW TABLE DETAILS")
        with col3:
            rel = st.button('SHOW RELATIONSHIPS', key='rel', on_click=view_callback)

        if tdet1:
            # Persist "table details" mode across the reruns triggered by the
            # widgets rendered below.
            st.session_state.tdet = tdet1
        if view:
            st.write(pd.DataFrame(table_names, columns=['TABLE NAME']))
        if rel:
            # One row per parent/child foreign-key relationship SDV detected.
            rel_rows = [{'PARENT TABLE': r['parent_table_name'],
                         'CHILD TABLE': r['child_table_name'],
                         'PARENT PRIMARY KEY': r['parent_primary_key'],
                         'CHILD FOREIGN KEY': r['child_foreign_key']}
                        for r in python_dict['relationships']]
            rdf = pd.DataFrame(rel_rows, columns=['PARENT TABLE', 'CHILD TABLE',
                                                  'PARENT PRIMARY KEY', 'CHILD FOREIGN KEY'])
            st.write(rdf)

        if st.session_state.tdet is True:
            def tdet_callback():
                # Keep details mode active when the table selection changes.
                st.session_state.tdet = True
            st.subheader('Select table name to view')
            sbox1 = st.selectbox('TABLE NAME', table_names, index=None,
                                 placeholder='Select table name', on_change=tdet_callback)
            col4, col5 = st.columns([1, 3])
            with col4:
                preview = st.button("PREVIEW TABLE", key='preview')
            with col5:
                cdet = st.button("GET COLUMN DETAILS", key='prof')
            if preview:
                st.write(datasets[sbox1])
            if cdet:
                t_dict = tables[sbox1]
                # One row per column: name, sdtype (plus format for datetimes),
                # and whether SDV flagged the column as PII.
                detail_rows = []
                for col_name, col_meta in t_dict['columns'].items():
                    pii_flag = 'YES' if 'pii' in col_meta else 'NO'
                    if col_meta['sdtype'] == 'datetime':
                        dtype_desc = col_meta['sdtype'] + ': ' + col_meta['datetime_format']
                    else:
                        dtype_desc = col_meta['sdtype']
                    detail_rows.append({'Column Name': col_name,
                                        'Data Type': dtype_desc,
                                        'Personal Identifiable Information': pii_flag})
                cdetails = pd.DataFrame(detail_rows,
                                        columns=['Column Name', 'Data Type',
                                                 'Personal Identifiable Information'])
                if 'primary_key' in t_dict:
                    st.write('Primary Key:', t_dict['primary_key'])
                else:
                    st.write('Primary Key: No key can be detected')
                st.write(cdetails)

    if select_main == 'Single Table Discovery':
        metadata = SingleTableMetadata()
        # SECURITY: hard-coded database credentials — move to st.secrets or an
        # environment variable, and rotate the exposed password.
        conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};"
                              "Server=ipzilnpxsssp001.database.windows.net;"
                              "Database=Marcopolo_DB;"
                              "UID=ssikder004;"
                              "PWD=Marcopolo@123")
        query1 = "select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='Client' ORDER BY TABLE_NAME ASC"
        table1 = pd.read_sql_query(query1, con=conn)
        table1['TABLE_NAME'] = table1['TABLE_NAME'].astype('str')
        table_selector = st.selectbox('SOURCE TABLE NAME',
                                      ['brands', 'categories', 'CUSTOMER_MASTER_TBL_1', 'orders', 'order_items',
                                       'products', 'staffs', 'stocks', 'stores'],
                                      index=None, placeholder='Select table for automated column mapping')
        if table_selector is not None:
            st.markdown('---')
            # The table name comes from the fixed whitelist above, so the
            # bracketed interpolation is not injectable here.
            query2 = "select * from [Client].[" + table_selector + "]"
            df = pd.read_sql_query(query2, con=conn)
            main_list = df.columns.to_list()
            sub_list = ['ID', 'LOADID', 'FILE_NAME']
            # Drop ETL bookkeeping columns only when they appear as a
            # contiguous run in that exact order.
            if any(main_list[i:i + len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
                df = df.drop(['ID', 'LOADID', 'FILE_NAME'], axis=1)
            conn.close()
            metadata = SingleTableMetadata()
            metadata.detect_from_dataframe(df)
            python_dict = metadata.to_dict()
            # Generate a narration once per table and cache it in session state.
            if f'cont_{table_selector}' not in st.session_state:
                with st.spinner("Processing Table"):
                    # (The original re-created genai_mod here; the module-level
                    # instance is identical, so the duplicate was removed.)
                    if 'primary_key' in python_dict:
                        primary_key = python_dict['primary_key']
                    else:
                        primary_key = "Could Not be Identified"
                    story = f""" Details of the table:
table columns: {str(list(df.columns))}
column datatypes: {str(df.dtypes.to_string())}
table sample data: {df.head(10).to_string()}
"""
                    response = genai_mod.generate_content(textwrap.dedent("""
        You are a SAP Data Migration expert. Please return a narration about the data. The narration should Include primary key name(if any) and a intellectual guess about the table schema. The data is a SAP data, you have to guess the object name/class name/schema name etc. of that data. Don't add unnecessary details. Strictly stick to the informations provided only.
        Important: Please consider All fields are mandetorily during your analysis.
        Here is the table details:
        """) + story + f"The Primary Key is:{primary_key}")
                    st.write(response.usage_metadata)
                    st.session_state[f'cont_{table_selector}'] = response.text
            with stylable_container(
                key="container_with_border",
                css_styles="""
                    {
                        border: 1px solid white;
                        border-radius: 0.5rem;
                        padding: calc(1em - 1px);
                        width: 110%; /* Set container width to 100% */
                    }
                    """
            ):
                st.write(st.session_state[f'cont_{table_selector}'])
            col9, col10, col11 = st.columns([2, 3, 9])
            with col9:
                preview = st.button("PREVIEW TABLE", key='preview')
            if preview:
                st.dataframe(df)