awacke1 committed on
Commit
d725a10
·
verified ·
1 Parent(s): a96a313

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +310 -0
app.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import os
4
+ import glob
5
+ import matplotlib.pyplot as plt
6
+ import base64
7
+ from io import BytesIO, StringIO
8
+
9
+ # --- Configuration & Data Loading ---
10
+
11
# Configure the Streamlit page: browser-tab title and icon, wide layout,
# an initially expanded sidebar, and the entries of the app's menu.
st.set_page_config(
    page_title="πŸ©ΊπŸ” NPI Specialty Search Engine",
    # NOTE(review): this icon value looks like a mis-encoded / replacement
    # character rather than a real emoji — confirm the intended icon.
    page_icon="οΏ½",
    layout="wide",
    initial_sidebar_state="expanded",
    # Links and "About" text shown in the app menu.
    menu_items={
        'Get Help': 'https://huggingface.co/awacke1',
        'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
        'About': "# πŸ©ΊπŸ” Care Team Finder By Aaron Wacker - https://huggingface.co/awacke1"
    }
)
22
+
23
+ # Headers for the NPI CSV files
24
# Column names applied to the NPI CSV files when they are parsed.
# The first 51 entries cover the provider record and its primary taxonomy slot.
HEADERS = [
    "NPI",
    "EntityTypeCode",
    "ReplacementNPI",
    "EmployerIdentificationNumberEIN",
    "ProviderOrganizationNameLegalBusinessName",
    "ProviderLastNameLegalName",
    "ProviderFirstName",
    "ProviderMiddleName",
    "ProviderNamePrefixText",
    "ProviderNameSuffixText",
    "ProviderCredentialText",
    "ProviderOtherOrganizationName",
    "ProviderOtherOrganizationNameTypeCode",
    "ProviderOtherLastName",
    "ProviderOtherFirstName",
    "ProviderOtherMiddleName",
    "ProviderOtherNamePrefixText",
    "ProviderOtherNameSuffixText",
    "ProviderOtherCredentialText",
    "ProviderOtherLastNameTypeCode",
    "ProviderFirstLineBusinessMailingAddress",
    "ProviderSecondLineBusinessMailingAddress",
    "ProviderBusinessMailingAddressCityName",
    "ProviderBusinessMailingAddressStateName",
    "ProviderBusinessMailingAddressPostalCode",
    "ProviderBusinessMailingAddressCountryCodeIfoutsideUS",
    "ProviderBusinessMailingAddressTelephoneNumber",
    "ProviderBusinessMailingAddressFaxNumber",
    "ProviderFirstLineBusinessPracticeLocationAddress",
    "ProviderSecondLineBusinessPracticeLocationAddress",
    "ProviderBusinessPracticeLocationAddressCityName",
    "ProviderBusinessPracticeLocationAddressStateName",
    "ProviderBusinessPracticeLocationAddressPostalCode",
    "ProviderBusinessPracticeLocationAddressCountryCodeIfoutsideUS",
    "ProviderBusinessPracticeLocationAddressTelephoneNumber",
    "ProviderBusinessPracticeLocationAddressFaxNumber",
    "ProviderEnumerationDate",
    "LastUpdateDate",
    "NPIDeactivationReasonCode",
    "NPIDeactivationDate",
    "NPIReactivationDate",
    "ProviderGenderCode",
    "AuthorizedOfficialLastName",
    "AuthorizedOfficialFirstName",
    "AuthorizedOfficialMiddleName",
    "AuthorizedOfficialTitleorPosition",
    "AuthorizedOfficialTelephoneNumber",
    "HealthcareProviderTaxonomyCode",
    "ProviderLicenseNumber",
    "ProviderLicenseNumberStateCode",
    "HealthcareProviderPrimaryTaxonomySwitch",
]

# Taxonomy slots 2 through 15 repeat the same four fields with a "_<slot>" suffix.
for i in range(2, 16):
    HEADERS += [
        f"{field}_{i}"
        for field in (
            "HealthcareProviderTaxonomyCode",
            "ProviderLicenseNumber",
            "ProviderLicenseNumberStateCode",
            "HealthcareProviderPrimaryTaxonomySwitch",
        )
    ]
55
+
56
+
57
@st.cache_data
def load_specialties(csv_file='Provider-Specialty.csv'):
    """Load the provider specialty taxonomy table from a CSV file.

    The result is cached with ``st.cache_data`` (Streamlit's cache for
    data such as DataFrames), so each caller receives its own copy rather
    than one mutable object shared by every session.

    Args:
        csv_file: Path of the taxonomy CSV. The rest of the app assumes it
            has the columns Code, Grouping, Classification, Specialization,
            Display Name and Definition.

    Returns:
        The parsed DataFrame, or an empty DataFrame (after an error has
        been shown in the UI) when the file is missing or has no content.
    """
    try:
        return pd.read_csv(csv_file)
    except FileNotFoundError:
        st.error(f"Error: The specialty file '{csv_file}' was not found. Please make sure it's in the correct directory.")
    except pd.errors.EmptyDataError:
        # A zero-byte file would otherwise crash the app on start-up.
        st.error(f"Error: The specialty file '{csv_file}' is empty.")
    return pd.DataFrame()
66
+
67
@st.cache_data
def find_state_files():
    """Return the paths of the state data files in the current directory.

    A state file is a ``*.csv`` whose name without the ``.csv`` extension
    is exactly two characters long (e.g. ``./TX.csv``). Names with extra
    dots such as ``TX.old.csv`` are rejected: they would be offered as
    state "TX" even though the search code opens ``./TX.csv``.

    Returns:
        A list of matching paths, in the order ``glob`` yields them.
    """
    state_files = []
    for path in glob.glob('./*.csv'):
        stem, _extension = os.path.splitext(os.path.basename(path))
        if len(stem) == 2:
            state_files.append(path)
    return state_files
71
+
72
+ # --- Helper Functions ---
73
def get_excel_download_link(df, filename="data.xlsx", text="Download as Excel"):
    """Build an HTML anchor that downloads a DataFrame as an Excel file.

    The workbook is written in memory with the ``xlsxwriter`` engine and
    embedded in the link as a base64 ``data:`` URI.

    Args:
        df: DataFrame to export (written to sheet ``Sheet1`` without index).
        filename: Suggested download name. It is HTML-escaped because
            callers derive it from user-entered search text and the link
            is rendered with ``unsafe_allow_html=True``.
        text: Link label, inserted as-is so callers may pass markup.

    Returns:
        The ``<a>`` element as a string, or ``""`` if the export failed
        (an error is shown in the UI in that case).
    """
    import html  # local import: only needed for escaping the attribute value

    output = BytesIO()
    try:
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False, sheet_name='Sheet1')
        excel_data = output.getvalue()
        b64 = base64.b64encode(excel_data).decode()
        # Escape quotes and angle brackets so the name cannot break out of
        # the download="..." attribute.
        safe_filename = html.escape(str(filename), quote=True)
        href = f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{safe_filename}">{text}</a>'
        return href
    except Exception as e:
        st.error(f"Failed to create Excel file: {e}")
        return ""
86
+
87
+ # --- Core Search Logic ---
88
+
89
def search_providers_by_specialty(specialty_codes, states_to_search, specialties_df):
    """Find providers whose primary taxonomy code is one of the given codes.

    Args:
        specialty_codes: Taxonomy codes to match against the
            ``HealthcareProviderTaxonomyCode`` column.
        states_to_search: State identifiers; each is read from
            ``./<state>.csv``. Missing files produce a UI warning and are
            skipped.
        specialties_df: Taxonomy table used to look up the descriptive
            fields (Code, Grouping, Classification, Display Name).

    Returns:
        A ``(results, city_counts)`` tuple. ``results`` is a list of
        ``(state, info_dict, providers_df)`` entries, one per state and
        matched code that has a row in ``specialties_df``. ``city_counts``
        maps practice-location city name to the number of matched
        providers across all states.
    """
    taxonomy_col = 'HealthcareProviderTaxonomyCode'
    city_col = 'ProviderBusinessPracticeLocationAddressCityName'
    matches = []
    providers_per_city = {}

    for state in states_to_search:
        csv_path = f'./{state}.csv'
        if not os.path.exists(csv_path):
            st.warning(f"Data file for {state} not found. Skipping.")
            continue
        try:
            # Every column is loaded because the full rows are shown to the user.
            providers = pd.read_csv(
                csv_path,
                header=None,
                names=HEADERS,
                usecols=[taxonomy_col] + [h for h in HEADERS if h != taxonomy_col],
                low_memory=False,
                dtype=str,
            )
            selected = providers[providers[taxonomy_col].isin(specialty_codes)]
            if selected.empty:
                continue

            # Accumulate per-city totals across all states.
            for city, n_providers in selected[city_col].value_counts().items():
                providers_per_city[city] = providers_per_city.get(city, 0) + n_providers

            # One result entry per matched code that the taxonomy table knows.
            for code in selected[taxonomy_col].unique():
                taxonomy_rows = specialties_df[specialties_df['Code'] == code]
                if taxonomy_rows.empty:
                    continue
                per_code = selected[selected[taxonomy_col] == code].copy()
                info = taxonomy_rows[['Code', 'Grouping', 'Classification', 'Display Name']].iloc[0].to_dict()
                matches.append((state, info, per_code))
        except Exception as e:
            st.error(f"An error occurred while processing the file for {state}: {e}")

    return matches, providers_per_city
117
+
118
def search_by_name(name_keyword, states_to_search):
    """Text-search the state files for a keyword and parse the matching rows.

    Every line of ``./<state>.csv`` that contains ``name_keyword``
    (case-insensitively, anywhere in the line) is collected, and the
    collected lines are parsed as CSV.

    Args:
        name_keyword: Substring to look for.
        states_to_search: State identifiers; files that do not exist are
            skipped silently.

    Returns:
        A DataFrame of the matching rows with all values as strings, or
        an empty DataFrame when nothing matched. Columns are named from
        ``HEADERS``, limited to the widest matching row.
    """
    import csv  # local import: used only to count fields correctly

    needle = name_keyword.lower()  # hoisted: invariant across all lines
    all_matching_lines = []
    for state in states_to_search:
        file_path = f'./{state}.csv'
        if not os.path.exists(file_path):
            continue
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                # Strip line terminators so re-joining adds no blank lines
                # and a file without a final newline cannot merge with the
                # next file's first row.
                matching_lines = [
                    line.rstrip('\r\n') for line in f if needle in line.lower()
                ]
            all_matching_lines.extend(line for line in matching_lines if line)
        except Exception as e:
            st.error(f"An error occurred while reading the file for {state}: {e}")

    if not all_matching_lines:
        return pd.DataFrame()

    # Count fields with a real CSV parser: a plain split(',') also counts
    # commas inside quoted values and so overstates the width.
    max_cols = max(len(next(csv.reader([line]), [])) for line in all_matching_lines)
    dynamic_headers = HEADERS[:max_cols]
    # Rows wider than HEADERS get placeholder names; with too few names,
    # pandas can treat the surplus leading columns as an index, which
    # shifts every value under the wrong header.
    dynamic_headers += [
        f"ExtraColumn_{position}"
        for position in range(len(HEADERS) + 1, max_cols + 1)
    ]

    csv_data = "\n".join(all_matching_lines)
    results_df = pd.read_csv(StringIO(csv_data), header=None, names=dynamic_headers, low_memory=False, dtype=str)
    return results_df
146
+
147
+ # --- Streamlit UI ---
148
+
149
# Page heading, tagline and a divider.
for heading in (
    "# πŸ©ΊπŸ” NPI Specialty Search Engine",
    "#### Search for healthcare providers by specialty or name across multiple states.",
    "---",
):
    st.markdown(heading)

# Nothing below works without the taxonomy table, so halt the script if it
# could not be loaded.
specialties = load_specialties()
if specialties.empty:
    st.stop()

# Search mode selector.
search_type = st.radio(
    "Select Search Type:",
    ('Search by Specialty', 'Search by Name'),
    horizontal=True,
)

# State choices come from the data files present; the preselection is
# limited to states that actually have a file.
state_files = find_state_files()
state_options = sorted(os.path.basename(path).split('.')[0] for path in state_files)
default_states = ["NM", "TX", "FL", "MN", "WI"]
valid_default_states = [code for code in default_states if code in state_options]
164
+
165
if search_type == 'Search by Specialty':
    # --- Specialty Search UI & Logic ---
    with st.form(key='specialty_search_form'):
        col1, col2 = st.columns([4, 1])
        with col1:
            search_keyword = st.text_input('Enter a specialty keyword to search πŸ”', placeholder="e.g., Cardiology, Pediatrics")
        with col2:
            st.write("")  # Spacer
            submit_search = st.form_submit_button(label='Go πŸš€')
        selected_states = st.multiselect('Select States to Search πŸ—ΊοΈ', options=state_options, default=valid_default_states)

    if submit_search:
        if not search_keyword:
            st.warning("Please enter a specialty keyword.")
        elif not selected_states:
            st.warning("Please select at least one state.")
        else:
            # Match the keyword literally (regex=False): user input such as
            # "(" or "+" would otherwise be compiled as a regular expression
            # and raise. A row matches if any of its cells contains it.
            mask = specialties.astype(str).apply(
                lambda column: column.str.contains(search_keyword, case=False, regex=False)
            ).any(axis=1)
            final_specialty_df = specialties[mask]

            if final_specialty_df.empty:
                st.error(f"No specialties found for '{search_keyword}'.")
            else:
                st.markdown(f"##### Found {len(final_specialty_df)} specialty matches for '{search_keyword}'")
                st.dataframe(final_specialty_df[['Code', 'Display Name', 'Classification']])
                specialty_codes = final_specialty_df['Code'].tolist()

                with st.spinner(f"Searching for providers in {', '.join(selected_states)}..."):
                    state_data, city_counts = search_providers_by_specialty(specialty_codes, selected_states, specialties)

                if state_data:
                    st.success(f"Found providers in {len(state_data)} matching category/state combination(s).")
                    st.markdown("---")

                    for state, info, df in state_data:
                        expander_title = f"**{state}** | {info['Classification']} ({info['Code']}) - {len(df)} Providers Found"
                        with st.expander(expander_title):
                            # Summary view; missing values become '' so the
                            # string concatenations never yield NaN.
                            summary_df = pd.DataFrame({
                                "Provider Name": (df["ProviderFirstName"].fillna('') + " " + df["ProviderLastNameLegalName"].fillna('')).str.strip(),
                                "Organization Name": df["ProviderOrganizationNameLegalBusinessName"].fillna(''),
                                "NPI": df["NPI"],
                                "License Number": (df["ProviderLicenseNumber"].fillna('') + " (" + df["ProviderLicenseNumberStateCode"].fillna('') + ")").str.strip(),
                                "Address": (df["ProviderFirstLineBusinessPracticeLocationAddress"].fillna('') + ", " + df["ProviderBusinessPracticeLocationAddressCityName"].fillna('') + ", " + df["ProviderBusinessPracticeLocationAddressStateName"].fillna('') + " " + df["ProviderBusinessPracticeLocationAddressPostalCode"].fillna('')).str.strip(', '),
                                "Phone": df["ProviderBusinessPracticeLocationAddressTelephoneNumber"].fillna('')
                            })

                            tab1, tab2 = st.tabs(["Physician Summary", "Full Details"])
                            with tab1:
                                st.dataframe(summary_df)
                            with tab2:
                                st.dataframe(df)

                            # str() guards against a non-string (e.g. missing)
                            # display name in the taxonomy table.
                            display_name = str(info['Display Name']).replace(' ', '_').replace('/', '_')
                            file_name = f"{state}_{display_name}-{info['Code']}_summary.xlsx"
                            excel_link = get_excel_download_link(summary_df, filename=file_name, text=f"πŸ“₯ Download Summary for {state}")
                            st.markdown(excel_link, unsafe_allow_html=True)

                    if city_counts:
                        st.markdown("---")
                        st.subheader("Provider Counts by City (Across All Selected States)")
                        top_cities = dict(sorted(city_counts.items(), key=lambda item: item[1], reverse=True)[:25])
                        fig, ax = plt.subplots(figsize=(12, 8))
                        ax.bar(top_cities.keys(), top_cities.values(), color='skyblue')
                        ax.set_title('Top 25 Cities by Provider Count', fontsize=16)
                        plt.xticks(rotation=45, ha='right')
                        st.pyplot(fig)
                        # Release the figure; pyplot otherwise keeps every
                        # figure created by earlier searches alive.
                        plt.close(fig)
                else:
                    st.info(f"No matching provider records found for '{search_keyword}'.")

elif search_type == 'Search by Name':
    # --- Name Search UI & Logic (REVISED) ---
    with st.form(key='name_search_form'):
        col1, col2 = st.columns([4, 1])
        with col1:
            name_keyword = st.text_input('Enter a provider or organization name to search πŸ”', placeholder="e.g., Carlucci, Mayo")
        with col2:
            st.write("")  # Spacer
            submit_name_search = st.form_submit_button(label='Go πŸš€')
        selected_states_name = st.multiselect('Select States to Search πŸ—ΊοΈ', options=state_options, default=valid_default_states, key="name_search_states")

    if submit_name_search:
        if not name_keyword:
            st.warning("Please enter a name to search.")
        elif not selected_states_name:
            st.warning("Please select at least one state.")
        else:
            with st.spinner(f"Searching for '{name_keyword}' in {', '.join(selected_states_name)}..."):
                results_df = search_by_name(name_keyword, selected_states_name)

            if not results_df.empty:
                st.success(f"Found {len(results_df)} records containing '{name_keyword}'.")
                st.markdown("---")

                # The result only has as many named columns as its widest
                # row, so reindex to guarantee the summary columns exist
                # (absent ones become all-missing instead of a KeyError).
                summary_source = results_df.reindex(columns=[
                    "ProviderFirstName",
                    "ProviderLastNameLegalName",
                    "ProviderOrganizationNameLegalBusinessName",
                    "NPI",
                    "HealthcareProviderTaxonomyCode",
                    "ProviderBusinessPracticeLocationAddressCityName",
                    "ProviderBusinessPracticeLocationAddressStateName",
                    "ProviderBusinessPracticeLocationAddressTelephoneNumber",
                ])

                # Create and display a summary of the found records
                summary_df = pd.DataFrame({
                    "Provider Name": (summary_source["ProviderFirstName"].fillna('') + " " + summary_source["ProviderLastNameLegalName"].fillna('')).str.strip(),
                    "Organization Name": summary_source["ProviderOrganizationNameLegalBusinessName"].fillna(''),
                    "NPI": summary_source["NPI"],
                    "Primary Specialty Code": summary_source["HealthcareProviderTaxonomyCode"].fillna(''),
                    "City": summary_source["ProviderBusinessPracticeLocationAddressCityName"].fillna(''),
                    "State": summary_source["ProviderBusinessPracticeLocationAddressStateName"].fillna(''),
                    "Phone": summary_source["ProviderBusinessPracticeLocationAddressTelephoneNumber"].fillna('')
                })
                st.dataframe(summary_df)

                # Download link for the summary
                file_name = f"{name_keyword.replace(' ', '_')}_name_search_summary.xlsx"
                excel_link = get_excel_download_link(summary_df, filename=file_name, text="πŸ“₯ Download Summary")
                st.markdown(excel_link, unsafe_allow_html=True)

                # --- NEW: Specialty Code Synopsis Section ---
                st.markdown("---")
                st.subheader("Specialty Code Synopsis")
                st.markdown("This section explains the specialty codes found in the search results above.")

                # Collect all unique taxonomy codes from the results
                # (primary and suffixed taxonomy columns alike).
                taxonomy_cols = [col for col in results_df.columns if 'HealthcareProviderTaxonomyCode' in col]
                unique_codes = pd.unique(results_df[taxonomy_cols].values.ravel('K'))
                unique_codes = [code for code in unique_codes if pd.notna(code)]

                if not unique_codes:
                    st.info("No specialty codes were found in the search results.")
                else:
                    # Filter the specialties dataframe to get details for the found codes
                    synopsis_df = specialties[specialties['Code'].isin(unique_codes)].copy()
                    synopsis_df = synopsis_df[['Code', 'Display Name', 'Classification', 'Definition']].reset_index(drop=True)

                    if synopsis_df.empty:
                        st.warning("Could not find definitions for the specialty codes in the results.")
                    else:
                        st.dataframe(synopsis_df)

                # Expander for the full, raw data
                with st.expander("View Full (Raw) Data"):
                    st.dataframe(results_df)

            else:
                st.info(f"No records found containing '{name_keyword}' in the selected state files.")
300
+
301
+ # --- Explainer Section ---
302
# Collapsible glossary of the taxonomy fields shown in the results.
# (The stray replacement character that followed this block has been
# removed: it is not valid Python and stops the script from parsing.)
with st.expander('🩺 Understand Provider Specialties πŸ“'):
    st.markdown('''
    - **Code**: A unique ID that clearly identifies each specialty. πŸ†”
    - **Grouping**: The broad category or umbrella for a general area of expertise. 🏷️
    - **Classification**: Specifies the type of practice within a broader category. 🎯
    - **Specialization**: Details the specific focus within a classification for precise expertise. πŸ”
    - **Definition**: A concise overview of the specialty's scope of practice. πŸ“–
    ''')