Create app.py
app.py ADDED
@@ -0,0 +1,310 @@
import streamlit as st
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import base64
from io import BytesIO, StringIO

# --- Configuration & Data Loading ---

st.set_page_config(
    page_title="🩺🔍 NPI Specialty Search Engine",
    page_icon="🩺",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/awacke1',
        'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
        'About': "# 🩺🔍 Care Team Finder By Aaron Wacker - https://huggingface.co/awacke1"
    }
)

# Headers for the NPI CSV files
HEADERS = [
    "NPI", "EntityTypeCode", "ReplacementNPI", "EmployerIdentificationNumberEIN",
    "ProviderOrganizationNameLegalBusinessName", "ProviderLastNameLegalName",
    "ProviderFirstName", "ProviderMiddleName", "ProviderNamePrefixText",
    "ProviderNameSuffixText", "ProviderCredentialText", "ProviderOtherOrganizationName",
    "ProviderOtherOrganizationNameTypeCode", "ProviderOtherLastName", "ProviderOtherFirstName",
    "ProviderOtherMiddleName", "ProviderOtherNamePrefixText", "ProviderOtherNameSuffixText",
    "ProviderOtherCredentialText", "ProviderOtherLastNameTypeCode",
    "ProviderFirstLineBusinessMailingAddress", "ProviderSecondLineBusinessMailingAddress",
    "ProviderBusinessMailingAddressCityName", "ProviderBusinessMailingAddressStateName",
    "ProviderBusinessMailingAddressPostalCode", "ProviderBusinessMailingAddressCountryCodeIfoutsideUS",
    "ProviderBusinessMailingAddressTelephoneNumber", "ProviderBusinessMailingAddressFaxNumber",
    "ProviderFirstLineBusinessPracticeLocationAddress", "ProviderSecondLineBusinessPracticeLocationAddress",
    "ProviderBusinessPracticeLocationAddressCityName", "ProviderBusinessPracticeLocationAddressStateName",
    "ProviderBusinessPracticeLocationAddressPostalCode", "ProviderBusinessPracticeLocationAddressCountryCodeIfoutsideUS",
    "ProviderBusinessPracticeLocationAddressTelephoneNumber", "ProviderBusinessPracticeLocationAddressFaxNumber",
    "ProviderEnumerationDate", "LastUpdateDate", "NPIDeactivationReasonCode",
    "NPIDeactivationDate", "NPIReactivationDate", "ProviderGenderCode",
    "AuthorizedOfficialLastName", "AuthorizedOfficialFirstName", "AuthorizedOfficialMiddleName",
    "AuthorizedOfficialTitleorPosition", "AuthorizedOfficialTelephoneNumber",
    "HealthcareProviderTaxonomyCode", "ProviderLicenseNumber", "ProviderLicenseNumberStateCode",
    "HealthcareProviderPrimaryTaxonomySwitch"
]
# Add columns for other taxonomies, up to 15
for i in range(2, 16):
    HEADERS.extend([
        f"HealthcareProviderTaxonomyCode_{i}",
        f"ProviderLicenseNumber_{i}",
        f"ProviderLicenseNumberStateCode_{i}",
        f"HealthcareProviderPrimaryTaxonomySwitch_{i}"
    ])
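# Sanity check (descriptive only, not enforced anywhere below): HEADERS now holds
# 51 base names plus 14 extra taxonomy blocks of 4 names each, i.e. 107 columns.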

@st.cache_resource
def load_specialties(csv_file='Provider-Specialty.csv'):
    """Loads the provider specialty taxonomy from a CSV file."""
    try:
        # Assuming the CSV has columns: Code, Grouping, Classification, Specialization, Display Name, Definition
        return pd.read_csv(csv_file)
    except FileNotFoundError:
        st.error(f"Error: The specialty file '{csv_file}' was not found. Please make sure it's in the correct directory.")
        return pd.DataFrame()

@st.cache_resource
def find_state_files():
    """Finds all available state CSV files in the current directory."""
    return [file for file in glob.glob('./*.csv') if len(os.path.basename(file).split('.')[0]) == 2]
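# find_state_files() keeps only CSVs whose file stem is exactly two characters,
# e.g. './MN.csv' -> 'MN'; 'Provider-Specialty.csv' is skipped because its stem is longer.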

# --- Helper Functions ---
def get_excel_download_link(df, filename="data.xlsx", text="Download as Excel"):
    """Generates a link to download a pandas DataFrame as an Excel file."""
    output = BytesIO()
    try:
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False, sheet_name='Sheet1')
        excel_data = output.getvalue()
        b64 = base64.b64encode(excel_data).decode()
        href = f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">{text}</a>'
        return href
    except Exception as e:
        st.error(f"Failed to create Excel file: {e}")
        return ""
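# Note: the base64 data-URI anchor above lets the download be rendered through st.markdown;
# st.download_button is Streamlit's built-in alternative if injecting raw HTML is undesirable.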

# --- Core Search Logic ---

def search_providers_by_specialty(specialty_codes, states_to_search, specialties_df):
    """Searches for providers by specialty codes across selected states.

    Returns a list of (state, specialty-info dict, DataFrame) tuples plus a
    dictionary of provider counts per practice-location city.
    """
    results = []
    city_counts = {}
    for state in states_to_search:
        file_path = f'./{state}.csv'
        if not os.path.exists(file_path):
            st.warning(f"Data file for {state} not found. Skipping.")
            continue
        try:
            # Read every column as a string so NPIs, ZIP codes and phone numbers keep leading zeros.
            state_df = pd.read_csv(file_path, header=None, names=HEADERS, usecols=HEADERS, low_memory=False, dtype=str)
            filtered_df = state_df[state_df['HealthcareProviderTaxonomyCode'].isin(specialty_codes)]

            if not filtered_df.empty:
                for city, count in filtered_df['ProviderBusinessPracticeLocationAddressCityName'].value_counts().items():
                    city_counts[city] = city_counts.get(city, 0) + count

                for code in filtered_df['HealthcareProviderTaxonomyCode'].unique():
                    code_specific_df = filtered_df[filtered_df['HealthcareProviderTaxonomyCode'] == code].copy()
                    display_info_rows = specialties_df[specialties_df['Code'] == code]
                    if not display_info_rows.empty:
                        display_info = display_info_rows[['Code', 'Grouping', 'Classification', 'Display Name']].iloc[0].to_dict()
                        results.append((state, display_info, code_specific_df))

        except Exception as e:
            st.error(f"An error occurred while processing the file for {state}: {e}")
    return results, city_counts

def search_by_name(name_keyword, states_to_search):
    """
    Performs a text search for a keyword and returns a parsed DataFrame of matching providers.
    """
    all_matching_lines = []
    for state in states_to_search:
        file_path = f'./{state}.csv'
        if not os.path.exists(file_path):
            continue
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                matching_lines = [line for line in f if name_keyword.lower() in line.lower()]
                all_matching_lines.extend(matching_lines)
        except Exception as e:
            st.error(f"An error occurred while reading the file for {state}: {e}")

    if not all_matching_lines:
        return pd.DataFrame()

    # Join the matching CSV rows into a single string (each line already ends with a newline).
    csv_data = "".join(all_matching_lines)
    # Use dynamic headers up to the largest number of comma-separated fields in the found data.
    # (A plain comma count ignores quoting, so this is an upper bound on the real column count.)
    max_cols = max(len(line.split(',')) for line in all_matching_lines)
    dynamic_headers = HEADERS[:max_cols]

    results_df = pd.read_csv(StringIO(csv_data), header=None, names=dynamic_headers, low_memory=False, dtype=str)
    return results_df

# --- Streamlit UI ---

st.markdown("# 🩺🔍 NPI Specialty Search Engine")
st.markdown("#### Search for healthcare providers by specialty or name across multiple states.")
st.markdown("---")

specialties = load_specialties()
if specialties.empty:
    st.stop()

# Let user choose search mode
search_type = st.radio("Select Search Type:", ('Search by Specialty', 'Search by Name'), horizontal=True)

state_files = find_state_files()
state_options = sorted([os.path.basename(file).split('.')[0] for file in state_files])
default_states = ["NM", "TX", "FL", "MN", "WI"]
valid_default_states = [state for state in default_states if state in state_options]

if search_type == 'Search by Specialty':
    # --- Specialty Search UI & Logic ---
    with st.form(key='specialty_search_form'):
        col1, col2 = st.columns([4, 1])
        with col1:
            search_keyword = st.text_input('Enter a specialty keyword to search 🔍', placeholder="e.g., Cardiology, Pediatrics")
        with col2:
            st.write("")  # Spacer
            submit_search = st.form_submit_button(label='Go 🔍')
        selected_states = st.multiselect('Select States to Search 🗺️', options=state_options, default=valid_default_states)

    if submit_search:
        if not search_keyword: st.warning("Please enter a specialty keyword.")
        elif not selected_states: st.warning("Please select at least one state.")
        else:
            mask = specialties.apply(lambda row: row.astype(str).str.contains(search_keyword, case=False).any(), axis=1)
            final_specialty_df = specialties[mask]

            if final_specialty_df.empty:
                st.error(f"No specialties found for '{search_keyword}'.")
            else:
                st.markdown(f"##### Found {len(final_specialty_df)} specialty matches for '{search_keyword}'")
                st.dataframe(final_specialty_df[['Code', 'Display Name', 'Classification']])
                specialty_codes = final_specialty_df['Code'].tolist()

                with st.spinner(f"Searching for providers in {', '.join(selected_states)}..."):
                    state_data, city_counts = search_providers_by_specialty(specialty_codes, selected_states, specialties)

                if state_data:
                    st.success(f"Found providers in {len(state_data)} matching category/state combination(s).")
                    st.markdown("---")

                    for state, info, df in state_data:
                        expander_title = f"**{state}** | {info['Classification']} ({info['Code']}) - {len(df)} Providers Found"
                        with st.expander(expander_title):
                            # Create summary df with robust string concatenation
                            summary_df = pd.DataFrame({
                                "Provider Name": (df["ProviderFirstName"].fillna('') + " " + df["ProviderLastNameLegalName"].fillna('')).str.strip(),
                                "Organization Name": df["ProviderOrganizationNameLegalBusinessName"].fillna(''),
                                "NPI": df["NPI"],
                                "License Number": (df["ProviderLicenseNumber"].fillna('') + " (" + df["ProviderLicenseNumberStateCode"].fillna('') + ")").str.strip(),
                                "Address": (df["ProviderFirstLineBusinessPracticeLocationAddress"].fillna('') + ", " + df["ProviderBusinessPracticeLocationAddressCityName"].fillna('') + ", " + df["ProviderBusinessPracticeLocationAddressStateName"].fillna('') + " " + df["ProviderBusinessPracticeLocationAddressPostalCode"].fillna('')).str.strip(', '),
                                "Phone": df["ProviderBusinessPracticeLocationAddressTelephoneNumber"].fillna('')
                            })

                            tab1, tab2 = st.tabs(["Physician Summary", "Full Details"])
                            with tab1:
                                st.dataframe(summary_df)
                            with tab2:
                                st.dataframe(df)

                            # Pass the summary_df to the download link function
                            file_name = f"{state}_{info['Display Name'].replace(' ', '_').replace('/', '_')}-{info['Code']}_summary.xlsx"
                            excel_link = get_excel_download_link(summary_df, filename=file_name, text=f"📥 Download Summary for {state}")
                            st.markdown(excel_link, unsafe_allow_html=True)

                    if city_counts:
                        st.markdown("---")
                        st.subheader("Provider Counts by City (Across All Selected States)")
                        top_cities = dict(sorted(city_counts.items(), key=lambda item: item[1], reverse=True)[:25])
                        fig, ax = plt.subplots(figsize=(12, 8))
                        ax.bar(top_cities.keys(), top_cities.values(), color='skyblue')
                        ax.set_title('Top 25 Cities by Provider Count', fontsize=16)
                        plt.xticks(rotation=45, ha='right')
                        st.pyplot(fig)
                else:
                    st.info(f"No matching provider records found for '{search_keyword}'.")

elif search_type == 'Search by Name':
    # --- Name Search UI & Logic (REVISED) ---
    with st.form(key='name_search_form'):
        col1, col2 = st.columns([4, 1])
        with col1:
            name_keyword = st.text_input('Enter a provider or organization name to search 🔍', placeholder="e.g., Carlucci, Mayo")
        with col2:
            st.write("")  # Spacer
            submit_name_search = st.form_submit_button(label='Go 🔍')
        selected_states_name = st.multiselect('Select States to Search 🗺️', options=state_options, default=valid_default_states, key="name_search_states")

    if submit_name_search:
        if not name_keyword: st.warning("Please enter a name to search.")
        elif not selected_states_name: st.warning("Please select at least one state.")
        else:
            with st.spinner(f"Searching for '{name_keyword}' in {', '.join(selected_states_name)}..."):
                results_df = search_by_name(name_keyword, selected_states_name)

            if not results_df.empty:
                st.success(f"Found {len(results_df)} records containing '{name_keyword}'.")
                st.markdown("---")

                # Create and display a summary of the found records
                summary_df = pd.DataFrame({
                    "Provider Name": (results_df["ProviderFirstName"].fillna('') + " " + results_df["ProviderLastNameLegalName"].fillna('')).str.strip(),
                    "Organization Name": results_df["ProviderOrganizationNameLegalBusinessName"].fillna(''),
                    "NPI": results_df["NPI"],
                    "Primary Specialty Code": results_df["HealthcareProviderTaxonomyCode"].fillna(''),
                    "City": results_df["ProviderBusinessPracticeLocationAddressCityName"].fillna(''),
                    "State": results_df["ProviderBusinessPracticeLocationAddressStateName"].fillna(''),
                    "Phone": results_df["ProviderBusinessPracticeLocationAddressTelephoneNumber"].fillna('')
                })
                st.dataframe(summary_df)

                # Download link for the summary
                file_name = f"{name_keyword.replace(' ', '_')}_name_search_summary.xlsx"
                excel_link = get_excel_download_link(summary_df, filename=file_name, text="📥 Download Summary")
                st.markdown(excel_link, unsafe_allow_html=True)

                # --- NEW: Specialty Code Synopsis Section ---
                st.markdown("---")
                st.subheader("Specialty Code Synopsis")
                st.markdown("This section explains the specialty codes found in the search results above.")

                # Collect all unique taxonomy codes from the results
                taxonomy_cols = [col for col in results_df.columns if 'HealthcareProviderTaxonomyCode' in col]
                unique_codes = pd.unique(results_df[taxonomy_cols].values.ravel('K'))
                unique_codes = [code for code in unique_codes if pd.notna(code)]

                if not unique_codes:
                    st.info("No specialty codes were found in the search results.")
                else:
                    # Filter the specialties dataframe to get details for the found codes
                    synopsis_df = specialties[specialties['Code'].isin(unique_codes)].copy()
                    synopsis_df = synopsis_df[['Code', 'Display Name', 'Classification', 'Definition']].reset_index(drop=True)

                    if synopsis_df.empty:
                        st.warning("Could not find definitions for the specialty codes in the results.")
                    else:
                        st.dataframe(synopsis_df)

                # Expander for the full, raw data
                with st.expander("View Full (Raw) Data"):
                    st.dataframe(results_df)

            else:
                st.info(f"No records found containing '{name_keyword}' in the selected state files.")

# --- Explainer Section ---
with st.expander('🩺 Understand Provider Specialties 🔍'):
    st.markdown('''
    - **Code**: A unique ID that clearly identifies each specialty.
    - **Grouping**: The broad category or umbrella for a general area of expertise. 🏷️
    - **Classification**: Specifies the type of practice within a broader category. 🎯
    - **Specialization**: Details the specific focus within a classification for precise expertise.
    - **Definition**: A concise overview of the specialty's scope of practice.
    ''')
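# Running locally (a sketch, assuming the data files described above sit next to app.py):
#   pip install streamlit pandas matplotlib xlsxwriter
#   streamlit run app.py
# Expected inputs: Provider-Specialty.csv (Code, Grouping, Classification, Specialization,
# Display Name, Definition) and one headerless CSV per state, named by its two-letter code
# (e.g. MN.csv) and laid out according to HEADERS.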