Create app.py
app.py ADDED
@@ -0,0 +1,310 @@
import streamlit as st
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import base64
from io import BytesIO, StringIO

# --- Configuration & Data Loading ---

st.set_page_config(
    page_title="🩺🔍 NPI Specialty Search Engine",
    page_icon="🩺",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/awacke1',
        'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
        'About': "# 🩺🔍 Care Team Finder By Aaron Wacker - https://huggingface.co/awacke1"
    }
)

# Headers for the NPI CSV files
HEADERS = [
    "NPI", "EntityTypeCode", "ReplacementNPI", "EmployerIdentificationNumberEIN",
    "ProviderOrganizationNameLegalBusinessName", "ProviderLastNameLegalName",
    "ProviderFirstName", "ProviderMiddleName", "ProviderNamePrefixText",
    "ProviderNameSuffixText", "ProviderCredentialText", "ProviderOtherOrganizationName",
    "ProviderOtherOrganizationNameTypeCode", "ProviderOtherLastName", "ProviderOtherFirstName",
    "ProviderOtherMiddleName", "ProviderOtherNamePrefixText", "ProviderOtherNameSuffixText",
    "ProviderOtherCredentialText", "ProviderOtherLastNameTypeCode",
    "ProviderFirstLineBusinessMailingAddress", "ProviderSecondLineBusinessMailingAddress",
    "ProviderBusinessMailingAddressCityName", "ProviderBusinessMailingAddressStateName",
    "ProviderBusinessMailingAddressPostalCode", "ProviderBusinessMailingAddressCountryCodeIfoutsideUS",
    "ProviderBusinessMailingAddressTelephoneNumber", "ProviderBusinessMailingAddressFaxNumber",
    "ProviderFirstLineBusinessPracticeLocationAddress", "ProviderSecondLineBusinessPracticeLocationAddress",
    "ProviderBusinessPracticeLocationAddressCityName", "ProviderBusinessPracticeLocationAddressStateName",
    "ProviderBusinessPracticeLocationAddressPostalCode", "ProviderBusinessPracticeLocationAddressCountryCodeIfoutsideUS",
    "ProviderBusinessPracticeLocationAddressTelephoneNumber", "ProviderBusinessPracticeLocationAddressFaxNumber",
    "ProviderEnumerationDate", "LastUpdateDate", "NPIDeactivationReasonCode",
    "NPIDeactivationDate", "NPIReactivationDate", "ProviderGenderCode",
    "AuthorizedOfficialLastName", "AuthorizedOfficialFirstName", "AuthorizedOfficialMiddleName",
    "AuthorizedOfficialTitleorPosition", "AuthorizedOfficialTelephoneNumber",
    "HealthcareProviderTaxonomyCode", "ProviderLicenseNumber", "ProviderLicenseNumberStateCode",
    "HealthcareProviderPrimaryTaxonomySwitch"
]
# Add columns for other taxonomies, up to 15
for i in range(2, 16):
    HEADERS.extend([
        f"HealthcareProviderTaxonomyCode_{i}",
        f"ProviderLicenseNumber_{i}",
        f"ProviderLicenseNumberStateCode_{i}",
        f"HealthcareProviderPrimaryTaxonomySwitch_{i}"
    ])
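# Sanity check (descriptive only, not enforced anywhere below): HEADERS now holds
# 51 base names plus 14 extra taxonomy blocks of 4 names each, i.e. 107 columns.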

@st.cache_resource
def load_specialties(csv_file='Provider-Specialty.csv'):
    """Loads the provider specialty taxonomy from a CSV file."""
    try:
        # Assuming the CSV has columns: Code, Grouping, Classification, Specialization, Display Name, Definition
        return pd.read_csv(csv_file)
    except FileNotFoundError:
        st.error(f"Error: The specialty file '{csv_file}' was not found. Please make sure it's in the correct directory.")
        return pd.DataFrame()

@st.cache_resource
def find_state_files():
    """Finds all available state CSV files in the current directory."""
    return [file for file in glob.glob('./*.csv') if len(os.path.basename(file).split('.')[0]) == 2]
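# find_state_files() keeps only CSVs whose file stem is exactly two characters,
# e.g. './MN.csv' -> 'MN'; 'Provider-Specialty.csv' is skipped because its stem is longer.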

# --- Helper Functions ---
def get_excel_download_link(df, filename="data.xlsx", text="Download as Excel"):
    """Generates a link to download a pandas DataFrame as an Excel file."""
    output = BytesIO()
    try:
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False, sheet_name='Sheet1')
        excel_data = output.getvalue()
        b64 = base64.b64encode(excel_data).decode()
        href = f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">{text}</a>'
        return href
    except Exception as e:
        st.error(f"Failed to create Excel file: {e}")
        return ""
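# Note: the base64 data-URI anchor above lets the download be rendered through st.markdown;
# st.download_button is Streamlit's built-in alternative if injecting raw HTML is undesirable.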

# --- Core Search Logic ---

def search_providers_by_specialty(specialty_codes, states_to_search, specialties_df):
    """Searches for providers by specialty codes across selected states.

    Returns a list of (state, specialty-info dict, DataFrame) tuples plus a
    dictionary of provider counts per practice-location city.
    """
    results = []
    city_counts = {}
    for state in states_to_search:
        file_path = f'./{state}.csv'
        if not os.path.exists(file_path):
            st.warning(f"Data file for {state} not found. Skipping.")
            continue
        try:
            # Read every column as a string so NPIs, ZIP codes and phone numbers keep leading zeros.
            state_df = pd.read_csv(file_path, header=None, names=HEADERS, usecols=HEADERS, low_memory=False, dtype=str)
            filtered_df = state_df[state_df['HealthcareProviderTaxonomyCode'].isin(specialty_codes)]

            if not filtered_df.empty:
                for city, count in filtered_df['ProviderBusinessPracticeLocationAddressCityName'].value_counts().items():
                    city_counts[city] = city_counts.get(city, 0) + count

                for code in filtered_df['HealthcareProviderTaxonomyCode'].unique():
                    code_specific_df = filtered_df[filtered_df['HealthcareProviderTaxonomyCode'] == code].copy()
                    display_info_rows = specialties_df[specialties_df['Code'] == code]
                    if not display_info_rows.empty:
                        display_info = display_info_rows[['Code', 'Grouping', 'Classification', 'Display Name']].iloc[0].to_dict()
                        results.append((state, display_info, code_specific_df))

        except Exception as e:
            st.error(f"An error occurred while processing the file for {state}: {e}")
    return results, city_counts

def search_by_name(name_keyword, states_to_search):
    """
    Performs a text search for a keyword and returns a parsed DataFrame of matching providers.
    """
    all_matching_lines = []
    for state in states_to_search:
        file_path = f'./{state}.csv'
        if not os.path.exists(file_path):
            continue
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                matching_lines = [line for line in f if name_keyword.lower() in line.lower()]
                all_matching_lines.extend(matching_lines)
        except Exception as e:
            st.error(f"An error occurred while reading the file for {state}: {e}")

    if not all_matching_lines:
        return pd.DataFrame()

    # Join the matching CSV rows into a single string (each line already ends with a newline).
    csv_data = "".join(all_matching_lines)
    # Use dynamic headers up to the largest number of comma-separated fields in the found data.
    # (A plain comma count ignores quoting, so this is an upper bound on the real column count.)
    max_cols = max(len(line.split(',')) for line in all_matching_lines)
    dynamic_headers = HEADERS[:max_cols]

    results_df = pd.read_csv(StringIO(csv_data), header=None, names=dynamic_headers, low_memory=False, dtype=str)
    return results_df

# --- Streamlit UI ---

st.markdown("# 🩺🔍 NPI Specialty Search Engine")
st.markdown("#### Search for healthcare providers by specialty or name across multiple states.")
st.markdown("---")

specialties = load_specialties()
if specialties.empty:
    st.stop()

# Let user choose search mode
search_type = st.radio("Select Search Type:", ('Search by Specialty', 'Search by Name'), horizontal=True)

state_files = find_state_files()
state_options = sorted([os.path.basename(file).split('.')[0] for file in state_files])
default_states = ["NM", "TX", "FL", "MN", "WI"]
valid_default_states = [state for state in default_states if state in state_options]

if search_type == 'Search by Specialty':
    # --- Specialty Search UI & Logic ---
    with st.form(key='specialty_search_form'):
        col1, col2 = st.columns([4, 1])
        with col1:
            search_keyword = st.text_input('Enter a specialty keyword to search 🔍', placeholder="e.g., Cardiology, Pediatrics")
        with col2:
            st.write("")  # Spacer
            submit_search = st.form_submit_button(label='Go 🔍')
        selected_states = st.multiselect('Select States to Search 🗺️', options=state_options, default=valid_default_states)

    if submit_search:
        if not search_keyword: st.warning("Please enter a specialty keyword.")
        elif not selected_states: st.warning("Please select at least one state.")
        else:
            mask = specialties.apply(lambda row: row.astype(str).str.contains(search_keyword, case=False).any(), axis=1)
            final_specialty_df = specialties[mask]

            if final_specialty_df.empty:
                st.error(f"No specialties found for '{search_keyword}'.")
            else:
                st.markdown(f"##### Found {len(final_specialty_df)} specialty matches for '{search_keyword}'")
                st.dataframe(final_specialty_df[['Code', 'Display Name', 'Classification']])
                specialty_codes = final_specialty_df['Code'].tolist()

                with st.spinner(f"Searching for providers in {', '.join(selected_states)}..."):
                    state_data, city_counts = search_providers_by_specialty(specialty_codes, selected_states, specialties)

                if state_data:
                    st.success(f"Found providers in {len(state_data)} matching category/state combination(s).")
                    st.markdown("---")

                    for state, info, df in state_data:
                        expander_title = f"**{state}** | {info['Classification']} ({info['Code']}) - {len(df)} Providers Found"
                        with st.expander(expander_title):
                            # Create summary df with robust string concatenation
                            summary_df = pd.DataFrame({
                                "Provider Name": (df["ProviderFirstName"].fillna('') + " " + df["ProviderLastNameLegalName"].fillna('')).str.strip(),
                                "Organization Name": df["ProviderOrganizationNameLegalBusinessName"].fillna(''),
                                "NPI": df["NPI"],
                                "License Number": (df["ProviderLicenseNumber"].fillna('') + " (" + df["ProviderLicenseNumberStateCode"].fillna('') + ")").str.strip(),
                                "Address": (df["ProviderFirstLineBusinessPracticeLocationAddress"].fillna('') + ", " + df["ProviderBusinessPracticeLocationAddressCityName"].fillna('') + ", " + df["ProviderBusinessPracticeLocationAddressStateName"].fillna('') + " " + df["ProviderBusinessPracticeLocationAddressPostalCode"].fillna('')).str.strip(', '),
                                "Phone": df["ProviderBusinessPracticeLocationAddressTelephoneNumber"].fillna('')
                            })

                            tab1, tab2 = st.tabs(["Physician Summary", "Full Details"])
                            with tab1:
                                st.dataframe(summary_df)
                            with tab2:
                                st.dataframe(df)

                            # Pass the summary_df to the download link function
                            file_name = f"{state}_{info['Display Name'].replace(' ', '_').replace('/', '_')}-{info['Code']}_summary.xlsx"
                            excel_link = get_excel_download_link(summary_df, filename=file_name, text=f"📥 Download Summary for {state}")
                            st.markdown(excel_link, unsafe_allow_html=True)

                    if city_counts:
                        st.markdown("---")
                        st.subheader("Provider Counts by City (Across All Selected States)")
                        top_cities = dict(sorted(city_counts.items(), key=lambda item: item[1], reverse=True)[:25])
                        fig, ax = plt.subplots(figsize=(12, 8))
                        ax.bar(top_cities.keys(), top_cities.values(), color='skyblue')
                        ax.set_title('Top 25 Cities by Provider Count', fontsize=16)
                        plt.xticks(rotation=45, ha='right')
                        st.pyplot(fig)
                else:
                    st.info(f"No matching provider records found for '{search_keyword}'.")

elif search_type == 'Search by Name':
    # --- Name Search UI & Logic (REVISED) ---
    with st.form(key='name_search_form'):
        col1, col2 = st.columns([4, 1])
        with col1:
            name_keyword = st.text_input('Enter a provider or organization name to search 🔍', placeholder="e.g., Carlucci, Mayo")
        with col2:
            st.write("")  # Spacer
            submit_name_search = st.form_submit_button(label='Go 🔍')
        selected_states_name = st.multiselect('Select States to Search 🗺️', options=state_options, default=valid_default_states, key="name_search_states")

    if submit_name_search:
        if not name_keyword: st.warning("Please enter a name to search.")
        elif not selected_states_name: st.warning("Please select at least one state.")
        else:
            with st.spinner(f"Searching for '{name_keyword}' in {', '.join(selected_states_name)}..."):
                results_df = search_by_name(name_keyword, selected_states_name)

            if not results_df.empty:
                st.success(f"Found {len(results_df)} records containing '{name_keyword}'.")
                st.markdown("---")

                # Create and display a summary of the found records
                summary_df = pd.DataFrame({
                    "Provider Name": (results_df["ProviderFirstName"].fillna('') + " " + results_df["ProviderLastNameLegalName"].fillna('')).str.strip(),
                    "Organization Name": results_df["ProviderOrganizationNameLegalBusinessName"].fillna(''),
                    "NPI": results_df["NPI"],
                    "Primary Specialty Code": results_df["HealthcareProviderTaxonomyCode"].fillna(''),
                    "City": results_df["ProviderBusinessPracticeLocationAddressCityName"].fillna(''),
                    "State": results_df["ProviderBusinessPracticeLocationAddressStateName"].fillna(''),
                    "Phone": results_df["ProviderBusinessPracticeLocationAddressTelephoneNumber"].fillna('')
                })
                st.dataframe(summary_df)

                # Download link for the summary
                file_name = f"{name_keyword.replace(' ', '_')}_name_search_summary.xlsx"
                excel_link = get_excel_download_link(summary_df, filename=file_name, text="📥 Download Summary")
                st.markdown(excel_link, unsafe_allow_html=True)

                # --- NEW: Specialty Code Synopsis Section ---
                st.markdown("---")
                st.subheader("Specialty Code Synopsis")
                st.markdown("This section explains the specialty codes found in the search results above.")

                # Collect all unique taxonomy codes from the results
                taxonomy_cols = [col for col in results_df.columns if 'HealthcareProviderTaxonomyCode' in col]
                unique_codes = pd.unique(results_df[taxonomy_cols].values.ravel('K'))
                unique_codes = [code for code in unique_codes if pd.notna(code)]

                if not unique_codes:
                    st.info("No specialty codes were found in the search results.")
                else:
                    # Filter the specialties dataframe to get details for the found codes
                    synopsis_df = specialties[specialties['Code'].isin(unique_codes)].copy()
                    synopsis_df = synopsis_df[['Code', 'Display Name', 'Classification', 'Definition']].reset_index(drop=True)

                    if synopsis_df.empty:
                        st.warning("Could not find definitions for the specialty codes in the results.")
                    else:
                        st.dataframe(synopsis_df)

                # Expander for the full, raw data
                with st.expander("View Full (Raw) Data"):
                    st.dataframe(results_df)

            else:
                st.info(f"No records found containing '{name_keyword}' in the selected state files.")

# --- Explainer Section ---
with st.expander('🩺 Understand Provider Specialties 🔍'):
    st.markdown('''
    - **Code**: A unique ID that clearly identifies each specialty.
    - **Grouping**: The broad category or umbrella for a general area of expertise. 🏷️
    - **Classification**: Specifies the type of practice within a broader category. 🎯
    - **Specialization**: Details the specific focus within a classification for precise expertise.
    - **Definition**: A concise overview of the specialty's scope of practice.
    ''')
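# Running locally (a sketch, assuming the data files described above sit next to app.py):
#   pip install streamlit pandas matplotlib xlsxwriter
#   streamlit run app.py
# Expected inputs: Provider-Specialty.csv (Code, Grouping, Classification, Specialization,
# Display Name, Definition) and one headerless CSV per state, named by its two-letter code
# (e.g. MN.csv) and laid out according to HEADERS.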