bradmontierth committed
Commit 2f706be · 1 Parent(s): 8cd7f61

fixing cut off training script

Train Tuva Concurrent Inpatient Models.ipynb CHANGED
@@ -1,31 +1,709 @@
1
  {
2
  "metadata": {
3
  "kernelspec": {
4
  "display_name": "Streamlit Notebook",
5
  "name": "streamlit"
6
  },
7
  "lastEditStatus": {
8
- "notebookId": "6rovstl42ft2p5id6gwo",
9
  "authorId": "374530764978",
10
  "authorName": "BRAD",
11
- "authorEmail": "[email protected]",
12
- "sessionId": "65561efa-4d18-4072-8f4d-10240cb902ba",
13
- "lastEditTime": 1750870004305
14
  }
15
  },
16
- "nbformat_minor": 5,
17
  "nbformat": 4,
18
- "cells": [
19
- {
20
- "cell_type": "code",
21
- "id": "3775908f-ca36-4846-8f38-5adca39217f2",
22
- "metadata": {
23
- "language": "python",
24
- "name": "cell1"
25
- },
26
- "source": "0]}\")\n \n base_readmit_model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000, solver='liblinear')\n \n # Determine the feature set to use.\n if FAST_MODE:\n print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n best_readmission_features = X_train_base.columns.tolist()\n else:\n best_readmission_features = find_best_feature_subset(\n model=base_readmit_model, X_train=X_train_base, y_train=y_train_base, X_val=X_calib_read, y_val=y_calib_read,\n scoring_func=roc_auc_score, higher_is_better=True, model_name=\"Readmission (Logistic Regression)\"\n )\n\n print(f\"\\nTraining final Readmission model pipeline using {len(best_readmission_features)} features...\")\n base_model_for_calib = clone(base_readmit_model)\n base_model_for_calib.fit(X_train_base[best_readmission_features], y_train_base)\n \n # Log feature importances from the base (uncalibrated) model.\n uncal_read_model_name = f\"Inpatient_Readmission_Base_Uncalibrated_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n log_feature_importances_to_snowflake(session, base_model_for_calib, best_readmission_features, MODEL_RUN_ID, uncal_read_model_name, TARGET_READMISSION, FEATURE_IMPORTANCE_TABLE_NAME)\n \n # Evaluate and log metrics for the uncalibrated model for comparison.\n y_pred_proba_uncal = base_model_for_calib.predict_proba(X_test_read[best_readmission_features])[:, 1]\n uncalibrated_metrics = calculate_binary_classification_proba_metrics(y_test_read, y_pred_proba_uncal)\n log_model_metrics_to_snowflake(session, MODEL_RUN_ID, uncal_read_model_name, TARGET_READMISSION + \"_Probability\", uncalibrated_metrics, \"Binary_Uncalibrated\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n \n # Calibrate the model on the held-out calibration set.\n calibrated_readmission_model = CalibratedClassifierCV(base_model_for_calib, method='isotonic', cv='prefit')\n calibrated_readmission_model.fit(X_calib_read[best_readmission_features], y_calib_read)\n y_pred_proba_cal = calibrated_readmission_model.predict_proba(X_test_read[best_readmission_features])[:, 1]\n\n print(\"\\nCalibrated Readmission Model - Test Set Evaluation:\")\n calibrated_proba_metrics = calculate_binary_classification_proba_metrics(y_test_read, y_pred_proba_cal)\n for k, v in calibrated_proba_metrics.items(): print(f\" {k}: {v:.4f}\")\n \n cal_read_model_name = f\"Inpatient_Readmission_Calibrated_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n log_model_metrics_to_snowflake(session, MODEL_RUN_ID, cal_read_model_name, TARGET_READMISSION + \"_Probability\", calibrated_proba_metrics, \"Binary_Calibrated\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n\n# --- 4.3 Model 3: Predicting Discharge Location (Multiclass Classification) ---\nprint(\"\\n\" + \"=\"*80)\nprint(\"--- Training Model 3: Calibrated Discharge Location ---\")\nprint(\"=\"*80)\nTARGET_DISCHARGE = 'discharge_location'\ncalibrated_discharge_model, le_discharge, best_discharge_features = None, None, None\n\nif TARGET_DISCHARGE not in df_pd.columns:\n print(f\"Error: Target column '{TARGET_DISCHARGE}' not found. Skipping Discharge Location model.\")\nelse:\n le_discharge = LabelEncoder()\n y_discharge_encoded = le_discharge.fit_transform(df_pd[TARGET_DISCHARGE])\n num_classes_discharge = len(le_discharge.classes_)\n print(f\"Discharge Location: {num_classes_discharge} classes found: {le_discharge.classes_}\")\n \n # Split data: 60% base train, 20% calibration, 20% test\n stratify_discharge = y_discharge_encoded if num_classes_discharge > 1 else None\n X_train_full_disc, X_test_disc, y_train_full_disc_enc, y_test_disc_enc = train_test_split(X, y_discharge_encoded, test_size=0.2, random_state=42, stratify=stratify_discharge)\n X_train_base_disc, X_calib_disc, y_train_base_disc_enc, y_calib_disc_enc = train_test_split(X_train_full_disc, y_train_full_disc_enc, test_size=0.25, random_state=42, stratify=y_train_full_disc_enc if num_classes_discharge > 1 else None)\n print(f\"Data split for discharge: Base train: {X_train_base_disc.shape[0]}, Calibration: {X_calib_disc.shape[0]}, Test: {X_test_disc.shape[0]}\")\n \n base_discharge_model = LogisticRegression(random_state=42, max_iter=1000, solver='lbfgs', multi_class='multinomial', class_weight='balanced')\n \n # Determine the feature set to use.\n if FAST_MODE:\n print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n best_discharge_features = X_train_base_disc.columns.tolist()\n else:\n best_discharge_features = find_best_feature_subset(\n model=base_discharge_model, X_train=X_train_base_disc, y_train=y_train_base_disc_enc, X_val=X_calib_disc, y_val=y_calib_disc_enc,\n scoring_func=log_loss, higher_is_better=False, model_name=\"Discharge Location (Multinomial Regression)\"\n )\n\n print(f\"\\nTraining final Discharge Location model pipeline using {len(best_discharge_features)} features...\")\n base_model_for_calib_disc = clone(base_discharge_model)\n base_model_for_calib_disc.fit(X_train_base_disc[best_discharge_features], y_train_base_disc_enc)\n \n discharge_model_name = f\"Inpatient_Discharge_Cal_Overall_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n log_feature_importances_to_snowflake(session, base_model_for_calib_disc, best_discharge_features, MODEL_RUN_ID, discharge_model_name, TARGET_DISCHARGE, FEATURE_IMPORTANCE_TABLE_NAME)\n \n # Calibrate the model. 'sigmoid' is used for one-vs-rest calibration, suitable for multiclass.\n calibrated_discharge_model = CalibratedClassifierCV(base_model_for_calib_disc, method='sigmoid', cv='prefit')\n calibrated_discharge_model.fit(X_calib_disc[best_discharge_features], y_calib_disc_enc)\n y_pred_proba_discharge_calibrated = calibrated_discharge_model.predict_proba(X_test_disc[best_discharge_features])\n y_pred_labels_discharge_calibrated = calibrated_discharge_model.predict(X_test_disc[best_discharge_features])\n \n print(\"\\nCalibrated Discharge Model - Test Set Evaluation:\")\n calibrated_disc_metrics = calculate_multiclass_classification_metrics(y_test_disc_enc, y_pred_labels_discharge_calibrated, y_pred_proba_discharge_calibrated, le_discharge.classes_)\n \n # Log the overall multiclass metrics.\n overall_cal_metrics_to_log = {k: v for k, v in calibrated_disc_metrics.items() if k != 'per_class_details'}\n overall_cal_metrics_to_log['BRIER_SCORE'] = calibrated_disc_metrics.get('BRIER_SCORE_MACRO_AVG')\n log_model_metrics_to_snowflake(session, MODEL_RUN_ID, discharge_model_name, TARGET_DISCHARGE, overall_cal_metrics_to_log, \"Multiclass_Cal_Overall\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n \n # --- FIX: Log the per-class metrics by mapping keys correctly ---\n discharge_class_model_name = f\"Inpatient_Discharge_Cal_Class_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n for class_detail in calibrated_disc_metrics.get('per_class_details', []):\n # Create a new dict with keys the logging function expects.\n per_class_metrics_to_log = {\n 'BRIER_SCORE': class_detail.get('brier_score'),\n 'AVG_Y_PRED': class_detail.get('avg_pred_proba'),\n 'AVG_Y_TRUE': class_detail.get('true_proportion'),\n 'PRED_RATIO': class_detail.get('proba_ratio'),\n }\n log_model_metrics_to_snowflake(\n session, MODEL_RUN_ID, discharge_class_model_name,\n f\"{TARGET_DISCHARGE}_Class_{class_detail['class_name']}\",\n per_class_metrics_to_log, # Use the correctly mapped dictionary\n \"Multiclass_Cal_ClassDetail\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG\n )\n\n print(\"\\nCalibrated Classification Report:\\n\", classification_report(y_test_disc_enc, y_pred_labels_discharge_calibrated, target_names=le_discharge.classes_.astype(str), zero_division=0, digits=4))\n\n\n# =============================================================================\n# 5. MODEL SAVING\n# =============================================================================\nprint(\"\\n\" + \"=\"*80)\nprint(\"--- Saving Models and Artifacts ---\")\nprint(\"=\"*80)\n\n# Bundle all necessary objects for deployment into a single dictionary.\ninpatient_models_bundle = {\n 'los_model': los_model,\n 'readmission_model': calibrated_readmission_model,\n 'discharge_model': calibrated_discharge_model,\n 'feature_columns_los': best_los_features,\n 'feature_columns_readmission': best_readmission_features,\n 'feature_columns_discharge': best_discharge_features,\n 'le_discharge': le_discharge,\n 'model_run_id': MODEL_RUN_ID,\n 'fast_mode': FAST_MODE,\n 'excluded_feature_prefixes': EXCLUDE_FEATURE_PREFIXES\n}\n\n# Create a descriptive file name for the bundle.\nBUNDLE_SUFFIX = \"fast\" if FAST_MODE else \"fs\"\nEXCLUSION_FILE_TAG = f\"_excl_{'-'.join([p.strip('_').lower() for p in EXCLUDE_FEATURE_PREFIXES])}\" if EXCLUDE_FEATURE_PREFIXES else \"\"\nBUNDLE_FILE_NAME = f'inpatient_models_bundle_{MODEL_SOURCE_TAG}_{MODEL_YEAR_TAG}_{BUNDLE_SUFFIX}{EXCLUSION_FILE_TAG}.pkl'\n\n# Save the bundle locally using pickle.\nwith open(BUNDLE_FILE_NAME, 'wb') as f:\n pickle.dump(inpatient_models_bundle, f)\nprint(f\"Models bundled and saved locally to: {BUNDLE_FILE_NAME}\")\n\n# Upload the local bundle file to the specified Snowflake stage.\nput_result = session.file.put(BUNDLE_FILE_NAME, SNOWFLAKE_STAGE_NAME, overwrite=True)\nif put_result[0].status == 'UPLOADED':\n print(f\"Model bundle successfully uploaded to Snowflake stage: {SNOWFLAKE_STAGE_NAME}\")\nelse:\n print(f\"Error uploading model bundle. Status: {put_result[0].status}, Message: {put_result[0].message}\")\n\nfile_size_mb = os.path.getsize(BUNDLE_FILE_NAME) / (1024 * 1024)\nprint(f\"Saved local bundle file size: {file_size_mb:.2f} MB\")\n\nprint(f\"\\n✅ Script finished ({'FAST MODE' if FAST_MODE else 'FULL MODE'}).\")",
27
- "execution_count": null,
28
- "outputs": []
29
- }
30
- ]
31
- }
 
1
  {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "3775908f-ca36-4846-8f38-5adca39217f2",
7
+ "metadata": {
8
+ "language": "python",
9
+ "name": "cell1"
10
+ },
11
+ "outputs": [],
12
+ "source": [
13
+ "\"\"\"\n",
14
+ "End-to-End Inpatient Model Training and Evaluation Script\n",
15
+ "\n",
16
+ "This script performs the following operations for an inpatient dataset:\n",
17
+ "1. Loads data from a Snowflake table.\n",
18
+ "2. Performs data preprocessing, including one-hot encoding of categorical\n",
19
+ " features and standardization of column names.\n",
20
+ "3. Allows for the exclusion of specified feature groups (e.g., 'hcc_').\n",
21
+ "4. Provides a \"FAST_MODE\" to skip computationally intensive feature selection\n",
22
+ " for rapid testing.\n",
23
+ "5. Trains, calibrates, and evaluates three distinct models:\n",
24
+ " a. Length of Stay (Regression with XGBoost).\n",
25
+ " b. Readmission (Binary Classification with Calibrated Logistic Regression).\n",
26
+ " c. Discharge Location (Multiclass Classification with Calibrated Logistic\n",
27
+ " Regression).\n",
28
+ "6. Logs model performance metrics, feature importances, and feature frequency\n",
29
+ " statistics to separate Snowflake tables.\n",
30
+ "7. Saves the trained models, feature lists, and encoders into a single\n",
31
+ " pickle bundle file, then uploads it to a Snowflake stage.\n",
32
+ "\"\"\"\n",
33
+ "\n",
34
+ "import os\n",
35
+ "import pickle\n",
36
+ "import uuid\n",
37
+ "from datetime import datetime\n",
38
+ "\n",
39
+ "import matplotlib.pyplot as plt\n",
40
+ "import numpy as np\n",
41
+ "import pandas as pd\n",
42
+ "import xgboost as xgb\n",
43
+ "from sklearn.base import clone\n",
44
+ "from sklearn.calibration import CalibratedClassifierCV\n",
45
+ "from sklearn.linear_model import LogisticRegression\n",
46
+ "from sklearn.metrics import (\n",
47
+ " accuracy_score,\n",
48
+ " average_precision_score,\n",
49
+ " brier_score_loss,\n",
50
+ " classification_report,\n",
51
+ " log_loss,\n",
52
+ " mean_absolute_error,\n",
53
+ " mean_squared_error,\n",
54
+ " r2_score,\n",
55
+ " roc_auc_score,\n",
56
+ ")\n",
57
+ "from sklearn.model_selection import train_test_split\n",
58
+ "from sklearn.preprocessing import LabelEncoder\n",
59
+ "from snowflake.snowpark.context import get_active_session\n",
60
+ "\n",
61
+ "# =============================================================================\n",
62
+ "# 0. CONFIGURATION\n",
63
+ "# =============================================================================\n",
64
+ "# --- Snowflake Environment Settings ---\n",
65
+ "SNOWFLAKE_DATABASE = \"medicare_lds_five_multi_year\"\n",
66
+ "SNOWFLAKE_SCHEMA = \"BENCHMARKS\"\n",
67
+ "\n",
68
+ "# --- Input and Output Table/Stage Names ---\n",
69
+ "INPUT_TABLE = f\"{SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.BENCHMARKS_INPATIENT_INPUT\"\n",
70
+ "METRICS_TABLE_NAME = f\"{SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.MODEL_EVAL_METRICS_INPATIENT\"\n",
71
+ "FEATURE_FREQ_TABLE_NAME = f\"{SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.FEATURE_FREQUENCY_STATS_INPATIENT\"\n",
72
+ "FEATURE_IMPORTANCE_TABLE_NAME = f\"{SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.MODEL_FEATURE_IMPORTANCE_INPATIENT\"\n",
73
+ "SNOWFLAKE_STAGE_NAME = f\"@{SNOWFLAKE_SCHEMA}.BENCHMARK_STAGE\"\n",
74
+ "\n",
75
+ "# --- Model Run Metadata ---\n",
76
+ "# A unique ID for this entire script run.\n",
77
+ "MODEL_RUN_ID = str(uuid.uuid4())\n",
78
+ "# Tags to identify the source and year of the data used for training.\n",
79
+ "MODEL_SOURCE_TAG = \"medicare_lds\"\n",
80
+ "MODEL_YEAR_TAG = \"2023\"\n",
81
+ "\n",
82
+ "# --- Feature Exclusion Switch ---\n",
83
+ "# Define a list of feature prefixes to exclude from the model.\n",
84
+ "# For example, to exclude all HCC features, use [\"hcc_\"]. Set to [] for no exclusions.\n",
85
+ "EXCLUDE_FEATURE_PREFIXES = [\"hcc_\"]\n",
86
+ "\n",
87
+ "# --- Development Mode Switch ---\n",
88
+ "# If True, the script skips the computationally expensive feature selection step.\n",
89
+ "# This is useful for quick runs to test the script's functionality.\n",
90
+ "# Set to False for a full production run to find the optimal feature set.\n",
91
+ "FAST_MODE = False\n",
92
+ "\n",
93
+ "# =============================================================================\n",
94
+ "# 1. SETUP: SNOWFLAKE SESSION & SCRIPT INITIALIZATION\n",
95
+ "# =============================================================================\n",
96
+ "session = get_active_session()\n",
97
+ "print(f\"Active session created. Model Run ID: {MODEL_RUN_ID}\")\n",
98
+ "\n",
99
+ "if FAST_MODE:\n",
100
+ " print(\"\\n\" + \"=\" * 50)\n",
101
+ " print(\"🚀 FAST MODE IS ENABLED 🚀\")\n",
102
+ " print(\"Feature selection will be skipped for all models.\")\n",
103
+ " print(\"=\" * 50 + \"\\n\")\n",
104
+ "\n",
105
+ "\n",
106
+ "# =============================================================================\n",
107
+ "# 2. DATA LOADING & PREPARATION\n",
108
+ "# =============================================================================\n",
109
+ "first_month = f\"{MODEL_YEAR_TAG}01\"\n",
110
+ "\n",
111
+ "print(\"Loading and preparing data from Snowflake...\")\n",
112
+ "query = f\"\"\"\n",
113
+ "SELECT *\n",
114
+ "FROM {INPUT_TABLE}\n",
115
+ "WHERE YEAR_NBR = {MODEL_YEAR_TAG}\n",
116
+ "AND FIRST_MONTH = {first_month}\n",
117
+ "\"\"\"\n",
118
+ "df_pd = session.sql(query).to_pandas()\n",
119
+ "\n",
120
+ "# Standardize all column names to lowercase for consistency.\n",
121
+ "df_pd.columns = df_pd.columns.str.lower()\n",
122
+ "print(\"Standardized DataFrame column names to lowercase.\")\n",
123
+ "\n",
124
+ "# One-hot encode specified categorical variables.\n",
125
+ "categorical_cols = ['state', 'race', 'sex', 'ms_drg_code', 'ccsr_cat']\n",
126
+ "df_pd_encoded = pd.get_dummies(df_pd, columns=[col for col in categorical_cols if col in df_pd.columns])\n",
127
+ "\n",
128
+ "# Define all potential feature groups.\n",
129
+ "condition_columns = [col for col in df_pd_encoded.columns if col.startswith(('cond_', 'cms_', 'hcc_'))]\n",
130
+ "other_columns = ['age_at_admit']\n",
131
+ "dummy_prefixes = tuple(f'{col}_' for col in categorical_cols)\n",
132
+ "dummy_columns = [col for col in df_pd_encoded.columns if col.startswith(dummy_prefixes)]\n",
133
+ "\n",
134
+ "# Ensure 'age_at_admit' exists before including it.\n",
135
+ "if 'age_at_admit' not in df_pd_encoded.columns and 'age_at_admit' in other_columns:\n",
136
+ " print(\"Warning: 'age_at_admit' not found in features. Removing it.\")\n",
137
+ " other_columns.remove('age_at_admit')\n",
138
+ "\n",
139
+ "# Combine all potential features into a single master list.\n",
140
+ "all_possible_features = other_columns + condition_columns + dummy_columns\n",
141
+ "\n",
142
+ "# Filter out features based on the exclusion configuration.\n",
143
+ "print(f\"Excluding feature prefixes: {EXCLUDE_FEATURE_PREFIXES}\")\n",
144
+ "if EXCLUDE_FEATURE_PREFIXES:\n",
145
+ " initial_feature_count = len(all_possible_features)\n",
146
+ " # A feature is kept if it does NOT start with any of the excluded prefixes.\n",
147
+ " features_to_keep = [\n",
148
+ " f for f in all_possible_features\n",
149
+ " if not any(f.startswith(prefix) for prefix in EXCLUDE_FEATURE_PREFIXES)\n",
150
+ " ]\n",
151
+ " print(f\"Filtered features: Kept {len(features_to_keep)} out of {initial_feature_count} potential features.\")\n",
152
+ "else:\n",
153
+ " features_to_keep = all_possible_features\n",
154
+ " print(\"No feature prefixes specified for exclusion. Using all defined features.\")\n",
155
+ "\n",
156
+ "# The final list of features to be used for training.\n",
157
+ "X_columns = [col for col in features_to_keep if col in df_pd_encoded.columns]\n",
158
+ "X = df_pd_encoded[X_columns]\n",
159
+ "\n",
160
+ "print(f\"Data loaded. Shape of final feature matrix X: {X.shape}\")\n",
161
+ "print(f\"Number of features after exclusion: {len(X_columns)}\")\n",
162
+ "\n",
163
+ "\n",
164
+ "def create_feature_frequency_table_if_not_exists(session, table_name):\n",
165
+ " \"\"\"Ensures the feature frequency statistics table exists in Snowflake.\"\"\"\n",
166
+ " session.sql(f\"\"\"\n",
167
+ " CREATE TABLE IF NOT EXISTS {table_name} (\n",
168
+ " MODEL_RUN_ID STRING,\n",
169
+ " FEATURE_NAME STRING,\n",
170
+ " POSITIVE_COUNT NUMBER,\n",
171
+ " TOTAL_ROWS NUMBER,\n",
172
+ " POSITIVE_RATE_PERCENT FLOAT,\n",
173
+ " EVAL_TS TIMESTAMP_NTZ\n",
174
+ " );\n",
175
+ " \"\"\").collect()\n",
176
+ " print(f\"Ensured feature frequency table {table_name} exists.\")\n",
177
+ "\n",
178
+ "# --- Analyze and log feature sparsity ---\n",
179
+ "create_feature_frequency_table_if_not_exists(session, FEATURE_FREQ_TABLE_NAME)\n",
180
+ "print(\"\\n--- Analysis: Positive Feature Rates (Sparsity Check on Training Data) ---\")\n",
181
+ "total_rows = len(X)\n",
182
+ "positive_counts = (X > 0).sum()\n",
183
+ "positive_rates = (positive_counts / total_rows) * 100\n",
184
+ "positive_rate_summary = pd.DataFrame({\n",
185
+ " 'feature': X.columns,\n",
186
+ " 'positive_count': positive_counts,\n",
187
+ " 'total_rows': total_rows,\n",
188
+ " 'positive_rate_percent': positive_rates\n",
189
+ "}).sort_values(by='positive_rate_percent', ascending=False).reset_index(drop=True)\n",
190
+ "\n",
191
+ "print(\"Positive (non-zero) rates for all features in the final training input (X), sorted descending:\")\n",
192
+ "with pd.option_context('display.max_rows', 20, 'display.max_columns', None, 'display.width', 120):\n",
193
+ " print(positive_rate_summary)\n",
194
+ "\n",
195
+ "print(f\"\\nSaving feature frequency statistics to {FEATURE_FREQ_TABLE_NAME}...\")\n",
196
+ "df_to_save = positive_rate_summary.copy()\n",
197
+ "df_to_save['MODEL_RUN_ID'] = MODEL_RUN_ID\n",
198
+ "df_to_save['EVAL_TS'] = datetime.utcnow()\n",
199
+ "df_to_save.rename(columns={\n",
200
+ " 'feature': 'FEATURE_NAME', 'positive_count': 'POSITIVE_COUNT',\n",
201
+ " 'total_rows': 'TOTAL_ROWS', 'positive_rate_percent': 'POSITIVE_RATE_PERCENT'\n",
202
+ "}, inplace=True)\n",
203
+ "final_column_order = ['MODEL_RUN_ID', 'FEATURE_NAME', 'POSITIVE_COUNT', 'TOTAL_ROWS', 'POSITIVE_RATE_PERCENT', 'EVAL_TS']\n",
204
+ "df_to_save = df_to_save[final_column_order]\n",
205
+ "session.create_dataframe(df_to_save).write.mode(\"append\").save_as_table(FEATURE_FREQ_TABLE_NAME)\n",
206
+ "print(\"Successfully saved feature frequency statistics to Snowflake.\")\n",
207
+ "\n",
208
+ "# =============================================================================\n",
209
+ "# 3. UTILITY FUNCTIONS: METRICS, LOGGING, AND FEATURE SELECTION\n",
210
+ "# =============================================================================\n",
211
+ "\n",
212
+ "def calculate_regression_metrics(y_true, y_pred):\n",
213
+ " \"\"\"Calculates a set of standard regression metrics.\"\"\"\n",
214
+ " y_true_np, y_pred_np = np.array(y_true), np.array(y_pred)\n",
215
+ " sum_y_true, mean_y_true = np.sum(y_true_np), np.mean(y_true_np)\n",
216
+ " pred_ratio = np.sum(y_pred_np) / sum_y_true if sum_y_true != 0 else np.nan\n",
217
+ " mae_percent = (mean_absolute_error(y_true_np, y_pred_np) / mean_y_true) * 100 if mean_y_true != 0 else np.nan\n",
218
+ " return {\n",
219
+ " 'R2': r2_score(y_true_np, y_pred_np), 'MAE': mean_absolute_error(y_true_np, y_pred_np),\n",
220
+ " 'MSE': mean_squared_error(y_true_np, y_pred_np), 'PRED_RATIO': pred_ratio, 'MAE_PERCENT': mae_percent,\n",
221
+ " 'AVG_Y_PRED': np.mean(y_pred_np), 'AVG_Y_TRUE': mean_y_true\n",
222
+ " }\n",
223
+ "\n",
224
+ "def calculate_binary_classification_proba_metrics(y_true, y_pred_proba):\n",
225
+ " \"\"\"Calculates a set of standard binary classification metrics from probabilities.\"\"\"\n",
226
+ " y_true_np, y_pred_proba_np = np.array(y_true), np.array(y_pred_proba)\n",
227
+ " is_multiclass = len(np.unique(y_true_np)) > 1\n",
228
+ " auc_roc = roc_auc_score(y_true_np, y_pred_proba_np) if is_multiclass else np.nan\n",
229
+ " auc_pr = average_precision_score(y_true_np, y_pred_proba_np) if is_multiclass else np.nan\n",
230
+ " return {\n",
231
+ " 'AUC_ROC': auc_roc, 'AUC_PR': auc_pr, 'LOG_LOSS': log_loss(y_true_np, y_pred_proba_np),\n",
232
+ " 'BRIER_SCORE': brier_score_loss(y_true_np, y_pred_proba_np),\n",
233
+ " 'AVG_Y_PRED_PROBA': np.mean(y_pred_proba_np), 'AVG_Y_TRUE': np.mean(y_true_np)\n",
234
+ " }\n",
235
+ "\n",
236
+ "def calculate_multiclass_classification_metrics(y_true_encoded, y_pred_labels, y_pred_proba, le_classes):\n",
237
+ " \"\"\"Calculates overall and per-class metrics for multiclass classification.\"\"\"\n",
238
+ " num_samples, num_classes = len(y_true_encoded), len(le_classes)\n",
239
+ " metrics = {\n",
240
+ " 'ACCURACY': accuracy_score(y_true_encoded, y_pred_labels),\n",
241
+ " 'LOG_LOSS': log_loss(y_true_encoded, y_pred_proba, labels=np.arange(num_classes))\n",
242
+ " }\n",
243
+ " per_class_details, all_brier_scores = [], []\n",
244
+ " if num_samples > 0 and num_classes > 0:\n",
245
+ " for i in range(num_classes):\n",
246
+ " class_name = le_classes[i]\n",
247
+ " true_class_binary = (y_true_encoded == i).astype(int)\n",
248
+ " pred_proba_for_class = y_pred_proba[:, i]\n",
249
+ " avg_pred_proba_class = np.mean(pred_proba_for_class)\n",
250
+ " true_proportion_class = np.mean(true_class_binary)\n",
251
+ " proba_ratio_class = avg_pred_proba_class / true_proportion_class if true_proportion_class > 0 else np.nan\n",
252
+ " brier_score_class = brier_score_loss(true_class_binary, pred_proba_for_class) if len(np.unique(true_class_binary)) > 1 else np.nan\n",
253
+ " all_brier_scores.append(brier_score_class)\n",
254
+ " per_class_details.append({\n",
255
+ " \"class_name\": class_name,\n",
256
+ " \"avg_pred_proba\": avg_pred_proba_class,\n",
257
+ " \"true_proportion\": true_proportion_class,\n",
258
+ " \"proba_ratio\": proba_ratio_class,\n",
259
+ " \"brier_score\": brier_score_class\n",
260
+ " })\n",
261
+ " metrics['per_class_details'] = per_class_details\n",
262
+ " valid_brier_scores = [s for s in all_brier_scores if not np.isnan(s)]\n",
263
+ " metrics['BRIER_SCORE_MACRO_AVG'] = np.mean(valid_brier_scores) if valid_brier_scores else np.nan\n",
264
+ " return metrics\n",
265
+ "\n",
266
+ "def create_metrics_table_if_not_exists(session, table_name):\n",
267
+ " \"\"\"Ensures the main model metrics table exists in Snowflake.\"\"\"\n",
268
+ " session.sql(f\"\"\"\n",
269
+ " CREATE TABLE IF NOT EXISTS {table_name} (\n",
270
+ " MODEL_RUN_ID STRING, MODEL_NAME STRING, TARGET_NAME STRING, R2 FLOAT, MAE FLOAT, MSE FLOAT,\n",
271
+ " PRED_RATIO FLOAT, MAE_PERCENT FLOAT, AUC_ROC FLOAT, AUC_PR FLOAT, LOG_LOSS FLOAT,\n",
272
+ " BRIER_SCORE FLOAT, ACCURACY FLOAT, AVG_Y_PRED FLOAT, AVG_Y_TRUE FLOAT, MODEL_SOURCE STRING,\n",
273
+ " MODEL_TYPE STRING, MODEL_YEAR STRING, EVAL_TS TIMESTAMP_NTZ\n",
274
+ " );\n",
275
+ " \"\"\").collect()\n",
276
+ " print(f\"Ensured metrics table {table_name} exists.\")\n",
277
+ "\n",
278
+ "def create_feature_importance_table_if_not_exists(session, table_name):\n",
279
+ " \"\"\"Ensures the feature importance table exists in Snowflake.\"\"\"\n",
280
+ " session.sql(f\"\"\"\n",
281
+ " CREATE TABLE IF NOT EXISTS {table_name} (\n",
282
+ " MODEL_RUN_ID STRING,\n",
283
+ " MODEL_NAME STRING,\n",
284
+ " TARGET_NAME STRING,\n",
285
+ " FEATURE_NAME STRING,\n",
286
+ " IMPORTANCE_VALUE FLOAT,\n",
287
+ " IMPORTANCE_RANK NUMBER,\n",
288
+ " EVAL_TS TIMESTAMP_NTZ\n",
289
+ " );\n",
290
+ " \"\"\").collect()\n",
291
+ " print(f\"Ensured feature importance table {table_name} exists.\")\n",
292
+ "\n",
293
+ "def log_model_metrics_to_snowflake(session, model_run_id, model_name, target_name, metrics_dict, model_type, metrics_table, model_source_tag, model_year_tag):\n",
294
+ " \"\"\"Constructs a payload and logs model metrics to a Snowflake table.\"\"\"\n",
295
+ " avg_y_pred = metrics_dict.get('AVG_Y_PRED', metrics_dict.get('AVG_Y_PRED_PROBA'))\n",
296
+ " full_metrics_payload = {\n",
297
+ " \"MODEL_RUN_ID\": model_run_id, \"MODEL_NAME\": model_name, \"TARGET_NAME\": target_name,\n",
298
+ " \"R2\": metrics_dict.get('R2'), \"MAE\": metrics_dict.get('MAE'), \"MSE\": metrics_dict.get('MSE'),\n",
299
+ " \"PRED_RATIO\": metrics_dict.get('PRED_RATIO'), \"MAE_PERCENT\": metrics_dict.get('MAE_PERCENT'),\n",
300
+ " \"AUC_ROC\": metrics_dict.get('AUC_ROC'), \"AUC_PR\": metrics_dict.get('AUC_PR'),\n",
301
+ " \"LOG_LOSS\": metrics_dict.get('LOG_LOSS'), \"BRIER_SCORE\": metrics_dict.get('BRIER_SCORE'),\n",
302
+ " \"ACCURACY\": metrics_dict.get('ACCURACY'), \"AVG_Y_PRED\": avg_y_pred,\n",
303
+ " \"AVG_Y_TRUE\": metrics_dict.get('AVG_Y_TRUE'), \"MODEL_SOURCE\": model_source_tag,\n",
304
+ " \"MODEL_TYPE\": model_type, \"MODEL_YEAR\": model_year_tag, \"EVAL_TS\": datetime.utcnow()\n",
305
+ " }\n",
306
+ " # Round floats and handle NaNs for database compatibility\n",
307
+ " for key, value in full_metrics_payload.items():\n",
308
+ " if isinstance(value, (float, np.floating)):\n",
309
+ " full_metrics_payload[key] = round(value, 6) if not np.isnan(value) else None\n",
310
+ " \n",
311
+ " dfm = pd.DataFrame([full_metrics_payload])\n",
312
+ " ordered_cols = [\n",
313
+ " \"MODEL_RUN_ID\", \"MODEL_NAME\", \"TARGET_NAME\", \"R2\", \"MAE\", \"MSE\",\n",
314
+ " \"PRED_RATIO\", \"MAE_PERCENT\", \"AUC_ROC\", \"AUC_PR\", \"LOG_LOSS\",\n",
315
+ " \"BRIER_SCORE\", \"ACCURACY\", \"AVG_Y_PRED\", \"AVG_Y_TRUE\", \"MODEL_SOURCE\",\n",
316
+ " \"MODEL_TYPE\", \"MODEL_YEAR\", \"EVAL_TS\"\n",
317
+ " ]\n",
318
+ " dfm = dfm[ordered_cols]\n",
319
+ " session.create_dataframe(dfm).write.mode(\"append\").save_as_table(metrics_table)\n",
320
+ " print(f\"Logged metrics for {model_name} - {target_name} to {metrics_table}.\")\n",
321
+ "\n",
322
+ "def log_feature_importances_to_snowflake(session, model, feature_names, model_run_id, model_name, target_name, table_name):\n",
323
+ " \"\"\"Extracts, ranks, and logs feature importances to a Snowflake table.\"\"\"\n",
324
+ " if hasattr(model, 'feature_importances_'):\n",
325
+ " importances = model.feature_importances_\n",
326
+ " elif hasattr(model, 'coef_'):\n",
327
+ " # For multi-class logistic regression, average the absolute coefficients across classes\n",
328
+ " importances = np.mean(np.abs(model.coef_), axis=0) if model.coef_.ndim > 1 else np.abs(model.coef_[0])\n",
329
+ " else:\n",
330
+ " print(f\"Warning: Model type for '{model_name}' does not have 'feature_importances_' or 'coef_'. Skipping importance logging.\")\n",
331
+ " return\n",
332
+ "\n",
333
+ " importance_df = pd.DataFrame({'FEATURE_NAME': feature_names, 'IMPORTANCE_VALUE': importances})\n",
334
+ " importance_df = importance_df.sort_values(by='IMPORTANCE_VALUE', ascending=False).reset_index(drop=True)\n",
335
+ " importance_df['IMPORTANCE_RANK'] = importance_df.index + 1\n",
336
+ " importance_df['MODEL_RUN_ID'] = model_run_id\n",
337
+ " importance_df['MODEL_NAME'] = model_name\n",
338
+ " importance_df['TARGET_NAME'] = target_name\n",
339
+ " importance_df['EVAL_TS'] = datetime.utcnow()\n",
340
+ "\n",
341
+ " final_cols = ['MODEL_RUN_ID', 'MODEL_NAME', 'TARGET_NAME', 'FEATURE_NAME', 'IMPORTANCE_VALUE', 'IMPORTANCE_RANK', 'EVAL_TS']\n",
342
+ " importance_df = importance_df[final_cols]\n",
343
+ " \n",
344
+ " session.create_dataframe(importance_df).write.mode(\"append\").save_as_table(table_name)\n",
345
+ " print(f\"Logged {len(importance_df)} feature importances for {model_name} - {target_name} to {table_name}.\")\n",
346
+ "\n",
347
+ "def find_best_feature_subset(model, X_train, y_train, X_val, y_val, scoring_func, higher_is_better, model_name, feature_counts_to_test=None):\n",
348
+ " \"\"\"\n",
349
+ " Performs recursive feature elimination to find the most parsimonious feature set.\n",
350
+ "\n",
351
+ " This function first ranks all features by importance, then iteratively tests smaller\n",
352
+ " subsets of the top features. It selects the smallest feature set that performs\n",
353
+ " within a small tolerance of the absolute best-performing set.\n",
354
+ " \"\"\"\n",
355
+ " print(f\"\\n--- [{model_name}] Starting feature selection process ---\")\n",
356
+ " \n",
357
+ " # Step 1: Rank all features by importance using the full training set.\n",
358
+ " print(\"Step 1: Ranking all features by importance...\")\n",
359
+ " ranker_model = clone(model)\n",
360
+ " ranker_model.fit(X_train, y_train)\n",
361
+ "\n",
362
+ " if hasattr(ranker_model, 'feature_importances_'):\n",
363
+ " importances = ranker_model.feature_importances_\n",
364
+ " elif hasattr(ranker_model, 'coef_'):\n",
365
+ " importances = np.mean(np.abs(ranker_model.coef_), axis=0) if ranker_model.coef_.ndim > 1 else np.abs(ranker_model.coef_[0])\n",
366
+ " else:\n",
367
+ " raise TypeError(\"Model type not supported for feature importance extraction.\")\n",
368
+ "\n",
369
+ " feature_importance_df = pd.DataFrame({'feature': X_train.columns, 'importance': importances}).sort_values('importance', ascending=False)\n",
370
+ " ranked_features = feature_importance_df['feature'].tolist()\n",
371
+ " \n",
372
+ " # Step 2: Evaluate model performance on different feature subset sizes.\n",
373
+ " print(\"Step 2: Evaluating model performance on different feature subset sizes...\")\n",
374
+ " if feature_counts_to_test is None:\n",
375
+ " n_total_features = len(ranked_features)\n",
376
+ " # Define a dynamic set of feature counts to test.\n",
377
+ " fractions = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.15, 0.1, 0.05]\n",
378
+ " fractional_counts = [int(n_total_features * f) for f in fractions]\n",
379
+ " absolute_counts = [250, 200, 150, 100, 75, 50, 25, 10]\n",
380
+ " counts = set([n_total_features] + fractional_counts + absolute_counts)\n",
381
+ " feature_counts_to_test = sorted([n for n in counts if 0 < n <= n_total_features], reverse=True)\n",
382
+ " print(f\"Generated feature counts to test: {feature_counts_to_test}\")\n",
383
+ " \n",
384
+ " val_scores = []\n",
385
+ " for n_features in feature_counts_to_test:\n",
386
+ " subset_features = ranked_features[:n_features]\n",
387
+ " loop_model = clone(model)\n",
388
+ " loop_model.fit(X_train[subset_features], y_train)\n",
389
+ " \n",
390
+ " # Predict based on model type (regressor vs. classifier).\n",
391
+ " if hasattr(loop_model, 'predict_proba'):\n",
392
+ " val_preds = loop_model.predict_proba(X_val[subset_features])\n",
393
+ " # For binary classification, use the probability of the positive class.\n",
394
+ " if len(np.unique(y_train)) == 2 and val_preds.shape[1] == 2:\n",
395
+ " val_preds = val_preds[:, 1]\n",
396
+ " else:\n",
397
+ " val_preds = loop_model.predict(X_val[subset_features])\n",
398
+ "\n",
399
+ " val_score = scoring_func(y_val, val_preds)\n",
400
+ " val_scores.append(val_score)\n",
401
+ " print(f\" - Tested with {n_features:4} features: Validation Score = {val_score:.4f}\")\n",
402
+ "\n",
403
+ " # Step 3: Select the most parsimonious model within a tolerance of the best score.\n",
404
+ " TOLERANCE = 0.01 # 1% tolerance\n",
405
+ " if higher_is_better:\n",
406
+ " best_val_score = np.max(val_scores)\n",
407
+ " score_threshold = best_val_score * (1 - TOLERANCE)\n",
408
+ " candidate_indices = np.where(val_scores >= score_threshold)[0]\n",
409
+ " else: # lower is better (e.g., for MSE or LogLoss)\n",
410
+ " best_val_score = np.min(val_scores)\n",
411
+ " score_threshold = best_val_score * (1 + TOLERANCE)\n",
412
+ " candidate_indices = np.where(val_scores <= score_threshold)[0]\n",
413
+ " \n",
414
+ " # The last index in the candidates corresponds to the smallest model (most parsimonious).\n",
415
+ " best_parsimonious_idx = candidate_indices[-1]\n",
416
+ " best_score_idx = np.argmax(val_scores) if higher_is_better else np.argmin(val_scores)\n",
417
+ " optimal_n_features = feature_counts_to_test[best_parsimonious_idx]\n",
418
+ " best_features = ranked_features[:optimal_n_features]\n",
419
+ " \n",
420
+ " print(f\"\\nStep 3: Found optimal feature set using the parsimony principle.\")\n",
421
+ " print(f\" - Absolute best validation score: {val_scores[best_score_idx]:.4f} with {feature_counts_to_test[best_score_idx]} features.\")\n",
422
+ " print(f\" - Score threshold (with {TOLERANCE*100}% tolerance): {score_threshold:.4f}\")\n",
423
+ " print(f\" - Chosen parsimonious model: {val_scores[best_parsimonious_idx]:.4f} with {optimal_n_features} features.\")\n",
424
+ " \n",
425
+ " # Plot the results for visualization.\n",
426
+ " plt.figure(figsize=(12, 7))\n",
427
+ " plt.plot(feature_counts_to_test, val_scores, 'o-', label=f'Validation Set Score ({scoring_func.__name__})')\n",
428
+ " plt.axvline(x=feature_counts_to_test[best_score_idx], color='grey', linestyle=':', label=f'Absolute Best Score ({feature_counts_to_test[best_score_idx]} features)')\n",
429
+ " plt.axvline(x=optimal_n_features, color='r', linestyle='--', label=f'Chosen Parsimonious Model ({optimal_n_features} features)')\n",
430
+ " plt.title(f'[{model_name}] Performance vs. Number of Features')\n",
431
+ " plt.xlabel('Number of Top Features Used')\n",
432
+ " plt.ylabel('Score')\n",
433
+ " plt.legend()\n",
434
+ " plt.grid(True, which='both', linestyle='--', linewidth=0.5)\n",
435
+ " plt.gca().invert_xaxis()\n",
436
+ " plt.show()\n",
437
+ " \n",
438
+ " return best_features\n",
439
+ "\n",
440
+ "# Ensure logging tables exist before training begins.\n",
441
+ "create_metrics_table_if_not_exists(session, METRICS_TABLE_NAME)\n",
442
+ "create_feature_importance_table_if_not_exists(session, FEATURE_IMPORTANCE_TABLE_NAME)\n",
443
+ "\n",
444
+ "# =============================================================================\n",
445
+ "# 4. MODEL TRAINING & EVALUATION\n",
446
+ "# =============================================================================\n",
447
+ "\n",
448
+ "# --- Define dynamic model name suffixes based on configuration ---\n",
449
+ "MODEL_NAME_SUFFIX = \"Fast\" if FAST_MODE else \"FeatureSelected\"\n",
450
+ "if EXCLUDE_FEATURE_PREFIXES:\n",
451
+ " cleaned_prefixes = [p.strip('_') for p in EXCLUDE_FEATURE_PREFIXES]\n",
452
+ " exclusion_tag = \"-\".join(cleaned_prefixes)\n",
453
+ " EXCLUSION_SUFFIX = f\"_Excl-{exclusion_tag}\"\n",
454
+ "else:\n",
455
+ " EXCLUSION_SUFFIX = \"\"\n",
456
+ "\n",
457
+ "# --- 4.1 Model 1: Predicting Length of Stay (Regression) ---\n",
458
+ "print(\"\\n\" + \"=\"*80)\n",
459
+ "print(\"--- Training Model 1: Length of Stay (LOS) with Gamma Objective ---\")\n",
460
+ "print(\"=\"*80)\n",
461
+ "TARGET_LOS = 'length_of_stay'\n",
462
+ "los_model, best_los_features = None, None\n",
463
+ "\n",
464
+ "if TARGET_LOS not in df_pd.columns:\n",
465
+ " print(f\"Error: Target column '{TARGET_LOS}' not found. Skipping LOS model.\")\n",
466
+ "else:\n",
467
+ " y_los = df_pd[TARGET_LOS].astype(float)\n",
468
+ " # The Gamma objective requires positive target values.\n",
469
+ " if (y_los <= 0).any():\n",
470
+ " print(f\"Warning: Found {(y_los <= 0).sum()} non-positive values in '{TARGET_LOS}'. Clamping to a small positive number.\")\n",
471
+ " y_los = y_los.clip(lower=0.001)\n",
472
+ "\n",
473
+ " X_train_los, X_test_los, y_train_los, y_test_los = train_test_split(X, y_los, test_size=0.2, random_state=42)\n",
474
+ " X_train_fs_los, X_val_fs_los, y_train_fs_los, y_val_fs_los = train_test_split(X_train_los, y_train_los, test_size=0.25, random_state=42)\n",
475
+ "\n",
476
+ " base_los_model = xgb.XGBRegressor(objective='reg:gamma', random_state=42, n_estimators=1000, learning_rate=0.05, max_depth=7, subsample=0.8, colsample_bytree=0.8, eval_metric='gamma-deviance')\n",
477
+ " \n",
478
+ " # Determine the feature set to use.\n",
479
+ " if FAST_MODE:\n",
480
+ " print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n",
481
+ " best_los_features = X.columns.tolist()\n",
482
+ " else:\n",
483
+ " best_los_features = find_best_feature_subset(\n",
484
+ " model=base_los_model,\n",
485
+ " X_train=X_train_fs_los, y_train=y_train_fs_los,\n",
486
+ " X_val=X_val_fs_los, y_val=y_val_fs_los,\n",
487
+ " scoring_func=mean_squared_error,\n",
488
+ " higher_is_better=False,\n",
489
+ " model_name=\"Length of Stay (XGBoost)\"\n",
490
+ " )\n",
491
+ "\n",
492
+ " print(f\"\\nTraining final LOS model using {len(best_los_features)} features...\")\n",
493
+ " los_model = clone(base_los_model)\n",
494
+ " los_model.set_params(early_stopping_rounds=50)\n",
495
+ " eval_set = [(X_val_fs_los[best_los_features], y_val_fs_los)]\n",
496
+ " los_model.fit(X_train_los[best_los_features], y_train_los, eval_set=eval_set, verbose=False)\n",
497
+ " print(f\"Optimal number of trees found via early stopping: {los_model.best_iteration}\")\n",
498
+ "\n",
499
+ " y_pred_los = los_model.predict(X_test_los[best_los_features])\n",
500
+ "\n",
501
+ " print(\"\\nLOS Model - Test Set Evaluation:\")\n",
502
+ " los_metrics = calculate_regression_metrics(y_test_los, y_pred_los)\n",
503
+ " for k, v in los_metrics.items(): print(f\" {k}: {v:.4f}\")\n",
504
+ "\n",
505
+ " # Log metrics and feature importances.\n",
506
+ " los_model_name = f\"Inpatient_LOS_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
507
+ " log_model_metrics_to_snowflake(session, MODEL_RUN_ID, los_model_name, TARGET_LOS, los_metrics, \"Regression\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n",
508
+ " log_feature_importances_to_snowflake(session, los_model, best_los_features, MODEL_RUN_ID, los_model_name, TARGET_LOS, FEATURE_IMPORTANCE_TABLE_NAME)\n",
509
+ "\n",
510
+ "# --- 4.2 Model 2: Predicting Readmission (Binary Classification) ---\n",
511
+ "print(\"\\n\" + \"=\"*80)\n",
512
+ "print(\"--- Training Model 2: Calibrated Readmission Probability ---\")\n",
513
+ "print(\"=\"*80)\n",
514
+ "TARGET_READMISSION = 'readmission_numerator'\n",
515
+ "DENOMINATOR_COL = 'readmission_denominator'\n",
516
+ "calibrated_readmission_model, best_readmission_features = None, None\n",
517
+ "\n",
518
+ "if TARGET_READMISSION not in df_pd.columns or DENOMINATOR_COL not in df_pd.columns:\n",
519
+ " print(f\"Error: Required columns '{TARGET_READMISSION}' or '{DENOMINATOR_COL}' not found. Skipping Readmission model.\")\n",
520
+ "else:\n",
521
+ " # Filter data to only include encounters eligible for readmission.\n",
522
+ " readmission_filter_mask = df_pd[DENOMINATOR_COL] == 1\n",
523
+ " if readmission_filter_mask.sum() == 0:\n",
524
+ " print(\"Error: The filter condition resulted in zero encounters. Skipping Readmission model.\")\n",
525
+ " else:\n",
526
+ " print(f\"Filtering data for Readmission Model where '{DENOMINATOR_COL}' = 1 ({readmission_filter_mask.sum()} rows).\")\n",
527
+ " X_readmission = X.loc[readmission_filter_mask].reset_index(drop=True)\n",
528
+ " y_readmission = df_pd.loc[readmission_filter_mask, TARGET_READMISSION].reset_index(drop=True)\n",
529
+ " \n",
530
+ " # Split data: 60% base train, 20% calibration, 20% test\n",
531
+ " stratify_readmission = y_readmission if len(np.unique(y_readmission)) > 1 else None\n",
532
+ " X_train_full, X_test_read, y_train_full, y_test_read = train_test_split(X_readmission, y_readmission, test_size=0.2, random_state=42, stratify=stratify_readmission)\n",
533
+ " stratify_y_train_full = y_train_full if len(np.unique(y_train_full)) > 1 else None\n",
534
+ " X_train_base, X_calib_read, y_train_base, y_calib_read = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42, stratify=stratify_y_train_full)\n",
535
+ " print(f\"Data split for readmission: Base train: {X_train_base.shape[0]}, Calibration: {X_calib_read.shape[0]}, Test: {X_test_read.shape[0]}\")\n",
536
+ " \n",
537
+ " base_readmit_model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000, solver='liblinear')\n",
538
+ " \n",
539
+ " # Determine the feature set to use.\n",
540
+ " if FAST_MODE:\n",
541
+ " print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n",
542
+ " best_readmission_features = X_train_base.columns.tolist()\n",
543
+ " else:\n",
544
+ " best_readmission_features = find_best_feature_subset(\n",
545
+ " model=base_readmit_model, X_train=X_train_base, y_train=y_train_base, X_val=X_calib_read, y_val=y_calib_read,\n",
546
+ " scoring_func=roc_auc_score, higher_is_better=True, model_name=\"Readmission (Logistic Regression)\"\n",
547
+ " )\n",
548
+ "\n",
549
+ " print(f\"\\nTraining final Readmission model pipeline using {len(best_readmission_features)} features...\")\n",
550
+ " base_model_for_calib = clone(base_readmit_model)\n",
551
+ " base_model_for_calib.fit(X_train_base[best_readmission_features], y_train_base)\n",
552
+ " \n",
553
+ " # Log feature importances from the base (uncalibrated) model.\n",
554
+ " uncal_read_model_name = f\"Inpatient_Readmission_Base_Uncalibrated_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
555
+ " log_feature_importances_to_snowflake(session, base_model_for_calib, best_readmission_features, MODEL_RUN_ID, uncal_read_model_name, TARGET_READMISSION, FEATURE_IMPORTANCE_TABLE_NAME)\n",
556
+ " \n",
557
+ " # Evaluate and log metrics for the uncalibrated model for comparison.\n",
558
+ " y_pred_proba_uncal = base_model_for_calib.predict_proba(X_test_read[best_readmission_features])[:, 1]\n",
559
+ " uncalibrated_metrics = calculate_binary_classification_proba_metrics(y_test_read, y_pred_proba_uncal)\n",
560
+ " log_model_metrics_to_snowflake(session, MODEL_RUN_ID, uncal_read_model_name, TARGET_READMISSION + \"_Probability\", uncalibrated_metrics, \"Binary_Uncalibrated\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n",
561
+ " \n",
562
+ " # Calibrate the model on the held-out calibration set.\n",
563
+ " calibrated_readmission_model = CalibratedClassifierCV(base_model_for_calib, method='isotonic', cv='prefit')\n",
564
+ " calibrated_readmission_model.fit(X_calib_read[best_readmission_features], y_calib_read)\n",
565
+ " y_pred_proba_cal = calibrated_readmission_model.predict_proba(X_test_read[best_readmission_features])[:, 1]\n",
566
+ "\n",
567
+ " print(\"\\nCalibrated Readmission Model - Test Set Evaluation:\")\n",
568
+ " calibrated_proba_metrics = calculate_binary_classification_proba_metrics(y_test_read, y_pred_proba_cal)\n",
569
+ " for k, v in calibrated_proba_metrics.items(): print(f\" {k}: {v:.4f}\")\n",
570
+ " \n",
571
+ " cal_read_model_name = f\"Inpatient_Readmission_Calibrated_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
572
+ " log_model_metrics_to_snowflake(session, MODEL_RUN_ID, cal_read_model_name, TARGET_READMISSION + \"_Probability\", calibrated_proba_metrics, \"Binary_Calibrated\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n",
573
+ "\n",
574
+ "# --- 4.3 Model 3: Predicting Discharge Location (Multiclass Classification) ---\n",
575
+ "print(\"\\n\" + \"=\"*80)\n",
576
+ "print(\"--- Training Model 3: Calibrated Discharge Location ---\")\n",
577
+ "print(\"=\"*80)\n",
578
+ "TARGET_DISCHARGE = 'discharge_location'\n",
579
+ "calibrated_discharge_model, le_discharge, best_discharge_features = None, None, None\n",
580
+ "\n",
581
+ "if TARGET_DISCHARGE not in df_pd.columns:\n",
582
+ " print(f\"Error: Target column '{TARGET_DISCHARGE}' not found. Skipping Discharge Location model.\")\n",
583
+ "else:\n",
584
+ " le_discharge = LabelEncoder()\n",
585
+ " y_discharge_encoded = le_discharge.fit_transform(df_pd[TARGET_DISCHARGE])\n",
586
+ " num_classes_discharge = len(le_discharge.classes_)\n",
587
+ " print(f\"Discharge Location: {num_classes_discharge} classes found: {le_discharge.classes_}\")\n",
588
+ " \n",
589
+ " # Split data: 60% base train, 20% calibration, 20% test\n",
590
+ " stratify_discharge = y_discharge_encoded if num_classes_discharge > 1 else None\n",
591
+ " X_train_full_disc, X_test_disc, y_train_full_disc_enc, y_test_disc_enc = train_test_split(X, y_discharge_encoded, test_size=0.2, random_state=42, stratify=stratify_discharge)\n",
592
+ " X_train_base_disc, X_calib_disc, y_train_base_disc_enc, y_calib_disc_enc = train_test_split(X_train_full_disc, y_train_full_disc_enc, test_size=0.25, random_state=42, stratify=y_train_full_disc_enc if num_classes_discharge > 1 else None)\n",
593
+ " print(f\"Data split for discharge: Base train: {X_train_base_disc.shape[0]}, Calibration: {X_calib_disc.shape[0]}, Test: {X_test_disc.shape[0]}\")\n",
594
+ " \n",
595
+ " base_discharge_model = LogisticRegression(random_state=42, max_iter=1000, solver='lbfgs', multi_class='multinomial', class_weight='balanced')\n",
596
+ " \n",
597
+ " # Determine the feature set to use.\n",
598
+ " if FAST_MODE:\n",
599
+ " print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n",
600
+ " best_discharge_features = X_train_base_disc.columns.tolist()\n",
601
+ " else:\n",
602
+ " best_discharge_features = find_best_feature_subset(\n",
603
+ " model=base_discharge_model, X_train=X_train_base_disc, y_train=y_train_base_disc_enc, X_val=X_calib_disc, y_val=y_calib_disc_enc,\n",
604
+ " scoring_func=log_loss, higher_is_better=False, model_name=\"Discharge Location (Multinomial Regression)\"\n",
605
+ " )\n",
606
+ "\n",
607
+ " print(f\"\\nTraining final Discharge Location model pipeline using {len(best_discharge_features)} features...\")\n",
608
+ " base_model_for_calib_disc = clone(base_discharge_model)\n",
609
+ " base_model_for_calib_disc.fit(X_train_base_disc[best_discharge_features], y_train_base_disc_enc)\n",
610
+ " \n",
611
+ " discharge_model_name = f\"Inpatient_Discharge_Cal_Overall_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
612
+ " log_feature_importances_to_snowflake(session, base_model_for_calib_disc, best_discharge_features, MODEL_RUN_ID, discharge_model_name, TARGET_DISCHARGE, FEATURE_IMPORTANCE_TABLE_NAME)\n",
613
+ " \n",
614
+ " # Calibrate the model. 'sigmoid' is used for one-vs-rest calibration, suitable for multiclass.\n",
615
+ " calibrated_discharge_model = CalibratedClassifierCV(base_model_for_calib_disc, method='sigmoid', cv='prefit')\n",
616
+ " calibrated_discharge_model.fit(X_calib_disc[best_discharge_features], y_calib_disc_enc)\n",
617
+ " y_pred_proba_discharge_calibrated = calibrated_discharge_model.predict_proba(X_test_disc[best_discharge_features])\n",
618
+ " y_pred_labels_discharge_calibrated = calibrated_discharge_model.predict(X_test_disc[best_discharge_features])\n",
619
+ " \n",
620
+ " print(\"\\nCalibrated Discharge Model - Test Set Evaluation:\")\n",
621
+ " calibrated_disc_metrics = calculate_multiclass_classification_metrics(y_test_disc_enc, y_pred_labels_discharge_calibrated, y_pred_proba_discharge_calibrated, le_discharge.classes_)\n",
622
+ " \n",
623
+ " # Log the overall multiclass metrics.\n",
624
+ " overall_cal_metrics_to_log = {k: v for k, v in calibrated_disc_metrics.items() if k != 'per_class_details'}\n",
625
+ " overall_cal_metrics_to_log['BRIER_SCORE'] = calibrated_disc_metrics.get('BRIER_SCORE_MACRO_AVG')\n",
626
+ " log_model_metrics_to_snowflake(session, MODEL_RUN_ID, discharge_model_name, TARGET_DISCHARGE, overall_cal_metrics_to_log, \"Multiclass_Cal_Overall\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n",
627
+ " \n",
628
+ " # --- FIX: Log the per-class metrics by mapping keys correctly ---\n",
629
+ " discharge_class_model_name = f\"Inpatient_Discharge_Cal_Class_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
630
+ " for class_detail in calibrated_disc_metrics.get('per_class_details', []):\n",
631
+ " # Create a new dict with keys the logging function expects.\n",
632
+ " per_class_metrics_to_log = {\n",
633
+ " 'BRIER_SCORE': class_detail.get('brier_score'),\n",
634
+ " 'AVG_Y_PRED': class_detail.get('avg_pred_proba'),\n",
635
+ " 'AVG_Y_TRUE': class_detail.get('true_proportion'),\n",
636
+ " 'PRED_RATIO': class_detail.get('proba_ratio'),\n",
637
+ " }\n",
638
+ " log_model_metrics_to_snowflake(\n",
639
+ " session, MODEL_RUN_ID, discharge_class_model_name,\n",
640
+ " f\"{TARGET_DISCHARGE}_Class_{class_detail['class_name']}\",\n",
641
+ " per_class_metrics_to_log, # Use the correctly mapped dictionary\n",
642
+ " \"Multiclass_Cal_ClassDetail\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG\n",
643
+ " )\n",
644
+ "\n",
645
+ " print(\"\\nCalibrated Classification Report:\\n\", classification_report(y_test_disc_enc, y_pred_labels_discharge_calibrated, target_names=le_discharge.classes_.astype(str), zero_division=0, digits=4))\n",
646
+ "\n",
647
+ "\n",
648
+ "# =============================================================================\n",
649
+ "# 5. MODEL SAVING\n",
650
+ "# =============================================================================\n",
651
+ "print(\"\\n\" + \"=\"*80)\n",
652
+ "print(\"--- Saving Models and Artifacts ---\")\n",
653
+ "print(\"=\"*80)\n",
654
+ "\n",
655
+ "# Bundle all necessary objects for deployment into a single dictionary.\n",
656
+ "inpatient_models_bundle = {\n",
657
+ " 'los_model': los_model,\n",
658
+ " 'readmission_model': calibrated_readmission_model,\n",
659
+ " 'discharge_model': calibrated_discharge_model,\n",
660
+ " 'feature_columns_los': best_los_features,\n",
661
+ " 'feature_columns_readmission': best_readmission_features,\n",
662
+ " 'feature_columns_discharge': best_discharge_features,\n",
663
+ " 'le_discharge': le_discharge,\n",
664
+ " 'model_run_id': MODEL_RUN_ID,\n",
665
+ " 'fast_mode': FAST_MODE,\n",
666
+ " 'excluded_feature_prefixes': EXCLUDE_FEATURE_PREFIXES\n",
667
+ "}\n",
668
+ "\n",
669
+ "# Create a descriptive file name for the bundle.\n",
670
+ "BUNDLE_SUFFIX = \"fast\" if FAST_MODE else \"fs\"\n",
671
+ "EXCLUSION_FILE_TAG = f\"_excl_{'-'.join([p.strip('_').lower() for p in EXCLUDE_FEATURE_PREFIXES])}\" if EXCLUDE_FEATURE_PREFIXES else \"\"\n",
672
+ "BUNDLE_FILE_NAME = f'inpatient_models_bundle_{MODEL_SOURCE_TAG}_{MODEL_YEAR_TAG}_{BUNDLE_SUFFIX}{EXCLUSION_FILE_TAG}.pkl'\n",
673
+ "\n",
674
+ "# Save the bundle locally using pickle.\n",
675
+ "with open(BUNDLE_FILE_NAME, 'wb') as f:\n",
676
+ " pickle.dump(inpatient_models_bundle, f)\n",
677
+ "print(f\"Models bundled and saved locally to: {BUNDLE_FILE_NAME}\")\n",
678
+ "\n",
679
+ "# Upload the local bundle file to the specified Snowflake stage.\n",
680
+ "put_result = session.file.put(BUNDLE_FILE_NAME, SNOWFLAKE_STAGE_NAME, overwrite=True)\n",
681
+ "if put_result[0].status == 'UPLOADED':\n",
682
+ " print(f\"Model bundle successfully uploaded to Snowflake stage: {SNOWFLAKE_STAGE_NAME}\")\n",
683
+ "else:\n",
684
+ " print(f\"Error uploading model bundle. Status: {put_result[0].status}, Message: {put_result[0].message}\")\n",
685
+ "\n",
686
+ "file_size_mb = os.path.getsize(BUNDLE_FILE_NAME) / (1024 * 1024)\n",
687
+ "print(f\"Saved local bundle file size: {file_size_mb:.2f} MB\")\n",
688
+ "\n",
689
+ "print(f\"\\n✅ Script finished ({'FAST MODE' if FAST_MODE else 'FULL MODE'}).\")"
690
+ ]
691
+ }
692
+ ],
693
  "metadata": {
694
  "kernelspec": {
695
  "display_name": "Streamlit Notebook",
696
  "name": "streamlit"
697
  },
698
  "lastEditStatus": {
699
+ "authorEmail": "[email protected]",
700
  "authorId": "374530764978",
701
  "authorName": "BRAD",
702
+ "lastEditTime": 1750870004305,
703
+ "notebookId": "6rovstl42ft2p5id6gwo",
704
+ "sessionId": "65561efa-4d18-4072-8f4d-10240cb902ba"
705
  }
706
  },
707
  "nbformat": 4,
708
+ "nbformat_minor": 5
709
+ }
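
For orientation, a minimal sketch (not part of this commit) of how the bundle uploaded in section 5 of the script could be retrieved and unpickled for scoring. The stage and file names here are assumptions derived from the script's configuration (MODEL_SOURCE_TAG=medicare_lds, MODEL_YEAR_TAG=2023, FAST_MODE=False, EXCLUDE_FEATURE_PREFIXES=["hcc_"]), and it assumes session.file.put ran with its default auto_compress=True, so the staged copy carries a .gz suffix:

    import gzip
    import pickle

    from snowflake.snowpark.context import get_active_session

    session = get_active_session()

    # Assumed names; the script derives these from its configuration switches.
    STAGE = "@BENCHMARKS.BENCHMARK_STAGE"
    BUNDLE = "inpatient_models_bundle_medicare_lds_2023_fs_excl_hcc.pkl"

    # Download the gzip-compressed bundle from the stage to the working directory.
    session.file.get(f"{STAGE}/{BUNDLE}.gz", ".")

    # Unpickle and pull out one model plus the feature list it was trained on.
    with gzip.open(f"{BUNDLE}.gz", "rb") as f:
        bundle = pickle.load(f)

    los_model = bundle["los_model"]
    los_features = bundle["feature_columns_los"]
    # predictions = los_model.predict(scoring_df[los_features])

Scoring frames must be one-hot encoded the same way as training and indexed by the stored feature list, which is why the bundle keeps the feature columns and the discharge LabelEncoder alongside the models.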
 
inpatient_feature_importance.csv CHANGED
@@ -2091,4 +2091,4 @@ MODEL_RUN_ID,MODEL_NAME,TARGET_NAME,FEATURE_NAME,IMPORTANCE_VALUE,IMPORTANCE_RAN
2091
  03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_85,0.04926689992,745,2025-06-18 21:11:35.095
2092
  03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_107,0.03908581065,746,2025-06-18 21:11:35.095
2093
  03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_108,0.03781863024,747,2025-06-18 21:11:35.095
2094
- 03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,cms_ischemic_heart_disease,0.02849259634,748,2025-06-18 21:11:35.095
 
2091
  03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_85,0.04926689992,745,2025-06-18 21:11:35.095
2092
  03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_107,0.03908581065,746,2025-06-18 21:11:35.095
2093
  03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_108,0.03781863024,747,2025-06-18 21:11:35.095
2094
+ 03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,cms_ischemic_heart_disease,0.02849259634,748,2025-06-18 21:11:35.095
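
The CSV rows above mirror the MODEL_FEATURE_IMPORTANCE_INPATIENT table that log_feature_importances_to_snowflake appends to. As a hedged sketch (table name taken from the script's configuration, run id and model name taken from the rows above, and an active Snowpark session assumed as in the notebook), the logged importances for a single run could be read back like this:

    # Sketch only: assumes an active Snowpark session named `session`.
    IMPORTANCE_TABLE = "medicare_lds_five_multi_year.BENCHMARKS.MODEL_FEATURE_IMPORTANCE_INPATIENT"
    RUN_ID = "03daf6f5-4a7a-44b9-a670-e0520ec6772f"  # run id visible in the CSV above

    top_features = session.sql(f"""
        SELECT FEATURE_NAME, IMPORTANCE_VALUE, IMPORTANCE_RANK
        FROM {IMPORTANCE_TABLE}
        WHERE MODEL_RUN_ID = '{RUN_ID}'
          AND MODEL_NAME = 'Inpatient_Discharge_Cal_Overall_FeatureSelected'
        ORDER BY IMPORTANCE_RANK
        LIMIT 20
    """).to_pandas()
    print(top_features)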