bradmontierth committed
Commit 2f706be · 1 Parent(s): 8cd7f61

fixing cut off training script

Train Tuva Concurrent Inpatient Models.ipynb CHANGED
@@ -1,31 +1,709 @@
1
  {
2
  "metadata": {
3
  "kernelspec": {
4
  "display_name": "Streamlit Notebook",
5
  "name": "streamlit"
6
  },
7
  "lastEditStatus": {
8
- "notebookId": "6rovstl42ft2p5id6gwo",
9
  "authorId": "374530764978",
10
  "authorName": "BRAD",
11
- "authorEmail": "[email protected]",
12
- "sessionId": "65561efa-4d18-4072-8f4d-10240cb902ba",
13
- "lastEditTime": 1750870004305
14
  }
15
  },
16
- "nbformat_minor": 5,
17
  "nbformat": 4,
18
- "cells": [
19
- {
20
- "cell_type": "code",
21
- "id": "3775908f-ca36-4846-8f38-5adca39217f2",
22
- "metadata": {
23
- "language": "python",
24
- "name": "cell1"
25
- },
26
- "source": "0]}\")\n \n base_readmit_model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000, solver='liblinear')\n \n # Determine the feature set to use.\n if FAST_MODE:\n print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n best_readmission_features = X_train_base.columns.tolist()\n else:\n best_readmission_features = find_best_feature_subset(\n model=base_readmit_model, X_train=X_train_base, y_train=y_train_base, X_val=X_calib_read, y_val=y_calib_read,\n scoring_func=roc_auc_score, higher_is_better=True, model_name=\"Readmission (Logistic Regression)\"\n )\n\n print(f\"\\nTraining final Readmission model pipeline using {len(best_readmission_features)} features...\")\n base_model_for_calib = clone(base_readmit_model)\n base_model_for_calib.fit(X_train_base[best_readmission_features], y_train_base)\n \n # Log feature importances from the base (uncalibrated) model.\n uncal_read_model_name = f\"Inpatient_Readmission_Base_Uncalibrated_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n log_feature_importances_to_snowflake(session, base_model_for_calib, best_readmission_features, MODEL_RUN_ID, uncal_read_model_name, TARGET_READMISSION, FEATURE_IMPORTANCE_TABLE_NAME)\n \n # Evaluate and log metrics for the uncalibrated model for comparison.\n y_pred_proba_uncal = base_model_for_calib.predict_proba(X_test_read[best_readmission_features])[:, 1]\n uncalibrated_metrics = calculate_binary_classification_proba_metrics(y_test_read, y_pred_proba_uncal)\n log_model_metrics_to_snowflake(session, MODEL_RUN_ID, uncal_read_model_name, TARGET_READMISSION + \"_Probability\", uncalibrated_metrics, \"Binary_Uncalibrated\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n \n # Calibrate the model on the held-out calibration set.\n calibrated_readmission_model = CalibratedClassifierCV(base_model_for_calib, method='isotonic', cv='prefit')\n calibrated_readmission_model.fit(X_calib_read[best_readmission_features], y_calib_read)\n y_pred_proba_cal = calibrated_readmission_model.predict_proba(X_test_read[best_readmission_features])[:, 1]\n\n print(\"\\nCalibrated Readmission Model - Test Set Evaluation:\")\n calibrated_proba_metrics = calculate_binary_classification_proba_metrics(y_test_read, y_pred_proba_cal)\n for k, v in calibrated_proba_metrics.items(): print(f\" {k}: {v:.4f}\")\n \n cal_read_model_name = f\"Inpatient_Readmission_Calibrated_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n log_model_metrics_to_snowflake(session, MODEL_RUN_ID, cal_read_model_name, TARGET_READMISSION + \"_Probability\", calibrated_proba_metrics, \"Binary_Calibrated\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n\n# --- 4.3 Model 3: Predicting Discharge Location (Multiclass Classification) ---\nprint(\"\\n\" + \"=\"*80)\nprint(\"--- Training Model 3: Calibrated Discharge Location ---\")\nprint(\"=\"*80)\nTARGET_DISCHARGE = 'discharge_location'\ncalibrated_discharge_model, le_discharge, best_discharge_features = None, None, None\n\nif TARGET_DISCHARGE not in df_pd.columns:\n print(f\"Error: Target column '{TARGET_DISCHARGE}' not found. Skipping Discharge Location model.\")\nelse:\n le_discharge = LabelEncoder()\n y_discharge_encoded = le_discharge.fit_transform(df_pd[TARGET_DISCHARGE])\n num_classes_discharge = len(le_discharge.classes_)\n print(f\"Discharge Location: {num_classes_discharge} classes found: {le_discharge.classes_}\")\n \n # Split data: 60% base train, 20% calibration, 20% test\n stratify_discharge = y_discharge_encoded if num_classes_discharge > 1 else None\n X_train_full_disc, X_test_disc, y_train_full_disc_enc, y_test_disc_enc = train_test_split(X, y_discharge_encoded, test_size=0.2, random_state=42, stratify=stratify_discharge)\n X_train_base_disc, X_calib_disc, y_train_base_disc_enc, y_calib_disc_enc = train_test_split(X_train_full_disc, y_train_full_disc_enc, test_size=0.25, random_state=42, stratify=y_train_full_disc_enc if num_classes_discharge > 1 else None)\n print(f\"Data split for discharge: Base train: {X_train_base_disc.shape[0]}, Calibration: {X_calib_disc.shape[0]}, Test: {X_test_disc.shape[0]}\")\n \n base_discharge_model = LogisticRegression(random_state=42, max_iter=1000, solver='lbfgs', multi_class='multinomial', class_weight='balanced')\n \n # Determine the feature set to use.\n if FAST_MODE:\n print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n best_discharge_features = X_train_base_disc.columns.tolist()\n else:\n best_discharge_features = find_best_feature_subset(\n model=base_discharge_model, X_train=X_train_base_disc, y_train=y_train_base_disc_enc, X_val=X_calib_disc, y_val=y_calib_disc_enc,\n scoring_func=log_loss, higher_is_better=False, model_name=\"Discharge Location (Multinomial Regression)\"\n )\n\n print(f\"\\nTraining final Discharge Location model pipeline using {len(best_discharge_features)} features...\")\n base_model_for_calib_disc = clone(base_discharge_model)\n base_model_for_calib_disc.fit(X_train_base_disc[best_discharge_features], y_train_base_disc_enc)\n \n discharge_model_name = f\"Inpatient_Discharge_Cal_Overall_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n log_feature_importances_to_snowflake(session, base_model_for_calib_disc, best_discharge_features, MODEL_RUN_ID, discharge_model_name, TARGET_DISCHARGE, FEATURE_IMPORTANCE_TABLE_NAME)\n \n # Calibrate the model. 'sigmoid' is used for one-vs-rest calibration, suitable for multiclass.\n calibrated_discharge_model = CalibratedClassifierCV(base_model_for_calib_disc, method='sigmoid', cv='prefit')\n calibrated_discharge_model.fit(X_calib_disc[best_discharge_features], y_calib_disc_enc)\n y_pred_proba_discharge_calibrated = calibrated_discharge_model.predict_proba(X_test_disc[best_discharge_features])\n y_pred_labels_discharge_calibrated = calibrated_discharge_model.predict(X_test_disc[best_discharge_features])\n \n print(\"\\nCalibrated Discharge Model - Test Set Evaluation:\")\n calibrated_disc_metrics = calculate_multiclass_classification_metrics(y_test_disc_enc, y_pred_labels_discharge_calibrated, y_pred_proba_discharge_calibrated, le_discharge.classes_)\n \n # Log the overall multiclass metrics.\n overall_cal_metrics_to_log = {k: v for k, v in calibrated_disc_metrics.items() if k != 'per_class_details'}\n overall_cal_metrics_to_log['BRIER_SCORE'] = calibrated_disc_metrics.get('BRIER_SCORE_MACRO_AVG')\n log_model_metrics_to_snowflake(session, MODEL_RUN_ID, discharge_model_name, TARGET_DISCHARGE, overall_cal_metrics_to_log, \"Multiclass_Cal_Overall\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n \n # --- FIX: Log the per-class metrics by mapping keys correctly ---\n discharge_class_model_name = f\"Inpatient_Discharge_Cal_Class_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n for class_detail in calibrated_disc_metrics.get('per_class_details', []):\n # Create a new dict with keys the logging function expects.\n per_class_metrics_to_log = {\n 'BRIER_SCORE': class_detail.get('brier_score'),\n 'AVG_Y_PRED': class_detail.get('avg_pred_proba'),\n 'AVG_Y_TRUE': class_detail.get('true_proportion'),\n 'PRED_RATIO': class_detail.get('proba_ratio'),\n }\n log_model_metrics_to_snowflake(\n session, MODEL_RUN_ID, discharge_class_model_name,\n f\"{TARGET_DISCHARGE}_Class_{class_detail['class_name']}\",\n per_class_metrics_to_log, # Use the correctly mapped dictionary\n \"Multiclass_Cal_ClassDetail\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG\n )\n\n print(\"\\nCalibrated Classification Report:\\n\", classification_report(y_test_disc_enc, y_pred_labels_discharge_calibrated, target_names=le_discharge.classes_.astype(str), zero_division=0, digits=4))\n\n\n# =============================================================================\n# 5. MODEL SAVING\n# =============================================================================\nprint(\"\\n\" + \"=\"*80)\nprint(\"--- Saving Models and Artifacts ---\")\nprint(\"=\"*80)\n\n# Bundle all necessary objects for deployment into a single dictionary.\ninpatient_models_bundle = {\n 'los_model': los_model,\n 'readmission_model': calibrated_readmission_model,\n 'discharge_model': calibrated_discharge_model,\n 'feature_columns_los': best_los_features,\n 'feature_columns_readmission': best_readmission_features,\n 'feature_columns_discharge': best_discharge_features,\n 'le_discharge': le_discharge,\n 'model_run_id': MODEL_RUN_ID,\n 'fast_mode': FAST_MODE,\n 'excluded_feature_prefixes': EXCLUDE_FEATURE_PREFIXES\n}\n\n# Create a descriptive file name for the bundle.\nBUNDLE_SUFFIX = \"fast\" if FAST_MODE else \"fs\"\nEXCLUSION_FILE_TAG = f\"_excl_{'-'.join([p.strip('_').lower() for p in EXCLUDE_FEATURE_PREFIXES])}\" if EXCLUDE_FEATURE_PREFIXES else \"\"\nBUNDLE_FILE_NAME = f'inpatient_models_bundle_{MODEL_SOURCE_TAG}_{MODEL_YEAR_TAG}_{BUNDLE_SUFFIX}{EXCLUSION_FILE_TAG}.pkl'\n\n# Save the bundle locally using pickle.\nwith open(BUNDLE_FILE_NAME, 'wb') as f:\n pickle.dump(inpatient_models_bundle, f)\nprint(f\"Models bundled and saved locally to: {BUNDLE_FILE_NAME}\")\n\n# Upload the local bundle file to the specified Snowflake stage.\nput_result = session.file.put(BUNDLE_FILE_NAME, SNOWFLAKE_STAGE_NAME, overwrite=True)\nif put_result[0].status == 'UPLOADED':\n print(f\"Model bundle successfully uploaded to Snowflake stage: {SNOWFLAKE_STAGE_NAME}\")\nelse:\n print(f\"Error uploading model bundle. Status: {put_result[0].status}, Message: {put_result[0].message}\")\n\nfile_size_mb = os.path.getsize(BUNDLE_FILE_NAME) / (1024 * 1024)\nprint(f\"Saved local bundle file size: {file_size_mb:.2f} MB\")\n\nprint(f\"\\n✅ Script finished ({'FAST MODE' if FAST_MODE else 'FULL MODE'}).\")",
27
- "execution_count": null,
28
- "outputs": []
29
- }
30
- ]
31
- }
 
1
  {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "3775908f-ca36-4846-8f38-5adca39217f2",
7
+ "metadata": {
8
+ "language": "python",
9
+ "name": "cell1"
10
+ },
11
+ "outputs": [],
12
+ "source": [
13
+ "\"\"\"\n",
14
+ "End-to-End Inpatient Model Training and Evaluation Script\n",
15
+ "\n",
16
+ "This script performs the following operations for an inpatient dataset:\n",
17
+ "1. Loads data from a Snowflake table.\n",
18
+ "2. Performs data preprocessing, including one-hot encoding of categorical\n",
19
+ " features and standardization of column names.\n",
20
+ "3. Allows for the exclusion of specified feature groups (e.g., 'hcc_').\n",
21
+ "4. Provides a \"FAST_MODE\" to skip computationally intensive feature selection\n",
22
+ " for rapid testing.\n",
23
+ "5. Trains, calibrates, and evaluates three distinct models:\n",
24
+ " a. Length of Stay (Regression with XGBoost).\n",
25
+ " b. Readmission (Binary Classification with Calibrated Logistic Regression).\n",
26
+ " c. Discharge Location (Multiclass Classification with Calibrated Logistic\n",
27
+ " Regression).\n",
28
+ "6. Logs model performance metrics, feature importances, and feature frequency\n",
29
+ " statistics to separate Snowflake tables.\n",
30
+ "7. Saves the trained models, feature lists, and encoders into a single\n",
31
+ " pickle bundle file, then uploads it to a Snowflake stage.\n",
32
+ "\"\"\"\n",
33
+ "\n",
34
+ "import os\n",
35
+ "import pickle\n",
36
+ "import uuid\n",
37
+ "from datetime import datetime\n",
38
+ "\n",
39
+ "import matplotlib.pyplot as plt\n",
40
+ "import numpy as np\n",
41
+ "import pandas as pd\n",
42
+ "import xgboost as xgb\n",
43
+ "from sklearn.base import clone\n",
44
+ "from sklearn.calibration import CalibratedClassifierCV\n",
45
+ "from sklearn.linear_model import LogisticRegression\n",
46
+ "from sklearn.metrics import (\n",
47
+ " accuracy_score,\n",
48
+ " average_precision_score,\n",
49
+ " brier_score_loss,\n",
50
+ " classification_report,\n",
51
+ " log_loss,\n",
52
+ " mean_absolute_error,\n",
53
+ " mean_squared_error,\n",
54
+ " r2_score,\n",
55
+ " roc_auc_score,\n",
56
+ ")\n",
57
+ "from sklearn.model_selection import train_test_split\n",
58
+ "from sklearn.preprocessing import LabelEncoder\n",
59
+ "from snowflake.snowpark.context import get_active_session\n",
60
+ "\n",
61
+ "# =============================================================================\n",
62
+ "# 0. CONFIGURATION\n",
63
+ "# =============================================================================\n",
64
+ "# --- Snowflake Environment Settings ---\n",
65
+ "SNOWFLAKE_DATABASE = \"medicare_lds_five_multi_year\"\n",
66
+ "SNOWFLAKE_SCHEMA = \"BENCHMARKS\"\n",
67
+ "\n",
68
+ "# --- Input and Output Table/Stage Names ---\n",
69
+ "INPUT_TABLE = f\"{SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.BENCHMARKS_INPATIENT_INPUT\"\n",
70
+ "METRICS_TABLE_NAME = f\"{SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.MODEL_EVAL_METRICS_INPATIENT\"\n",
71
+ "FEATURE_FREQ_TABLE_NAME = f\"{SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.FEATURE_FREQUENCY_STATS_INPATIENT\"\n",
72
+ "FEATURE_IMPORTANCE_TABLE_NAME = f\"{SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.MODEL_FEATURE_IMPORTANCE_INPATIENT\"\n",
73
+ "SNOWFLAKE_STAGE_NAME = f\"@{SNOWFLAKE_SCHEMA}.BENCHMARK_STAGE\"\n",
74
+ "\n",
75
+ "# --- Model Run Metadata ---\n",
76
+ "# A unique ID for this entire script run.\n",
77
+ "MODEL_RUN_ID = str(uuid.uuid4())\n",
78
+ "# Tags to identify the source and year of the data used for training.\n",
79
+ "MODEL_SOURCE_TAG = \"medicare_lds\"\n",
80
+ "MODEL_YEAR_TAG = \"2023\"\n",
81
+ "\n",
82
+ "# --- Feature Exclusion Switch ---\n",
83
+ "# Define a list of feature prefixes to exclude from the model.\n",
84
+ "# For example, to exclude all HCC features, use [\"hcc_\"]. Set to [] for no exclusions.\n",
85
+ "EXCLUDE_FEATURE_PREFIXES = [\"hcc_\"]\n",
86
+ "\n",
87
+ "# --- Development Mode Switch ---\n",
88
+ "# If True, the script skips the computationally expensive feature selection step.\n",
89
+ "# This is useful for quick runs to test the script's functionality.\n",
90
+ "# Set to False for a full production run to find the optimal feature set.\n",
91
+ "FAST_MODE = False\n",
92
+ "\n",
93
+ "# =============================================================================\n",
94
+ "# 1. SETUP: SNOWFLAKE SESSION & SCRIPT INITIALIZATION\n",
95
+ "# =============================================================================\n",
96
+ "session = get_active_session()\n",
97
+ "print(f\"Active session created. Model Run ID: {MODEL_RUN_ID}\")\n",
98
+ "\n",
99
+ "if FAST_MODE:\n",
100
+ " print(\"\\n\" + \"=\" * 50)\n",
101
+ " print(\"🚀 FAST MODE IS ENABLED 🚀\")\n",
102
+ " print(\"Feature selection will be skipped for all models.\")\n",
103
+ " print(\"=\" * 50 + \"\\n\")\n",
104
+ "\n",
105
+ "\n",
106
+ "# =============================================================================\n",
107
+ "# 2. DATA LOADING & PREPARATION\n",
108
+ "# =============================================================================\n",
109
+ "first_month = f\"{MODEL_YEAR_TAG}01\"\n",
110
+ "\n",
111
+ "print(\"Loading and preparing data from Snowflake...\")\n",
112
+ "query = f\"\"\"\n",
113
+ "SELECT *\n",
114
+ "FROM {INPUT_TABLE}\n",
115
+ "WHERE YEAR_NBR = {MODEL_YEAR_TAG}\n",
116
+ "AND FIRST_MONTH = {first_month}\n",
117
+ "\"\"\"\n",
118
+ "df_pd = session.sql(query).to_pandas()\n",
119
+ "\n",
120
+ "# Standardize all column names to lowercase for consistency.\n",
121
+ "df_pd.columns = df_pd.columns.str.lower()\n",
122
+ "print(\"Standardized DataFrame column names to lowercase.\")\n",
123
+ "\n",
124
+ "# One-hot encode specified categorical variables.\n",
125
+ "categorical_cols = ['state', 'race', 'sex', 'ms_drg_code', 'ccsr_cat']\n",
126
+ "df_pd_encoded = pd.get_dummies(df_pd, columns=[col for col in categorical_cols if col in df_pd.columns])\n",
127
+ "\n",
128
+ "# Define all potential feature groups.\n",
129
+ "condition_columns = [col for col in df_pd_encoded.columns if col.startswith(('cond_', 'cms_', 'hcc_'))]\n",
130
+ "other_columns = ['age_at_admit']\n",
131
+ "dummy_prefixes = tuple(f'{col}_' for col in categorical_cols)\n",
132
+ "dummy_columns = [col for col in df_pd_encoded.columns if col.startswith(dummy_prefixes)]\n",
133
+ "\n",
134
+ "# Ensure 'age_at_admit' exists before including it.\n",
135
+ "if 'age_at_admit' not in df_pd_encoded.columns and 'age_at_admit' in other_columns:\n",
136
+ " print(\"Warning: 'age_at_admit' not found in features. Removing it.\")\n",
137
+ " other_columns.remove('age_at_admit')\n",
138
+ "\n",
139
+ "# Combine all potential features into a single master list.\n",
140
+ "all_possible_features = other_columns + condition_columns + dummy_columns\n",
141
+ "\n",
142
+ "# Filter out features based on the exclusion configuration.\n",
143
+ "print(f\"Excluding feature prefixes: {EXCLUDE_FEATURE_PREFIXES}\")\n",
144
+ "if EXCLUDE_FEATURE_PREFIXES:\n",
145
+ " initial_feature_count = len(all_possible_features)\n",
146
+ " # A feature is kept if it does NOT start with any of the excluded prefixes.\n",
147
+ " features_to_keep = [\n",
148
+ " f for f in all_possible_features\n",
149
+ " if not any(f.startswith(prefix) for prefix in EXCLUDE_FEATURE_PREFIXES)\n",
150
+ " ]\n",
151
+ " print(f\"Filtered features: Kept {len(features_to_keep)} out of {initial_feature_count} potential features.\")\n",
152
+ "else:\n",
153
+ " features_to_keep = all_possible_features\n",
154
+ " print(\"No feature prefixes specified for exclusion. Using all defined features.\")\n",
155
+ "\n",
156
+ "# The final list of features to be used for training.\n",
157
+ "X_columns = [col for col in features_to_keep if col in df_pd_encoded.columns]\n",
158
+ "X = df_pd_encoded[X_columns]\n",
159
+ "\n",
160
+ "print(f\"Data loaded. Shape of final feature matrix X: {X.shape}\")\n",
161
+ "print(f\"Number of features after exclusion: {len(X_columns)}\")\n",
162
+ "\n",
163
+ "\n",
164
+ "def create_feature_frequency_table_if_not_exists(session, table_name):\n",
165
+ " \"\"\"Ensures the feature frequency statistics table exists in Snowflake.\"\"\"\n",
166
+ " session.sql(f\"\"\"\n",
167
+ " CREATE TABLE IF NOT EXISTS {table_name} (\n",
168
+ " MODEL_RUN_ID STRING,\n",
169
+ " FEATURE_NAME STRING,\n",
170
+ " POSITIVE_COUNT NUMBER,\n",
171
+ " TOTAL_ROWS NUMBER,\n",
172
+ " POSITIVE_RATE_PERCENT FLOAT,\n",
173
+ " EVAL_TS TIMESTAMP_NTZ\n",
174
+ " );\n",
175
+ " \"\"\").collect()\n",
176
+ " print(f\"Ensured feature frequency table {table_name} exists.\")\n",
177
+ "\n",
178
+ "# --- Analyze and log feature sparsity ---\n",
179
+ "create_feature_frequency_table_if_not_exists(session, FEATURE_FREQ_TABLE_NAME)\n",
180
+ "print(\"\\n--- Analysis: Positive Feature Rates (Sparsity Check on Training Data) ---\")\n",
181
+ "total_rows = len(X)\n",
182
+ "positive_counts = (X > 0).sum()\n",
183
+ "positive_rates = (positive_counts / total_rows) * 100\n",
184
+ "positive_rate_summary = pd.DataFrame({\n",
185
+ " 'feature': X.columns,\n",
186
+ " 'positive_count': positive_counts,\n",
187
+ " 'total_rows': total_rows,\n",
188
+ " 'positive_rate_percent': positive_rates\n",
189
+ "}).sort_values(by='positive_rate_percent', ascending=False).reset_index(drop=True)\n",
190
+ "\n",
191
+ "print(\"Positive (non-zero) rates for all features in the final training input (X), sorted descending:\")\n",
192
+ "with pd.option_context('display.max_rows', 20, 'display.max_columns', None, 'display.width', 120):\n",
193
+ " print(positive_rate_summary)\n",
194
+ "\n",
195
+ "print(f\"\\nSaving feature frequency statistics to {FEATURE_FREQ_TABLE_NAME}...\")\n",
196
+ "df_to_save = positive_rate_summary.copy()\n",
197
+ "df_to_save['MODEL_RUN_ID'] = MODEL_RUN_ID\n",
198
+ "df_to_save['EVAL_TS'] = datetime.utcnow()\n",
199
+ "df_to_save.rename(columns={\n",
200
+ " 'feature': 'FEATURE_NAME', 'positive_count': 'POSITIVE_COUNT',\n",
201
+ " 'total_rows': 'TOTAL_ROWS', 'positive_rate_percent': 'POSITIVE_RATE_PERCENT'\n",
202
+ "}, inplace=True)\n",
203
+ "final_column_order = ['MODEL_RUN_ID', 'FEATURE_NAME', 'POSITIVE_COUNT', 'TOTAL_ROWS', 'POSITIVE_RATE_PERCENT', 'EVAL_TS']\n",
204
+ "df_to_save = df_to_save[final_column_order]\n",
205
+ "session.create_dataframe(df_to_save).write.mode(\"append\").save_as_table(FEATURE_FREQ_TABLE_NAME)\n",
206
+ "print(\"Successfully saved feature frequency statistics to Snowflake.\")\n",
207
+ "\n",
208
+ "# =============================================================================\n",
209
+ "# 3. UTILITY FUNCTIONS: METRICS, LOGGING, AND FEATURE SELECTION\n",
210
+ "# =============================================================================\n",
211
+ "\n",
212
+ "def calculate_regression_metrics(y_true, y_pred):\n",
213
+ " \"\"\"Calculates a set of standard regression metrics.\"\"\"\n",
214
+ " y_true_np, y_pred_np = np.array(y_true), np.array(y_pred)\n",
215
+ " sum_y_true, mean_y_true = np.sum(y_true_np), np.mean(y_true_np)\n",
216
+ " pred_ratio = np.sum(y_pred_np) / sum_y_true if sum_y_true != 0 else np.nan\n",
217
+ " mae_percent = (mean_absolute_error(y_true_np, y_pred_np) / mean_y_true) * 100 if mean_y_true != 0 else np.nan\n",
218
+ " return {\n",
219
+ " 'R2': r2_score(y_true_np, y_pred_np), 'MAE': mean_absolute_error(y_true_np, y_pred_np),\n",
220
+ " 'MSE': mean_squared_error(y_true_np, y_pred_np), 'PRED_RATIO': pred_ratio, 'MAE_PERCENT': mae_percent,\n",
221
+ " 'AVG_Y_PRED': np.mean(y_pred_np), 'AVG_Y_TRUE': mean_y_true\n",
222
+ " }\n",
223
+ "\n",
224
+ "def calculate_binary_classification_proba_metrics(y_true, y_pred_proba):\n",
225
+ " \"\"\"Calculates a set of standard binary classification metrics from probabilities.\"\"\"\n",
226
+ " y_true_np, y_pred_proba_np = np.array(y_true), np.array(y_pred_proba)\n",
227
+ " is_multiclass = len(np.unique(y_true_np)) > 1\n",
228
+ " auc_roc = roc_auc_score(y_true_np, y_pred_proba_np) if is_multiclass else np.nan\n",
229
+ " auc_pr = average_precision_score(y_true_np, y_pred_proba_np) if is_multiclass else np.nan\n",
230
+ " return {\n",
231
+ " 'AUC_ROC': auc_roc, 'AUC_PR': auc_pr, 'LOG_LOSS': log_loss(y_true_np, y_pred_proba_np),\n",
232
+ " 'BRIER_SCORE': brier_score_loss(y_true_np, y_pred_proba_np),\n",
233
+ " 'AVG_Y_PRED_PROBA': np.mean(y_pred_proba_np), 'AVG_Y_TRUE': np.mean(y_true_np)\n",
234
+ " }\n",
235
+ "\n",
236
+ "def calculate_multiclass_classification_metrics(y_true_encoded, y_pred_labels, y_pred_proba, le_classes):\n",
237
+ " \"\"\"Calculates overall and per-class metrics for multiclass classification.\"\"\"\n",
238
+ " num_samples, num_classes = len(y_true_encoded), len(le_classes)\n",
239
+ " metrics = {\n",
240
+ " 'ACCURACY': accuracy_score(y_true_encoded, y_pred_labels),\n",
241
+ " 'LOG_LOSS': log_loss(y_true_encoded, y_pred_proba, labels=np.arange(num_classes))\n",
242
+ " }\n",
243
+ " per_class_details, all_brier_scores = [], []\n",
244
+ " if num_samples > 0 and num_classes > 0:\n",
245
+ " for i in range(num_classes):\n",
246
+ " class_name = le_classes[i]\n",
247
+ " true_class_binary = (y_true_encoded == i).astype(int)\n",
248
+ " pred_proba_for_class = y_pred_proba[:, i]\n",
249
+ " avg_pred_proba_class = np.mean(pred_proba_for_class)\n",
250
+ " true_proportion_class = np.mean(true_class_binary)\n",
251
+ " proba_ratio_class = avg_pred_proba_class / true_proportion_class if true_proportion_class > 0 else np.nan\n",
252
+ " brier_score_class = brier_score_loss(true_class_binary, pred_proba_for_class) if len(np.unique(true_class_binary)) > 1 else np.nan\n",
253
+ " all_brier_scores.append(brier_score_class)\n",
254
+ " per_class_details.append({\n",
255
+ " \"class_name\": class_name,\n",
256
+ " \"avg_pred_proba\": avg_pred_proba_class,\n",
257
+ " \"true_proportion\": true_proportion_class,\n",
258
+ " \"proba_ratio\": proba_ratio_class,\n",
259
+ " \"brier_score\": brier_score_class\n",
260
+ " })\n",
261
+ " metrics['per_class_details'] = per_class_details\n",
262
+ " valid_brier_scores = [s for s in all_brier_scores if not np.isnan(s)]\n",
263
+ " metrics['BRIER_SCORE_MACRO_AVG'] = np.mean(valid_brier_scores) if valid_brier_scores else np.nan\n",
264
+ " return metrics\n",
265
+ "\n",
266
+ "def create_metrics_table_if_not_exists(session, table_name):\n",
267
+ " \"\"\"Ensures the main model metrics table exists in Snowflake.\"\"\"\n",
268
+ " session.sql(f\"\"\"\n",
269
+ " CREATE TABLE IF NOT EXISTS {table_name} (\n",
270
+ " MODEL_RUN_ID STRING, MODEL_NAME STRING, TARGET_NAME STRING, R2 FLOAT, MAE FLOAT, MSE FLOAT,\n",
271
+ " PRED_RATIO FLOAT, MAE_PERCENT FLOAT, AUC_ROC FLOAT, AUC_PR FLOAT, LOG_LOSS FLOAT,\n",
272
+ " BRIER_SCORE FLOAT, ACCURACY FLOAT, AVG_Y_PRED FLOAT, AVG_Y_TRUE FLOAT, MODEL_SOURCE STRING,\n",
273
+ " MODEL_TYPE STRING, MODEL_YEAR STRING, EVAL_TS TIMESTAMP_NTZ\n",
274
+ " );\n",
275
+ " \"\"\").collect()\n",
276
+ " print(f\"Ensured metrics table {table_name} exists.\")\n",
277
+ "\n",
278
+ "def create_feature_importance_table_if_not_exists(session, table_name):\n",
279
+ " \"\"\"Ensures the feature importance table exists in Snowflake.\"\"\"\n",
280
+ " session.sql(f\"\"\"\n",
281
+ " CREATE TABLE IF NOT EXISTS {table_name} (\n",
282
+ " MODEL_RUN_ID STRING,\n",
283
+ " MODEL_NAME STRING,\n",
284
+ " TARGET_NAME STRING,\n",
285
+ " FEATURE_NAME STRING,\n",
286
+ " IMPORTANCE_VALUE FLOAT,\n",
287
+ " IMPORTANCE_RANK NUMBER,\n",
288
+ " EVAL_TS TIMESTAMP_NTZ\n",
289
+ " );\n",
290
+ " \"\"\").collect()\n",
291
+ " print(f\"Ensured feature importance table {table_name} exists.\")\n",
292
+ "\n",
293
+ "def log_model_metrics_to_snowflake(session, model_run_id, model_name, target_name, metrics_dict, model_type, metrics_table, model_source_tag, model_year_tag):\n",
294
+ " \"\"\"Constructs a payload and logs model metrics to a Snowflake table.\"\"\"\n",
295
+ " avg_y_pred = metrics_dict.get('AVG_Y_PRED', metrics_dict.get('AVG_Y_PRED_PROBA'))\n",
296
+ " full_metrics_payload = {\n",
297
+ " \"MODEL_RUN_ID\": model_run_id, \"MODEL_NAME\": model_name, \"TARGET_NAME\": target_name,\n",
298
+ " \"R2\": metrics_dict.get('R2'), \"MAE\": metrics_dict.get('MAE'), \"MSE\": metrics_dict.get('MSE'),\n",
299
+ " \"PRED_RATIO\": metrics_dict.get('PRED_RATIO'), \"MAE_PERCENT\": metrics_dict.get('MAE_PERCENT'),\n",
300
+ " \"AUC_ROC\": metrics_dict.get('AUC_ROC'), \"AUC_PR\": metrics_dict.get('AUC_PR'),\n",
301
+ " \"LOG_LOSS\": metrics_dict.get('LOG_LOSS'), \"BRIER_SCORE\": metrics_dict.get('BRIER_SCORE'),\n",
302
+ " \"ACCURACY\": metrics_dict.get('ACCURACY'), \"AVG_Y_PRED\": avg_y_pred,\n",
303
+ " \"AVG_Y_TRUE\": metrics_dict.get('AVG_Y_TRUE'), \"MODEL_SOURCE\": model_source_tag,\n",
304
+ " \"MODEL_TYPE\": model_type, \"MODEL_YEAR\": model_year_tag, \"EVAL_TS\": datetime.utcnow()\n",
305
+ " }\n",
306
+ " # Round floats and handle NaNs for database compatibility\n",
307
+ " for key, value in full_metrics_payload.items():\n",
308
+ " if isinstance(value, (float, np.floating)):\n",
309
+ " full_metrics_payload[key] = round(value, 6) if not np.isnan(value) else None\n",
310
+ " \n",
311
+ " dfm = pd.DataFrame([full_metrics_payload])\n",
312
+ " ordered_cols = [\n",
313
+ " \"MODEL_RUN_ID\", \"MODEL_NAME\", \"TARGET_NAME\", \"R2\", \"MAE\", \"MSE\",\n",
314
+ " \"PRED_RATIO\", \"MAE_PERCENT\", \"AUC_ROC\", \"AUC_PR\", \"LOG_LOSS\",\n",
315
+ " \"BRIER_SCORE\", \"ACCURACY\", \"AVG_Y_PRED\", \"AVG_Y_TRUE\", \"MODEL_SOURCE\",\n",
316
+ " \"MODEL_TYPE\", \"MODEL_YEAR\", \"EVAL_TS\"\n",
317
+ " ]\n",
318
+ " dfm = dfm[ordered_cols]\n",
319
+ " session.create_dataframe(dfm).write.mode(\"append\").save_as_table(metrics_table)\n",
320
+ " print(f\"Logged metrics for {model_name} - {target_name} to {metrics_table}.\")\n",
321
+ "\n",
322
+ "def log_feature_importances_to_snowflake(session, model, feature_names, model_run_id, model_name, target_name, table_name):\n",
323
+ " \"\"\"Extracts, ranks, and logs feature importances to a Snowflake table.\"\"\"\n",
324
+ " if hasattr(model, 'feature_importances_'):\n",
325
+ " importances = model.feature_importances_\n",
326
+ " elif hasattr(model, 'coef_'):\n",
327
+ " # For multi-class logistic regression, average the absolute coefficients across classes\n",
328
+ " importances = np.mean(np.abs(model.coef_), axis=0) if model.coef_.ndim > 1 else np.abs(model.coef_[0])\n",
329
+ " else:\n",
330
+ " print(f\"Warning: Model type for '{model_name}' does not have 'feature_importances_' or 'coef_'. Skipping importance logging.\")\n",
331
+ " return\n",
332
+ "\n",
333
+ " importance_df = pd.DataFrame({'FEATURE_NAME': feature_names, 'IMPORTANCE_VALUE': importances})\n",
334
+ " importance_df = importance_df.sort_values(by='IMPORTANCE_VALUE', ascending=False).reset_index(drop=True)\n",
335
+ " importance_df['IMPORTANCE_RANK'] = importance_df.index + 1\n",
336
+ " importance_df['MODEL_RUN_ID'] = model_run_id\n",
337
+ " importance_df['MODEL_NAME'] = model_name\n",
338
+ " importance_df['TARGET_NAME'] = target_name\n",
339
+ " importance_df['EVAL_TS'] = datetime.utcnow()\n",
340
+ "\n",
341
+ " final_cols = ['MODEL_RUN_ID', 'MODEL_NAME', 'TARGET_NAME', 'FEATURE_NAME', 'IMPORTANCE_VALUE', 'IMPORTANCE_RANK', 'EVAL_TS']\n",
342
+ " importance_df = importance_df[final_cols]\n",
343
+ " \n",
344
+ " session.create_dataframe(importance_df).write.mode(\"append\").save_as_table(table_name)\n",
345
+ " print(f\"Logged {len(importance_df)} feature importances for {model_name} - {target_name} to {table_name}.\")\n",
346
+ "\n",
347
+ "def find_best_feature_subset(model, X_train, y_train, X_val, y_val, scoring_func, higher_is_better, model_name, feature_counts_to_test=None):\n",
348
+ " \"\"\"\n",
349
+ " Performs recursive feature elimination to find the most parsimonious feature set.\n",
350
+ "\n",
351
+ " This function first ranks all features by importance, then iteratively tests smaller\n",
352
+ " subsets of the top features. It selects the smallest feature set that performs\n",
353
+ " within a small tolerance of the absolute best-performing set.\n",
354
+ " \"\"\"\n",
355
+ " print(f\"\\n--- [{model_name}] Starting feature selection process ---\")\n",
356
+ " \n",
357
+ " # Step 1: Rank all features by importance using the full training set.\n",
358
+ " print(\"Step 1: Ranking all features by importance...\")\n",
359
+ " ranker_model = clone(model)\n",
360
+ " ranker_model.fit(X_train, y_train)\n",
361
+ "\n",
362
+ " if hasattr(ranker_model, 'feature_importances_'):\n",
363
+ " importances = ranker_model.feature_importances_\n",
364
+ " elif hasattr(ranker_model, 'coef_'):\n",
365
+ " importances = np.mean(np.abs(ranker_model.coef_), axis=0) if ranker_model.coef_.ndim > 1 else np.abs(ranker_model.coef_[0])\n",
366
+ " else:\n",
367
+ " raise TypeError(\"Model type not supported for feature importance extraction.\")\n",
368
+ "\n",
369
+ " feature_importance_df = pd.DataFrame({'feature': X_train.columns, 'importance': importances}).sort_values('importance', ascending=False)\n",
370
+ " ranked_features = feature_importance_df['feature'].tolist()\n",
371
+ " \n",
372
+ " # Step 2: Evaluate model performance on different feature subset sizes.\n",
373
+ " print(\"Step 2: Evaluating model performance on different feature subset sizes...\")\n",
374
+ " if feature_counts_to_test is None:\n",
375
+ " n_total_features = len(ranked_features)\n",
376
+ " # Define a dynamic set of feature counts to test.\n",
377
+ " fractions = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.15, 0.1, 0.05]\n",
378
+ " fractional_counts = [int(n_total_features * f) for f in fractions]\n",
379
+ " absolute_counts = [250, 200, 150, 100, 75, 50, 25, 10]\n",
380
+ " counts = set([n_total_features] + fractional_counts + absolute_counts)\n",
381
+ " feature_counts_to_test = sorted([n for n in counts if 0 < n <= n_total_features], reverse=True)\n",
382
+ " print(f\"Generated feature counts to test: {feature_counts_to_test}\")\n",
383
+ " \n",
384
+ " val_scores = []\n",
385
+ " for n_features in feature_counts_to_test:\n",
386
+ " subset_features = ranked_features[:n_features]\n",
387
+ " loop_model = clone(model)\n",
388
+ " loop_model.fit(X_train[subset_features], y_train)\n",
389
+ " \n",
390
+ " # Predict based on model type (regressor vs. classifier).\n",
391
+ " if hasattr(loop_model, 'predict_proba'):\n",
392
+ " val_preds = loop_model.predict_proba(X_val[subset_features])\n",
393
+ " # For binary classification, use the probability of the positive class.\n",
394
+ " if len(np.unique(y_train)) == 2 and val_preds.shape[1] == 2:\n",
395
+ " val_preds = val_preds[:, 1]\n",
396
+ " else:\n",
397
+ " val_preds = loop_model.predict(X_val[subset_features])\n",
398
+ "\n",
399
+ " val_score = scoring_func(y_val, val_preds)\n",
400
+ " val_scores.append(val_score)\n",
401
+ " print(f\" - Tested with {n_features:4} features: Validation Score = {val_score:.4f}\")\n",
402
+ "\n",
403
+ " # Step 3: Select the most parsimonious model within a tolerance of the best score.\n",
404
+ " TOLERANCE = 0.01 # 1% tolerance\n",
405
+ " if higher_is_better:\n",
406
+ " best_val_score = np.max(val_scores)\n",
407
+ " score_threshold = best_val_score * (1 - TOLERANCE)\n",
408
+ " candidate_indices = np.where(val_scores >= score_threshold)[0]\n",
409
+ " else: # lower is better (e.g., for MSE or LogLoss)\n",
410
+ " best_val_score = np.min(val_scores)\n",
411
+ " score_threshold = best_val_score * (1 + TOLERANCE)\n",
412
+ " candidate_indices = np.where(val_scores <= score_threshold)[0]\n",
413
+ " \n",
414
+ " # The last index in the candidates corresponds to the smallest model (most parsimonious).\n",
415
+ " best_parsimonious_idx = candidate_indices[-1]\n",
416
+ " best_score_idx = np.argmax(val_scores) if higher_is_better else np.argmin(val_scores)\n",
417
+ " optimal_n_features = feature_counts_to_test[best_parsimonious_idx]\n",
418
+ " best_features = ranked_features[:optimal_n_features]\n",
419
+ " \n",
420
+ " print(f\"\\nStep 3: Found optimal feature set using the parsimony principle.\")\n",
421
+ " print(f\" - Absolute best validation score: {val_scores[best_score_idx]:.4f} with {feature_counts_to_test[best_score_idx]} features.\")\n",
422
+ " print(f\" - Score threshold (with {TOLERANCE*100}% tolerance): {score_threshold:.4f}\")\n",
423
+ " print(f\" - Chosen parsimonious model: {val_scores[best_parsimonious_idx]:.4f} with {optimal_n_features} features.\")\n",
424
+ " \n",
425
+ " # Plot the results for visualization.\n",
426
+ " plt.figure(figsize=(12, 7))\n",
427
+ " plt.plot(feature_counts_to_test, val_scores, 'o-', label=f'Validation Set Score ({scoring_func.__name__})')\n",
428
+ " plt.axvline(x=feature_counts_to_test[best_score_idx], color='grey', linestyle=':', label=f'Absolute Best Score ({feature_counts_to_test[best_score_idx]} features)')\n",
429
+ " plt.axvline(x=optimal_n_features, color='r', linestyle='--', label=f'Chosen Parsimonious Model ({optimal_n_features} features)')\n",
430
+ " plt.title(f'[{model_name}] Performance vs. Number of Features')\n",
431
+ " plt.xlabel('Number of Top Features Used')\n",
432
+ " plt.ylabel('Score')\n",
433
+ " plt.legend()\n",
434
+ " plt.grid(True, which='both', linestyle='--', linewidth=0.5)\n",
435
+ " plt.gca().invert_xaxis()\n",
436
+ " plt.show()\n",
437
+ " \n",
438
+ " return best_features\n",
439
+ "\n",
440
+ "# Ensure logging tables exist before training begins.\n",
441
+ "create_metrics_table_if_not_exists(session, METRICS_TABLE_NAME)\n",
442
+ "create_feature_importance_table_if_not_exists(session, FEATURE_IMPORTANCE_TABLE_NAME)\n",
443
+ "\n",
444
+ "# =============================================================================\n",
445
+ "# 4. MODEL TRAINING & EVALUATION\n",
446
+ "# =============================================================================\n",
447
+ "\n",
448
+ "# --- Define dynamic model name suffixes based on configuration ---\n",
449
+ "MODEL_NAME_SUFFIX = \"Fast\" if FAST_MODE else \"FeatureSelected\"\n",
450
+ "if EXCLUDE_FEATURE_PREFIXES:\n",
451
+ " cleaned_prefixes = [p.strip('_') for p in EXCLUDE_FEATURE_PREFIXES]\n",
452
+ " exclusion_tag = \"-\".join(cleaned_prefixes)\n",
453
+ " EXCLUSION_SUFFIX = f\"_Excl-{exclusion_tag}\"\n",
454
+ "else:\n",
455
+ " EXCLUSION_SUFFIX = \"\"\n",
456
+ "\n",
457
+ "# --- 4.1 Model 1: Predicting Length of Stay (Regression) ---\n",
458
+ "print(\"\\n\" + \"=\"*80)\n",
459
+ "print(\"--- Training Model 1: Length of Stay (LOS) with Gamma Objective ---\")\n",
460
+ "print(\"=\"*80)\n",
461
+ "TARGET_LOS = 'length_of_stay'\n",
462
+ "los_model, best_los_features = None, None\n",
463
+ "\n",
464
+ "if TARGET_LOS not in df_pd.columns:\n",
465
+ " print(f\"Error: Target column '{TARGET_LOS}' not found. Skipping LOS model.\")\n",
466
+ "else:\n",
467
+ " y_los = df_pd[TARGET_LOS].astype(float)\n",
468
+ " # The Gamma objective requires positive target values.\n",
469
+ " if (y_los <= 0).any():\n",
470
+ " print(f\"Warning: Found {(y_los <= 0).sum()} non-positive values in '{TARGET_LOS}'. Clamping to a small positive number.\")\n",
471
+ " y_los = y_los.clip(lower=0.001)\n",
472
+ "\n",
473
+ " X_train_los, X_test_los, y_train_los, y_test_los = train_test_split(X, y_los, test_size=0.2, random_state=42)\n",
474
+ " X_train_fs_los, X_val_fs_los, y_train_fs_los, y_val_fs_los = train_test_split(X_train_los, y_train_los, test_size=0.25, random_state=42)\n",
475
+ "\n",
476
+ " base_los_model = xgb.XGBRegressor(objective='reg:gamma', random_state=42, n_estimators=1000, learning_rate=0.05, max_depth=7, subsample=0.8, colsample_bytree=0.8, eval_metric='gamma-deviance')\n",
477
+ " \n",
478
+ " # Determine the feature set to use.\n",
479
+ " if FAST_MODE:\n",
480
+ " print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n",
481
+ " best_los_features = X.columns.tolist()\n",
482
+ " else:\n",
483
+ " best_los_features = find_best_feature_subset(\n",
484
+ " model=base_los_model,\n",
485
+ " X_train=X_train_fs_los, y_train=y_train_fs_los,\n",
486
+ " X_val=X_val_fs_los, y_val=y_val_fs_los,\n",
487
+ " scoring_func=mean_squared_error,\n",
488
+ " higher_is_better=False,\n",
489
+ " model_name=\"Length of Stay (XGBoost)\"\n",
490
+ " )\n",
491
+ "\n",
492
+ " print(f\"\\nTraining final LOS model using {len(best_los_features)} features...\")\n",
493
+ " los_model = clone(base_los_model)\n",
494
+ " los_model.set_params(early_stopping_rounds=50)\n",
495
+ " eval_set = [(X_val_fs_los[best_los_features], y_val_fs_los)]\n",
496
+ " los_model.fit(X_train_los[best_los_features], y_train_los, eval_set=eval_set, verbose=False)\n",
497
+ " print(f\"Optimal number of trees found via early stopping: {los_model.best_iteration}\")\n",
498
+ "\n",
499
+ " y_pred_los = los_model.predict(X_test_los[best_los_features])\n",
500
+ "\n",
501
+ " print(\"\\nLOS Model - Test Set Evaluation:\")\n",
502
+ " los_metrics = calculate_regression_metrics(y_test_los, y_pred_los)\n",
503
+ " for k, v in los_metrics.items(): print(f\" {k}: {v:.4f}\")\n",
504
+ "\n",
505
+ " # Log metrics and feature importances.\n",
506
+ " los_model_name = f\"Inpatient_LOS_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
507
+ " log_model_metrics_to_snowflake(session, MODEL_RUN_ID, los_model_name, TARGET_LOS, los_metrics, \"Regression\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n",
508
+ " log_feature_importances_to_snowflake(session, los_model, best_los_features, MODEL_RUN_ID, los_model_name, TARGET_LOS, FEATURE_IMPORTANCE_TABLE_NAME)\n",
509
+ "\n",
510
+ "# --- 4.2 Model 2: Predicting Readmission (Binary Classification) ---\n",
511
+ "print(\"\\n\" + \"=\"*80)\n",
512
+ "print(\"--- Training Model 2: Calibrated Readmission Probability ---\")\n",
513
+ "print(\"=\"*80)\n",
514
+ "TARGET_READMISSION = 'readmission_numerator'\n",
515
+ "DENOMINATOR_COL = 'readmission_denominator'\n",
516
+ "calibrated_readmission_model, best_readmission_features = None, None\n",
517
+ "\n",
518
+ "if TARGET_READMISSION not in df_pd.columns or DENOMINATOR_COL not in df_pd.columns:\n",
519
+ " print(f\"Error: Required columns '{TARGET_READMISSION}' or '{DENOMINATOR_COL}' not found. Skipping Readmission model.\")\n",
520
+ "else:\n",
521
+ " # Filter data to only include encounters eligible for readmission.\n",
522
+ " readmission_filter_mask = df_pd[DENOMINATOR_COL] == 1\n",
523
+ " if readmission_filter_mask.sum() == 0:\n",
524
+ " print(\"Error: The filter condition resulted in zero encounters. Skipping Readmission model.\")\n",
525
+ " else:\n",
526
+ " print(f\"Filtering data for Readmission Model where '{DENOMINATOR_COL}' = 1 ({readmission_filter_mask.sum()} rows).\")\n",
527
+ " X_readmission = X.loc[readmission_filter_mask].reset_index(drop=True)\n",
528
+ " y_readmission = df_pd.loc[readmission_filter_mask, TARGET_READMISSION].reset_index(drop=True)\n",
529
+ " \n",
530
+ " # Split data: 60% base train, 20% calibration, 20% test\n",
531
+ " stratify_readmission = y_readmission if len(np.unique(y_readmission)) > 1 else None\n",
532
+ " X_train_full, X_test_read, y_train_full, y_test_read = train_test_split(X_readmission, y_readmission, test_size=0.2, random_state=42, stratify=stratify_readmission)\n",
533
+ " stratify_y_train_full = y_train_full if len(np.unique(y_train_full)) > 1 else None\n",
534
+ " X_train_base, X_calib_read, y_train_base, y_calib_read = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42, stratify=stratify_y_train_full)\n",
535
+ " print(f\"Data split for readmission: Base train: {X_train_base.shape[0]}, Calibration: {X_calib_read.shape[0]}, Test: {X_test_read.shape[0]}\")\n",
536
+ " \n",
537
+ " base_readmit_model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000, solver='liblinear')\n",
538
+ " \n",
539
+ " # Determine the feature set to use.\n",
540
+ " if FAST_MODE:\n",
541
+ " print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n",
542
+ " best_readmission_features = X_train_base.columns.tolist()\n",
543
+ " else:\n",
544
+ " best_readmission_features = find_best_feature_subset(\n",
545
+ " model=base_readmit_model, X_train=X_train_base, y_train=y_train_base, X_val=X_calib_read, y_val=y_calib_read,\n",
546
+ " scoring_func=roc_auc_score, higher_is_better=True, model_name=\"Readmission (Logistic Regression)\"\n",
547
+ " )\n",
548
+ "\n",
549
+ " print(f\"\\nTraining final Readmission model pipeline using {len(best_readmission_features)} features...\")\n",
550
+ " base_model_for_calib = clone(base_readmit_model)\n",
551
+ " base_model_for_calib.fit(X_train_base[best_readmission_features], y_train_base)\n",
552
+ " \n",
553
+ " # Log feature importances from the base (uncalibrated) model.\n",
554
+ " uncal_read_model_name = f\"Inpatient_Readmission_Base_Uncalibrated_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
555
+ " log_feature_importances_to_snowflake(session, base_model_for_calib, best_readmission_features, MODEL_RUN_ID, uncal_read_model_name, TARGET_READMISSION, FEATURE_IMPORTANCE_TABLE_NAME)\n",
556
+ " \n",
557
+ " # Evaluate and log metrics for the uncalibrated model for comparison.\n",
558
+ " y_pred_proba_uncal = base_model_for_calib.predict_proba(X_test_read[best_readmission_features])[:, 1]\n",
559
+ " uncalibrated_metrics = calculate_binary_classification_proba_metrics(y_test_read, y_pred_proba_uncal)\n",
560
+ " log_model_metrics_to_snowflake(session, MODEL_RUN_ID, uncal_read_model_name, TARGET_READMISSION + \"_Probability\", uncalibrated_metrics, \"Binary_Uncalibrated\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n",
561
+ " \n",
562
+ " # Calibrate the model on the held-out calibration set.\n",
563
+ " calibrated_readmission_model = CalibratedClassifierCV(base_model_for_calib, method='isotonic', cv='prefit')\n",
564
+ " calibrated_readmission_model.fit(X_calib_read[best_readmission_features], y_calib_read)\n",
565
+ " y_pred_proba_cal = calibrated_readmission_model.predict_proba(X_test_read[best_readmission_features])[:, 1]\n",
566
+ "\n",
567
+ " print(\"\\nCalibrated Readmission Model - Test Set Evaluation:\")\n",
568
+ " calibrated_proba_metrics = calculate_binary_classification_proba_metrics(y_test_read, y_pred_proba_cal)\n",
569
+ " for k, v in calibrated_proba_metrics.items(): print(f\" {k}: {v:.4f}\")\n",
570
+ " \n",
571
+ " cal_read_model_name = f\"Inpatient_Readmission_Calibrated_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
572
+ " log_model_metrics_to_snowflake(session, MODEL_RUN_ID, cal_read_model_name, TARGET_READMISSION + \"_Probability\", calibrated_proba_metrics, \"Binary_Calibrated\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n",
573
+ "\n",
574
+ "# --- 4.3 Model 3: Predicting Discharge Location (Multiclass Classification) ---\n",
575
+ "print(\"\\n\" + \"=\"*80)\n",
576
+ "print(\"--- Training Model 3: Calibrated Discharge Location ---\")\n",
577
+ "print(\"=\"*80)\n",
578
+ "TARGET_DISCHARGE = 'discharge_location'\n",
579
+ "calibrated_discharge_model, le_discharge, best_discharge_features = None, None, None\n",
580
+ "\n",
581
+ "if TARGET_DISCHARGE not in df_pd.columns:\n",
582
+ " print(f\"Error: Target column '{TARGET_DISCHARGE}' not found. Skipping Discharge Location model.\")\n",
583
+ "else:\n",
584
+ " le_discharge = LabelEncoder()\n",
585
+ " y_discharge_encoded = le_discharge.fit_transform(df_pd[TARGET_DISCHARGE])\n",
586
+ " num_classes_discharge = len(le_discharge.classes_)\n",
587
+ " print(f\"Discharge Location: {num_classes_discharge} classes found: {le_discharge.classes_}\")\n",
588
+ " \n",
589
+ " # Split data: 60% base train, 20% calibration, 20% test\n",
590
+ " stratify_discharge = y_discharge_encoded if num_classes_discharge > 1 else None\n",
591
+ " X_train_full_disc, X_test_disc, y_train_full_disc_enc, y_test_disc_enc = train_test_split(X, y_discharge_encoded, test_size=0.2, random_state=42, stratify=stratify_discharge)\n",
592
+ " X_train_base_disc, X_calib_disc, y_train_base_disc_enc, y_calib_disc_enc = train_test_split(X_train_full_disc, y_train_full_disc_enc, test_size=0.25, random_state=42, stratify=y_train_full_disc_enc if num_classes_discharge > 1 else None)\n",
593
+ " print(f\"Data split for discharge: Base train: {X_train_base_disc.shape[0]}, Calibration: {X_calib_disc.shape[0]}, Test: {X_test_disc.shape[0]}\")\n",
594
+ " \n",
595
+ " base_discharge_model = LogisticRegression(random_state=42, max_iter=1000, solver='lbfgs', multi_class='multinomial', class_weight='balanced')\n",
596
+ " \n",
597
+ " # Determine the feature set to use.\n",
598
+ " if FAST_MODE:\n",
599
+ " print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n",
600
+ " best_discharge_features = X_train_base_disc.columns.tolist()\n",
601
+ " else:\n",
602
+ " best_discharge_features = find_best_feature_subset(\n",
603
+ " model=base_discharge_model, X_train=X_train_base_disc, y_train=y_train_base_disc_enc, X_val=X_calib_disc, y_val=y_calib_disc_enc,\n",
604
+ " scoring_func=log_loss, higher_is_better=False, model_name=\"Discharge Location (Multinomial Regression)\"\n",
605
+ " )\n",
606
+ "\n",
607
+ " print(f\"\\nTraining final Discharge Location model pipeline using {len(best_discharge_features)} features...\")\n",
608
+ " base_model_for_calib_disc = clone(base_discharge_model)\n",
609
+ " base_model_for_calib_disc.fit(X_train_base_disc[best_discharge_features], y_train_base_disc_enc)\n",
610
+ " \n",
611
+ " discharge_model_name = f\"Inpatient_Discharge_Cal_Overall_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
612
+ " log_feature_importances_to_snowflake(session, base_model_for_calib_disc, best_discharge_features, MODEL_RUN_ID, discharge_model_name, TARGET_DISCHARGE, FEATURE_IMPORTANCE_TABLE_NAME)\n",
613
+ " \n",
614
+ " # Calibrate the model. 'sigmoid' is used for one-vs-rest calibration, suitable for multiclass.\n",
615
+ " calibrated_discharge_model = CalibratedClassifierCV(base_model_for_calib_disc, method='sigmoid', cv='prefit')\n",
616
+ " calibrated_discharge_model.fit(X_calib_disc[best_discharge_features], y_calib_disc_enc)\n",
617
+ " y_pred_proba_discharge_calibrated = calibrated_discharge_model.predict_proba(X_test_disc[best_discharge_features])\n",
618
+ " y_pred_labels_discharge_calibrated = calibrated_discharge_model.predict(X_test_disc[best_discharge_features])\n",
619
+ " \n",
620
+ " print(\"\\nCalibrated Discharge Model - Test Set Evaluation:\")\n",
621
+ " calibrated_disc_metrics = calculate_multiclass_classification_metrics(y_test_disc_enc, y_pred_labels_discharge_calibrated, y_pred_proba_discharge_calibrated, le_discharge.classes_)\n",
622
+ " \n",
623
+ " # Log the overall multiclass metrics.\n",
624
+ " overall_cal_metrics_to_log = {k: v for k, v in calibrated_disc_metrics.items() if k != 'per_class_details'}\n",
625
+ " overall_cal_metrics_to_log['BRIER_SCORE'] = calibrated_disc_metrics.get('BRIER_SCORE_MACRO_AVG')\n",
626
+ " log_model_metrics_to_snowflake(session, MODEL_RUN_ID, discharge_model_name, TARGET_DISCHARGE, overall_cal_metrics_to_log, \"Multiclass_Cal_Overall\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n",
627
+ " \n",
628
+ " # --- FIX: Log the per-class metrics by mapping keys correctly ---\n",
629
+ " discharge_class_model_name = f\"Inpatient_Discharge_Cal_Class_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
630
+ " for class_detail in calibrated_disc_metrics.get('per_class_details', []):\n",
631
+ " # Create a new dict with keys the logging function expects.\n",
632
+ " per_class_metrics_to_log = {\n",
633
+ " 'BRIER_SCORE': class_detail.get('brier_score'),\n",
634
+ " 'AVG_Y_PRED': class_detail.get('avg_pred_proba'),\n",
635
+ " 'AVG_Y_TRUE': class_detail.get('true_proportion'),\n",
636
+ " 'PRED_RATIO': class_detail.get('proba_ratio'),\n",
637
+ " }\n",
638
+ " log_model_metrics_to_snowflake(\n",
639
+ " session, MODEL_RUN_ID, discharge_class_model_name,\n",
640
+ " f\"{TARGET_DISCHARGE}_Class_{class_detail['class_name']}\",\n",
641
+ " per_class_metrics_to_log, # Use the correctly mapped dictionary\n",
642
+ " \"Multiclass_Cal_ClassDetail\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG\n",
643
+ " )\n",
644
+ "\n",
645
+ " print(\"\\nCalibrated Classification Report:\\n\", classification_report(y_test_disc_enc, y_pred_labels_discharge_calibrated, target_names=le_discharge.classes_.astype(str), zero_division=0, digits=4))\n",
646
+ "\n",
647
+ "\n",
648
+ "# =============================================================================\n",
649
+ "# 5. MODEL SAVING\n",
650
+ "# =============================================================================\n",
651
+ "print(\"\\n\" + \"=\"*80)\n",
652
+ "print(\"--- Saving Models and Artifacts ---\")\n",
653
+ "print(\"=\"*80)\n",
654
+ "\n",
655
+ "# Bundle all necessary objects for deployment into a single dictionary.\n",
656
+ "inpatient_models_bundle = {\n",
657
+ " 'los_model': los_model,\n",
658
+ " 'readmission_model': calibrated_readmission_model,\n",
659
+ " 'discharge_model': calibrated_discharge_model,\n",
660
+ " 'feature_columns_los': best_los_features,\n",
661
+ " 'feature_columns_readmission': best_readmission_features,\n",
662
+ " 'feature_columns_discharge': best_discharge_features,\n",
663
+ " 'le_discharge': le_discharge,\n",
664
+ " 'model_run_id': MODEL_RUN_ID,\n",
665
+ " 'fast_mode': FAST_MODE,\n",
666
+ " 'excluded_feature_prefixes': EXCLUDE_FEATURE_PREFIXES\n",
667
+ "}\n",
668
+ "\n",
669
+ "# Create a descriptive file name for the bundle.\n",
670
+ "BUNDLE_SUFFIX = \"fast\" if FAST_MODE else \"fs\"\n",
671
+ "EXCLUSION_FILE_TAG = f\"_excl_{'-'.join([p.strip('_').lower() for p in EXCLUDE_FEATURE_PREFIXES])}\" if EXCLUDE_FEATURE_PREFIXES else \"\"\n",
672
+ "BUNDLE_FILE_NAME = f'inpatient_models_bundle_{MODEL_SOURCE_TAG}_{MODEL_YEAR_TAG}_{BUNDLE_SUFFIX}{EXCLUSION_FILE_TAG}.pkl'\n",
673
+ "\n",
674
+ "# Save the bundle locally using pickle.\n",
675
+ "with open(BUNDLE_FILE_NAME, 'wb') as f:\n",
676
+ " pickle.dump(inpatient_models_bundle, f)\n",
677
+ "print(f\"Models bundled and saved locally to: {BUNDLE_FILE_NAME}\")\n",
678
+ "\n",
679
+ "# Upload the local bundle file to the specified Snowflake stage.\n",
680
+ "put_result = session.file.put(BUNDLE_FILE_NAME, SNOWFLAKE_STAGE_NAME, overwrite=True)\n",
681
+ "if put_result[0].status == 'UPLOADED':\n",
682
+ " print(f\"Model bundle successfully uploaded to Snowflake stage: {SNOWFLAKE_STAGE_NAME}\")\n",
683
+ "else:\n",
684
+ " print(f\"Error uploading model bundle. Status: {put_result[0].status}, Message: {put_result[0].message}\")\n",
685
+ "\n",
686
+ "file_size_mb = os.path.getsize(BUNDLE_FILE_NAME) / (1024 * 1024)\n",
687
+ "print(f\"Saved local bundle file size: {file_size_mb:.2f} MB\")\n",
688
+ "\n",
689
+ "print(f\"\\n✅ Script finished ({'FAST MODE' if FAST_MODE else 'FULL MODE'}).\")"
690
+ ]
691
+ }
692
+ ],
693
  "metadata": {
694
  "kernelspec": {
695
  "display_name": "Streamlit Notebook",
696
  "name": "streamlit"
697
  },
698
  "lastEditStatus": {
699
+ "authorEmail": "[email protected]",
700
  "authorId": "374530764978",
701
  "authorName": "BRAD",
702
+ "lastEditTime": 1750870004305,
703
+ "notebookId": "6rovstl42ft2p5id6gwo",
704
+ "sessionId": "65561efa-4d18-4072-8f4d-10240cb902ba"
705
  }
706
  },
707
  "nbformat": 4,
708
+ "nbformat_minor": 5
709
+ }
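
For orientation, a minimal sketch (not part of this commit) of how the bundle uploaded in section 5 of the script could be retrieved and unpickled for scoring. The stage and file names here are assumptions derived from the script's configuration (MODEL_SOURCE_TAG=medicare_lds, MODEL_YEAR_TAG=2023, FAST_MODE=False, EXCLUDE_FEATURE_PREFIXES=["hcc_"]), and it assumes session.file.put ran with its default auto_compress=True, so the staged copy carries a .gz suffix:

    import gzip
    import pickle

    from snowflake.snowpark.context import get_active_session

    session = get_active_session()

    # Assumed names; the script derives these from its configuration switches.
    STAGE = "@BENCHMARKS.BENCHMARK_STAGE"
    BUNDLE = "inpatient_models_bundle_medicare_lds_2023_fs_excl_hcc.pkl"

    # Download the gzip-compressed bundle from the stage to the working directory.
    session.file.get(f"{STAGE}/{BUNDLE}.gz", ".")

    # Unpickle and pull out one model plus the feature list it was trained on.
    with gzip.open(f"{BUNDLE}.gz", "rb") as f:
        bundle = pickle.load(f)

    los_model = bundle["los_model"]
    los_features = bundle["feature_columns_los"]
    # predictions = los_model.predict(scoring_df[los_features])

Scoring frames must be one-hot encoded the same way as training and indexed by the stored feature list, which is why the bundle keeps the feature columns and the discharge LabelEncoder alongside the models.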
 
inpatient_feature_importance.csv CHANGED
@@ -2091,4 +2091,4 @@ MODEL_RUN_ID,MODEL_NAME,TARGET_NAME,FEATURE_NAME,IMPORTANCE_VALUE,IMPORTANCE_RAN
2091
  03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_85,0.04926689992,745,2025-06-18 21:11:35.095
2092
  03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_107,0.03908581065,746,2025-06-18 21:11:35.095
2093
  03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_108,0.03781863024,747,2025-06-18 21:11:35.095
2094
- 03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,cms_ischemic_heart_disease,0.02849259634,748,2025-06-18 21:11:35.095
 
2091
  03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_85,0.04926689992,745,2025-06-18 21:11:35.095
2092
  03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_107,0.03908581065,746,2025-06-18 21:11:35.095
2093
  03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_108,0.03781863024,747,2025-06-18 21:11:35.095
2094
+ 03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,cms_ischemic_heart_disease,0.02849259634,748,2025-06-18 21:11:35.095
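
The CSV rows above mirror the MODEL_FEATURE_IMPORTANCE_INPATIENT table that log_feature_importances_to_snowflake appends to. As a hedged sketch (table name taken from the script's configuration, run id and model name taken from the rows above, and an active Snowpark session assumed as in the notebook), the logged importances for a single run could be read back like this:

    # Sketch only: assumes an active Snowpark session named `session`.
    IMPORTANCE_TABLE = "medicare_lds_five_multi_year.BENCHMARKS.MODEL_FEATURE_IMPORTANCE_INPATIENT"
    RUN_ID = "03daf6f5-4a7a-44b9-a670-e0520ec6772f"  # run id visible in the CSV above

    top_features = session.sql(f"""
        SELECT FEATURE_NAME, IMPORTANCE_VALUE, IMPORTANCE_RANK
        FROM {IMPORTANCE_TABLE}
        WHERE MODEL_RUN_ID = '{RUN_ID}'
          AND MODEL_NAME = 'Inpatient_Discharge_Cal_Overall_FeatureSelected'
        ORDER BY IMPORTANCE_RANK
        LIMIT 20
    """).to_pandas()
    print(top_features)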