bradmontierth committed on
Commit · 2f706be
1 Parent(s): 8cd7f61
fixing cut off training script
Train Tuva Concurrent Inpatient Models.ipynb
CHANGED
@@ -1,31 +1,709 @@
|
|
1 |
{
|
2 |
"metadata": {
|
3 |
"kernelspec": {
|
4 |
"display_name": "Streamlit Notebook",
|
5 |
"name": "streamlit"
|
6 |
},
|
7 |
"lastEditStatus": {
|
8 |
-
"
|
9 |
"authorId": "374530764978",
|
10 |
"authorName": "BRAD",
|
11 |
-
"
|
12 |
-
"
|
13 |
-
"
|
14 |
}
|
15 |
},
|
16 |
-
"nbformat_minor": 5,
|
17 |
"nbformat": 4,
|
18 |
-
"
|
19 |
-
|
20 |
-
"cell_type": "code",
|
21 |
-
"id": "3775908f-ca36-4846-8f38-5adca39217f2",
|
22 |
-
"metadata": {
|
23 |
-
"language": "python",
|
24 |
-
"name": "cell1"
|
25 |
-
},
|
26 |
-
"source": "0]}\")\n \n base_readmit_model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000, solver='liblinear')\n \n # Determine the feature set to use.\n if FAST_MODE:\n print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n best_readmission_features = X_train_base.columns.tolist()\n else:\n best_readmission_features = find_best_feature_subset(\n model=base_readmit_model, X_train=X_train_base, y_train=y_train_base, X_val=X_calib_read, y_val=y_calib_read,\n scoring_func=roc_auc_score, higher_is_better=True, model_name=\"Readmission (Logistic Regression)\"\n )\n\n print(f\"\\nTraining final Readmission model pipeline using {len(best_readmission_features)} features...\")\n base_model_for_calib = clone(base_readmit_model)\n base_model_for_calib.fit(X_train_base[best_readmission_features], y_train_base)\n \n # Log feature importances from the base (uncalibrated) model.\n uncal_read_model_name = f\"Inpatient_Readmission_Base_Uncalibrated_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n log_feature_importances_to_snowflake(session, base_model_for_calib, best_readmission_features, MODEL_RUN_ID, uncal_read_model_name, TARGET_READMISSION, FEATURE_IMPORTANCE_TABLE_NAME)\n \n # Evaluate and log metrics for the uncalibrated model for comparison.\n y_pred_proba_uncal = base_model_for_calib.predict_proba(X_test_read[best_readmission_features])[:, 1]\n uncalibrated_metrics = calculate_binary_classification_proba_metrics(y_test_read, y_pred_proba_uncal)\n log_model_metrics_to_snowflake(session, MODEL_RUN_ID, uncal_read_model_name, TARGET_READMISSION + \"_Probability\", uncalibrated_metrics, \"Binary_Uncalibrated\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n \n # Calibrate the model on the held-out calibration set.\n calibrated_readmission_model = CalibratedClassifierCV(base_model_for_calib, method='isotonic', cv='prefit')\n calibrated_readmission_model.fit(X_calib_read[best_readmission_features], y_calib_read)\n y_pred_proba_cal = calibrated_readmission_model.predict_proba(X_test_read[best_readmission_features])[:, 1]\n\n print(\"\\nCalibrated Readmission Model - Test Set Evaluation:\")\n calibrated_proba_metrics = calculate_binary_classification_proba_metrics(y_test_read, y_pred_proba_cal)\n for k, v in calibrated_proba_metrics.items(): print(f\" {k}: {v:.4f}\")\n \n cal_read_model_name = f\"Inpatient_Readmission_Calibrated_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n log_model_metrics_to_snowflake(session, MODEL_RUN_ID, cal_read_model_name, TARGET_READMISSION + \"_Probability\", calibrated_proba_metrics, \"Binary_Calibrated\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n\n# --- 4.3 Model 3: Predicting Discharge Location (Multiclass Classification) ---\nprint(\"\\n\" + \"=\"*80)\nprint(\"--- Training Model 3: Calibrated Discharge Location ---\")\nprint(\"=\"*80)\nTARGET_DISCHARGE = 'discharge_location'\ncalibrated_discharge_model, le_discharge, best_discharge_features = None, None, None\n\nif TARGET_DISCHARGE not in df_pd.columns:\n print(f\"Error: Target column '{TARGET_DISCHARGE}' not found. 
Skipping Discharge Location model.\")\nelse:\n le_discharge = LabelEncoder()\n y_discharge_encoded = le_discharge.fit_transform(df_pd[TARGET_DISCHARGE])\n num_classes_discharge = len(le_discharge.classes_)\n print(f\"Discharge Location: {num_classes_discharge} classes found: {le_discharge.classes_}\")\n \n # Split data: 60% base train, 20% calibration, 20% test\n stratify_discharge = y_discharge_encoded if num_classes_discharge > 1 else None\n X_train_full_disc, X_test_disc, y_train_full_disc_enc, y_test_disc_enc = train_test_split(X, y_discharge_encoded, test_size=0.2, random_state=42, stratify=stratify_discharge)\n X_train_base_disc, X_calib_disc, y_train_base_disc_enc, y_calib_disc_enc = train_test_split(X_train_full_disc, y_train_full_disc_enc, test_size=0.25, random_state=42, stratify=y_train_full_disc_enc if num_classes_discharge > 1 else None)\n print(f\"Data split for discharge: Base train: {X_train_base_disc.shape[0]}, Calibration: {X_calib_disc.shape[0]}, Test: {X_test_disc.shape[0]}\")\n \n base_discharge_model = LogisticRegression(random_state=42, max_iter=1000, solver='lbfgs', multi_class='multinomial', class_weight='balanced')\n \n # Determine the feature set to use.\n if FAST_MODE:\n print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n best_discharge_features = X_train_base_disc.columns.tolist()\n else:\n best_discharge_features = find_best_feature_subset(\n model=base_discharge_model, X_train=X_train_base_disc, y_train=y_train_base_disc_enc, X_val=X_calib_disc, y_val=y_calib_disc_enc,\n scoring_func=log_loss, higher_is_better=False, model_name=\"Discharge Location (Multinomial Regression)\"\n )\n\n print(f\"\\nTraining final Discharge Location model pipeline using {len(best_discharge_features)} features...\")\n base_model_for_calib_disc = clone(base_discharge_model)\n base_model_for_calib_disc.fit(X_train_base_disc[best_discharge_features], y_train_base_disc_enc)\n \n discharge_model_name = f\"Inpatient_Discharge_Cal_Overall_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n log_feature_importances_to_snowflake(session, base_model_for_calib_disc, best_discharge_features, MODEL_RUN_ID, discharge_model_name, TARGET_DISCHARGE, FEATURE_IMPORTANCE_TABLE_NAME)\n \n # Calibrate the model. 
'sigmoid' is used for one-vs-rest calibration, suitable for multiclass.\n calibrated_discharge_model = CalibratedClassifierCV(base_model_for_calib_disc, method='sigmoid', cv='prefit')\n calibrated_discharge_model.fit(X_calib_disc[best_discharge_features], y_calib_disc_enc)\n y_pred_proba_discharge_calibrated = calibrated_discharge_model.predict_proba(X_test_disc[best_discharge_features])\n y_pred_labels_discharge_calibrated = calibrated_discharge_model.predict(X_test_disc[best_discharge_features])\n \n print(\"\\nCalibrated Discharge Model - Test Set Evaluation:\")\n calibrated_disc_metrics = calculate_multiclass_classification_metrics(y_test_disc_enc, y_pred_labels_discharge_calibrated, y_pred_proba_discharge_calibrated, le_discharge.classes_)\n \n # Log the overall multiclass metrics.\n overall_cal_metrics_to_log = {k: v for k, v in calibrated_disc_metrics.items() if k != 'per_class_details'}\n overall_cal_metrics_to_log['BRIER_SCORE'] = calibrated_disc_metrics.get('BRIER_SCORE_MACRO_AVG')\n log_model_metrics_to_snowflake(session, MODEL_RUN_ID, discharge_model_name, TARGET_DISCHARGE, overall_cal_metrics_to_log, \"Multiclass_Cal_Overall\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n \n # --- FIX: Log the per-class metrics by mapping keys correctly ---\n discharge_class_model_name = f\"Inpatient_Discharge_Cal_Class_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n for class_detail in calibrated_disc_metrics.get('per_class_details', []):\n # Create a new dict with keys the logging function expects.\n per_class_metrics_to_log = {\n 'BRIER_SCORE': class_detail.get('brier_score'),\n 'AVG_Y_PRED': class_detail.get('avg_pred_proba'),\n 'AVG_Y_TRUE': class_detail.get('true_proportion'),\n 'PRED_RATIO': class_detail.get('proba_ratio'),\n }\n log_model_metrics_to_snowflake(\n session, MODEL_RUN_ID, discharge_class_model_name,\n f\"{TARGET_DISCHARGE}_Class_{class_detail['class_name']}\",\n per_class_metrics_to_log, # Use the correctly mapped dictionary\n \"Multiclass_Cal_ClassDetail\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG\n )\n\n print(\"\\nCalibrated Classification Report:\\n\", classification_report(y_test_disc_enc, y_pred_labels_discharge_calibrated, target_names=le_discharge.classes_.astype(str), zero_division=0, digits=4))\n\n\n# =============================================================================\n# 5. 
MODEL SAVING\n# =============================================================================\nprint(\"\\n\" + \"=\"*80)\nprint(\"--- Saving Models and Artifacts ---\")\nprint(\"=\"*80)\n\n# Bundle all necessary objects for deployment into a single dictionary.\ninpatient_models_bundle = {\n 'los_model': los_model,\n 'readmission_model': calibrated_readmission_model,\n 'discharge_model': calibrated_discharge_model,\n 'feature_columns_los': best_los_features,\n 'feature_columns_readmission': best_readmission_features,\n 'feature_columns_discharge': best_discharge_features,\n 'le_discharge': le_discharge,\n 'model_run_id': MODEL_RUN_ID,\n 'fast_mode': FAST_MODE,\n 'excluded_feature_prefixes': EXCLUDE_FEATURE_PREFIXES\n}\n\n# Create a descriptive file name for the bundle.\nBUNDLE_SUFFIX = \"fast\" if FAST_MODE else \"fs\"\nEXCLUSION_FILE_TAG = f\"_excl_{'-'.join([p.strip('_').lower() for p in EXCLUDE_FEATURE_PREFIXES])}\" if EXCLUDE_FEATURE_PREFIXES else \"\"\nBUNDLE_FILE_NAME = f'inpatient_models_bundle_{MODEL_SOURCE_TAG}_{MODEL_YEAR_TAG}_{BUNDLE_SUFFIX}{EXCLUSION_FILE_TAG}.pkl'\n\n# Save the bundle locally using pickle.\nwith open(BUNDLE_FILE_NAME, 'wb') as f:\n pickle.dump(inpatient_models_bundle, f)\nprint(f\"Models bundled and saved locally to: {BUNDLE_FILE_NAME}\")\n\n# Upload the local bundle file to the specified Snowflake stage.\nput_result = session.file.put(BUNDLE_FILE_NAME, SNOWFLAKE_STAGE_NAME, overwrite=True)\nif put_result[0].status == 'UPLOADED':\n print(f\"Model bundle successfully uploaded to Snowflake stage: {SNOWFLAKE_STAGE_NAME}\")\nelse:\n print(f\"Error uploading model bundle. Status: {put_result[0].status}, Message: {put_result[0].message}\")\n\nfile_size_mb = os.path.getsize(BUNDLE_FILE_NAME) / (1024 * 1024)\nprint(f\"Saved local bundle file size: {file_size_mb:.2f} MB\")\n\nprint(f\"\\n✅ Script finished ({'FAST MODE' if FAST_MODE else 'FULL MODE'}).\")",
|
27 |
-
"execution_count": null,
|
28 |
-
"outputs": []
|
29 |
-
}
|
30 |
-
]
|
31 |
-
}
|
|
|
1 |
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"id": "3775908f-ca36-4846-8f38-5adca39217f2",
|
7 |
+
"metadata": {
|
8 |
+
"language": "python",
|
9 |
+
"name": "cell1"
|
10 |
+
},
|
11 |
+
"outputs": [],
|
12 |
+
"source": [
|
13 |
+
"\"\"\"\n",
|
14 |
+
"End-to-End Inpatient Model Training and Evaluation Script\n",
|
15 |
+
"\n",
|
16 |
+
"This script performs the following operations for an inpatient dataset:\n",
|
17 |
+
"1. Loads data from a Snowflake table.\n",
|
18 |
+
"2. Performs data preprocessing, including one-hot encoding of categorical\n",
|
19 |
+
" features and standardization of column names.\n",
|
20 |
+
"3. Allows for the exclusion of specified feature groups (e.g., 'hcc_').\n",
|
21 |
+
"4. Provides a \"FAST_MODE\" to skip computationally intensive feature selection\n",
|
22 |
+
" for rapid testing.\n",
|
23 |
+
"5. Trains, calibrates, and evaluates three distinct models:\n",
|
24 |
+
" a. Length of Stay (Regression with XGBoost).\n",
|
25 |
+
" b. Readmission (Binary Classification with Calibrated Logistic Regression).\n",
|
26 |
+
" c. Discharge Location (Multiclass Classification with Calibrated Logistic\n",
|
27 |
+
" Regression).\n",
|
28 |
+
"6. Logs model performance metrics, feature importances, and feature frequency\n",
|
29 |
+
" statistics to separate Snowflake tables.\n",
|
30 |
+
"7. Saves the trained models, feature lists, and encoders into a single\n",
|
31 |
+
" pickle bundle file, then uploads it to a Snowflake stage.\n",
|
32 |
+
"\"\"\"\n",
|
33 |
+
"\n",
|
34 |
+
"import os\n",
|
35 |
+
"import pickle\n",
|
36 |
+
"import uuid\n",
|
37 |
+
"from datetime import datetime\n",
|
38 |
+
"\n",
|
39 |
+
"import matplotlib.pyplot as plt\n",
|
40 |
+
"import numpy as np\n",
|
41 |
+
"import pandas as pd\n",
|
42 |
+
"import xgboost as xgb\n",
|
43 |
+
"from sklearn.base import clone\n",
|
44 |
+
"from sklearn.calibration import CalibratedClassifierCV\n",
|
45 |
+
"from sklearn.linear_model import LogisticRegression\n",
|
46 |
+
"from sklearn.metrics import (\n",
|
47 |
+
" accuracy_score,\n",
|
48 |
+
" average_precision_score,\n",
|
49 |
+
" brier_score_loss,\n",
|
50 |
+
" classification_report,\n",
|
51 |
+
" log_loss,\n",
|
52 |
+
" mean_absolute_error,\n",
|
53 |
+
" mean_squared_error,\n",
|
54 |
+
" r2_score,\n",
|
55 |
+
" roc_auc_score,\n",
|
56 |
+
")\n",
|
57 |
+
"from sklearn.model_selection import train_test_split\n",
|
58 |
+
"from sklearn.preprocessing import LabelEncoder\n",
|
59 |
+
"from snowflake.snowpark.context import get_active_session\n",
|
60 |
+
"\n",
|
61 |
+
"# =============================================================================\n",
|
62 |
+
"# 0. CONFIGURATION\n",
|
63 |
+
"# =============================================================================\n",
|
64 |
+
"# --- Snowflake Environment Settings ---\n",
|
65 |
+
"SNOWFLAKE_DATABASE = \"medicare_lds_five_multi_year\"\n",
|
66 |
+
"SNOWFLAKE_SCHEMA = \"BENCHMARKS\"\n",
|
67 |
+
"\n",
|
68 |
+
"# --- Input and Output Table/Stage Names ---\n",
|
69 |
+
"INPUT_TABLE = f\"{SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.BENCHMARKS_INPATIENT_INPUT\"\n",
|
70 |
+
"METRICS_TABLE_NAME = f\"{SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.MODEL_EVAL_METRICS_INPATIENT\"\n",
|
71 |
+
"FEATURE_FREQ_TABLE_NAME = f\"{SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.FEATURE_FREQUENCY_STATS_INPATIENT\"\n",
|
72 |
+
"FEATURE_IMPORTANCE_TABLE_NAME = f\"{SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.MODEL_FEATURE_IMPORTANCE_INPATIENT\"\n",
|
73 |
+
"SNOWFLAKE_STAGE_NAME = f\"@{SNOWFLAKE_SCHEMA}.BENCHMARK_STAGE\"\n",
|
74 |
+
"\n",
|
75 |
+
"# --- Model Run Metadata ---\n",
|
76 |
+
"# A unique ID for this entire script run.\n",
|
77 |
+
"MODEL_RUN_ID = str(uuid.uuid4())\n",
|
78 |
+
"# Tags to identify the source and year of the data used for training.\n",
|
79 |
+
"MODEL_SOURCE_TAG = \"medicare_lds\"\n",
|
80 |
+
"MODEL_YEAR_TAG = \"2023\"\n",
|
81 |
+
"\n",
|
82 |
+
"# --- Feature Exclusion Switch ---\n",
|
83 |
+
"# Define a list of feature prefixes to exclude from the model.\n",
|
84 |
+
"# For example, to exclude all HCC features, use [\"hcc_\"]. Set to [] for no exclusions.\n",
|
85 |
+
"EXCLUDE_FEATURE_PREFIXES = [\"hcc_\"]\n",
|
86 |
+
"\n",
|
87 |
+
"# --- Development Mode Switch ---\n",
|
88 |
+
"# If True, the script skips the computationally expensive feature selection step.\n",
|
89 |
+
"# This is useful for quick runs to test the script's functionality.\n",
|
90 |
+
"# Set to False for a full production run to find the optimal feature set.\n",
|
91 |
+
"FAST_MODE = False\n",
|
92 |
+
"\n",
|
93 |
+
"# =============================================================================\n",
|
94 |
+
"# 1. SETUP: SNOWFLAKE SESSION & SCRIPT INITIALIZATION\n",
|
95 |
+
"# =============================================================================\n",
|
96 |
+
"session = get_active_session()\n",
|
97 |
+
"print(f\"Active session created. Model Run ID: {MODEL_RUN_ID}\")\n",
|
98 |
+
"\n",
|
99 |
+
"if FAST_MODE:\n",
|
100 |
+
" print(\"\\n\" + \"=\" * 50)\n",
|
101 |
+
" print(\"🚀 FAST MODE IS ENABLED 🚀\")\n",
|
102 |
+
" print(\"Feature selection will be skipped for all models.\")\n",
|
103 |
+
" print(\"=\" * 50 + \"\\n\")\n",
|
104 |
+
"\n",
|
105 |
+
"\n",
|
106 |
+
"# =============================================================================\n",
|
107 |
+
"# 2. DATA LOADING & PREPARATION\n",
|
108 |
+
"# =============================================================================\n",
|
109 |
+
"first_month = f\"{MODEL_YEAR_TAG}01\"\n",
|
110 |
+
"\n",
|
111 |
+
"print(\"Loading and preparing data from Snowflake...\")\n",
|
112 |
+
"query = f\"\"\"\n",
|
113 |
+
"SELECT *\n",
|
114 |
+
"FROM {INPUT_TABLE}\n",
|
115 |
+
"WHERE YEAR_NBR = {MODEL_YEAR_TAG}\n",
|
116 |
+
"AND FIRST_MONTH = {first_month}\n",
|
117 |
+
"\"\"\"\n",
|
118 |
+
"df_pd = session.sql(query).to_pandas()\n",
|
119 |
+
"\n",
|
120 |
+
"# Standardize all column names to lowercase for consistency.\n",
|
121 |
+
"df_pd.columns = df_pd.columns.str.lower()\n",
|
122 |
+
"print(\"Standardized DataFrame column names to lowercase.\")\n",
|
123 |
+
"\n",
|
124 |
+
"# One-hot encode specified categorical variables.\n",
|
125 |
+
"categorical_cols = ['state', 'race', 'sex', 'ms_drg_code', 'ccsr_cat']\n",
|
126 |
+
"df_pd_encoded = pd.get_dummies(df_pd, columns=[col for col in categorical_cols if col in df_pd.columns])\n",
|
127 |
+
"\n",
|
128 |
+
"# Define all potential feature groups.\n",
|
129 |
+
"condition_columns = [col for col in df_pd_encoded.columns if col.startswith(('cond_', 'cms_', 'hcc_'))]\n",
|
130 |
+
"other_columns = ['age_at_admit']\n",
|
131 |
+
"dummy_prefixes = tuple(f'{col}_' for col in categorical_cols)\n",
|
132 |
+
"dummy_columns = [col for col in df_pd_encoded.columns if col.startswith(dummy_prefixes)]\n",
|
133 |
+
"\n",
|
134 |
+
"# Ensure 'age_at_admit' exists before including it.\n",
|
135 |
+
"if 'age_at_admit' not in df_pd_encoded.columns and 'age_at_admit' in other_columns:\n",
|
136 |
+
" print(\"Warning: 'age_at_admit' not found in features. Removing it.\")\n",
|
137 |
+
" other_columns.remove('age_at_admit')\n",
|
138 |
+
"\n",
|
139 |
+
"# Combine all potential features into a single master list.\n",
|
140 |
+
"all_possible_features = other_columns + condition_columns + dummy_columns\n",
|
141 |
+
"\n",
|
142 |
+
"# Filter out features based on the exclusion configuration.\n",
|
143 |
+
"print(f\"Excluding feature prefixes: {EXCLUDE_FEATURE_PREFIXES}\")\n",
|
144 |
+
"if EXCLUDE_FEATURE_PREFIXES:\n",
|
145 |
+
" initial_feature_count = len(all_possible_features)\n",
|
146 |
+
" # A feature is kept if it does NOT start with any of the excluded prefixes.\n",
|
147 |
+
" features_to_keep = [\n",
|
148 |
+
" f for f in all_possible_features\n",
|
149 |
+
" if not any(f.startswith(prefix) for prefix in EXCLUDE_FEATURE_PREFIXES)\n",
|
150 |
+
" ]\n",
|
151 |
+
" print(f\"Filtered features: Kept {len(features_to_keep)} out of {initial_feature_count} potential features.\")\n",
|
152 |
+
"else:\n",
|
153 |
+
" features_to_keep = all_possible_features\n",
|
154 |
+
" print(\"No feature prefixes specified for exclusion. Using all defined features.\")\n",
|
155 |
+
"\n",
|
156 |
+
"# The final list of features to be used for training.\n",
|
157 |
+
"X_columns = [col for col in features_to_keep if col in df_pd_encoded.columns]\n",
|
158 |
+
"X = df_pd_encoded[X_columns]\n",
|
159 |
+
"\n",
|
160 |
+
"print(f\"Data loaded. Shape of final feature matrix X: {X.shape}\")\n",
|
161 |
+
"print(f\"Number of features after exclusion: {len(X_columns)}\")\n",
|
162 |
+
"\n",
|
163 |
+
"\n",
|
164 |
+
"def create_feature_frequency_table_if_not_exists(session, table_name):\n",
|
165 |
+
" \"\"\"Ensures the feature frequency statistics table exists in Snowflake.\"\"\"\n",
|
166 |
+
" session.sql(f\"\"\"\n",
|
167 |
+
" CREATE TABLE IF NOT EXISTS {table_name} (\n",
|
168 |
+
" MODEL_RUN_ID STRING,\n",
|
169 |
+
" FEATURE_NAME STRING,\n",
|
170 |
+
" POSITIVE_COUNT NUMBER,\n",
|
171 |
+
" TOTAL_ROWS NUMBER,\n",
|
172 |
+
" POSITIVE_RATE_PERCENT FLOAT,\n",
|
173 |
+
" EVAL_TS TIMESTAMP_NTZ\n",
|
174 |
+
" );\n",
|
175 |
+
" \"\"\").collect()\n",
|
176 |
+
" print(f\"Ensured feature frequency table {table_name} exists.\")\n",
|
177 |
+
"\n",
|
178 |
+
"# --- Analyze and log feature sparsity ---\n",
|
179 |
+
"create_feature_frequency_table_if_not_exists(session, FEATURE_FREQ_TABLE_NAME)\n",
|
180 |
+
"print(\"\\n--- Analysis: Positive Feature Rates (Sparsity Check on Training Data) ---\")\n",
|
181 |
+
"total_rows = len(X)\n",
|
182 |
+
"positive_counts = (X > 0).sum()\n",
|
183 |
+
"positive_rates = (positive_counts / total_rows) * 100\n",
|
184 |
+
"positive_rate_summary = pd.DataFrame({\n",
|
185 |
+
" 'feature': X.columns,\n",
|
186 |
+
" 'positive_count': positive_counts,\n",
|
187 |
+
" 'total_rows': total_rows,\n",
|
188 |
+
" 'positive_rate_percent': positive_rates\n",
|
189 |
+
"}).sort_values(by='positive_rate_percent', ascending=False).reset_index(drop=True)\n",
|
190 |
+
"\n",
|
191 |
+
"print(\"Positive (non-zero) rates for all features in the final training input (X), sorted descending:\")\n",
|
192 |
+
"with pd.option_context('display.max_rows', 20, 'display.max_columns', None, 'display.width', 120):\n",
|
193 |
+
" print(positive_rate_summary)\n",
|
194 |
+
"\n",
|
195 |
+
"print(f\"\\nSaving feature frequency statistics to {FEATURE_FREQ_TABLE_NAME}...\")\n",
|
196 |
+
"df_to_save = positive_rate_summary.copy()\n",
|
197 |
+
"df_to_save['MODEL_RUN_ID'] = MODEL_RUN_ID\n",
|
198 |
+
"df_to_save['EVAL_TS'] = datetime.utcnow()\n",
|
199 |
+
"df_to_save.rename(columns={\n",
|
200 |
+
" 'feature': 'FEATURE_NAME', 'positive_count': 'POSITIVE_COUNT',\n",
|
201 |
+
" 'total_rows': 'TOTAL_ROWS', 'positive_rate_percent': 'POSITIVE_RATE_PERCENT'\n",
|
202 |
+
"}, inplace=True)\n",
|
203 |
+
"final_column_order = ['MODEL_RUN_ID', 'FEATURE_NAME', 'POSITIVE_COUNT', 'TOTAL_ROWS', 'POSITIVE_RATE_PERCENT', 'EVAL_TS']\n",
|
204 |
+
"df_to_save = df_to_save[final_column_order]\n",
|
205 |
+
"session.create_dataframe(df_to_save).write.mode(\"append\").save_as_table(FEATURE_FREQ_TABLE_NAME)\n",
|
206 |
+
"print(\"Successfully saved feature frequency statistics to Snowflake.\")\n",
|
207 |
+
"\n",
|
208 |
+
"# =============================================================================\n",
|
209 |
+
"# 3. UTILITY FUNCTIONS: METRICS, LOGGING, AND FEATURE SELECTION\n",
|
210 |
+
"# =============================================================================\n",
|
211 |
+
"\n",
|
212 |
+
"def calculate_regression_metrics(y_true, y_pred):\n",
|
213 |
+
" \"\"\"Calculates a set of standard regression metrics.\"\"\"\n",
|
214 |
+
" y_true_np, y_pred_np = np.array(y_true), np.array(y_pred)\n",
|
215 |
+
" sum_y_true, mean_y_true = np.sum(y_true_np), np.mean(y_true_np)\n",
|
216 |
+
" pred_ratio = np.sum(y_pred_np) / sum_y_true if sum_y_true != 0 else np.nan\n",
|
217 |
+
" mae_percent = (mean_absolute_error(y_true_np, y_pred_np) / mean_y_true) * 100 if mean_y_true != 0 else np.nan\n",
|
218 |
+
" return {\n",
|
219 |
+
" 'R2': r2_score(y_true_np, y_pred_np), 'MAE': mean_absolute_error(y_true_np, y_pred_np),\n",
|
220 |
+
" 'MSE': mean_squared_error(y_true_np, y_pred_np), 'PRED_RATIO': pred_ratio, 'MAE_PERCENT': mae_percent,\n",
|
221 |
+
" 'AVG_Y_PRED': np.mean(y_pred_np), 'AVG_Y_TRUE': mean_y_true\n",
|
222 |
+
" }\n",
|
223 |
+
"\n",
|
224 |
+
"def calculate_binary_classification_proba_metrics(y_true, y_pred_proba):\n",
|
225 |
+
" \"\"\"Calculates a set of standard binary classification metrics from probabilities.\"\"\"\n",
|
226 |
+
" y_true_np, y_pred_proba_np = np.array(y_true), np.array(y_pred_proba)\n",
|
227 |
+
" is_multiclass = len(np.unique(y_true_np)) > 1\n",
|
228 |
+
" auc_roc = roc_auc_score(y_true_np, y_pred_proba_np) if is_multiclass else np.nan\n",
|
229 |
+
" auc_pr = average_precision_score(y_true_np, y_pred_proba_np) if is_multiclass else np.nan\n",
|
230 |
+
" return {\n",
|
231 |
+
" 'AUC_ROC': auc_roc, 'AUC_PR': auc_pr, 'LOG_LOSS': log_loss(y_true_np, y_pred_proba_np),\n",
|
232 |
+
" 'BRIER_SCORE': brier_score_loss(y_true_np, y_pred_proba_np),\n",
|
233 |
+
" 'AVG_Y_PRED_PROBA': np.mean(y_pred_proba_np), 'AVG_Y_TRUE': np.mean(y_true_np)\n",
|
234 |
+
" }\n",
|
235 |
+
"\n",
|
236 |
+
"def calculate_multiclass_classification_metrics(y_true_encoded, y_pred_labels, y_pred_proba, le_classes):\n",
|
237 |
+
" \"\"\"Calculates overall and per-class metrics for multiclass classification.\"\"\"\n",
|
238 |
+
" num_samples, num_classes = len(y_true_encoded), len(le_classes)\n",
|
239 |
+
" metrics = {\n",
|
240 |
+
" 'ACCURACY': accuracy_score(y_true_encoded, y_pred_labels),\n",
|
241 |
+
" 'LOG_LOSS': log_loss(y_true_encoded, y_pred_proba, labels=np.arange(num_classes))\n",
|
242 |
+
" }\n",
|
243 |
+
" per_class_details, all_brier_scores = [], []\n",
|
244 |
+
" if num_samples > 0 and num_classes > 0:\n",
|
245 |
+
" for i in range(num_classes):\n",
|
246 |
+
" class_name = le_classes[i]\n",
|
247 |
+
" true_class_binary = (y_true_encoded == i).astype(int)\n",
|
248 |
+
" pred_proba_for_class = y_pred_proba[:, i]\n",
|
249 |
+
" avg_pred_proba_class = np.mean(pred_proba_for_class)\n",
|
250 |
+
" true_proportion_class = np.mean(true_class_binary)\n",
|
251 |
+
" proba_ratio_class = avg_pred_proba_class / true_proportion_class if true_proportion_class > 0 else np.nan\n",
|
252 |
+
" brier_score_class = brier_score_loss(true_class_binary, pred_proba_for_class) if len(np.unique(true_class_binary)) > 1 else np.nan\n",
|
253 |
+
" all_brier_scores.append(brier_score_class)\n",
|
254 |
+
" per_class_details.append({\n",
|
255 |
+
" \"class_name\": class_name,\n",
|
256 |
+
" \"avg_pred_proba\": avg_pred_proba_class,\n",
|
257 |
+
" \"true_proportion\": true_proportion_class,\n",
|
258 |
+
" \"proba_ratio\": proba_ratio_class,\n",
|
259 |
+
" \"brier_score\": brier_score_class\n",
|
260 |
+
" })\n",
|
261 |
+
" metrics['per_class_details'] = per_class_details\n",
|
262 |
+
" valid_brier_scores = [s for s in all_brier_scores if not np.isnan(s)]\n",
|
263 |
+
" metrics['BRIER_SCORE_MACRO_AVG'] = np.mean(valid_brier_scores) if valid_brier_scores else np.nan\n",
|
264 |
+
" return metrics\n",
|
265 |
+
"\n",
|
266 |
+
"def create_metrics_table_if_not_exists(session, table_name):\n",
|
267 |
+
" \"\"\"Ensures the main model metrics table exists in Snowflake.\"\"\"\n",
|
268 |
+
" session.sql(f\"\"\"\n",
|
269 |
+
" CREATE TABLE IF NOT EXISTS {table_name} (\n",
|
270 |
+
" MODEL_RUN_ID STRING, MODEL_NAME STRING, TARGET_NAME STRING, R2 FLOAT, MAE FLOAT, MSE FLOAT,\n",
|
271 |
+
" PRED_RATIO FLOAT, MAE_PERCENT FLOAT, AUC_ROC FLOAT, AUC_PR FLOAT, LOG_LOSS FLOAT,\n",
|
272 |
+
" BRIER_SCORE FLOAT, ACCURACY FLOAT, AVG_Y_PRED FLOAT, AVG_Y_TRUE FLOAT, MODEL_SOURCE STRING,\n",
|
273 |
+
" MODEL_TYPE STRING, MODEL_YEAR STRING, EVAL_TS TIMESTAMP_NTZ\n",
|
274 |
+
" );\n",
|
275 |
+
" \"\"\").collect()\n",
|
276 |
+
" print(f\"Ensured metrics table {table_name} exists.\")\n",
|
277 |
+
"\n",
|
278 |
+
"def create_feature_importance_table_if_not_exists(session, table_name):\n",
|
279 |
+
" \"\"\"Ensures the feature importance table exists in Snowflake.\"\"\"\n",
|
280 |
+
" session.sql(f\"\"\"\n",
|
281 |
+
" CREATE TABLE IF NOT EXISTS {table_name} (\n",
|
282 |
+
" MODEL_RUN_ID STRING,\n",
|
283 |
+
" MODEL_NAME STRING,\n",
|
284 |
+
" TARGET_NAME STRING,\n",
|
285 |
+
" FEATURE_NAME STRING,\n",
|
286 |
+
" IMPORTANCE_VALUE FLOAT,\n",
|
287 |
+
" IMPORTANCE_RANK NUMBER,\n",
|
288 |
+
" EVAL_TS TIMESTAMP_NTZ\n",
|
289 |
+
" );\n",
|
290 |
+
" \"\"\").collect()\n",
|
291 |
+
" print(f\"Ensured feature importance table {table_name} exists.\")\n",
|
292 |
+
"\n",
|
293 |
+
"def log_model_metrics_to_snowflake(session, model_run_id, model_name, target_name, metrics_dict, model_type, metrics_table, model_source_tag, model_year_tag):\n",
|
294 |
+
" \"\"\"Constructs a payload and logs model metrics to a Snowflake table.\"\"\"\n",
|
295 |
+
" avg_y_pred = metrics_dict.get('AVG_Y_PRED', metrics_dict.get('AVG_Y_PRED_PROBA'))\n",
|
296 |
+
" full_metrics_payload = {\n",
|
297 |
+
" \"MODEL_RUN_ID\": model_run_id, \"MODEL_NAME\": model_name, \"TARGET_NAME\": target_name,\n",
|
298 |
+
" \"R2\": metrics_dict.get('R2'), \"MAE\": metrics_dict.get('MAE'), \"MSE\": metrics_dict.get('MSE'),\n",
|
299 |
+
" \"PRED_RATIO\": metrics_dict.get('PRED_RATIO'), \"MAE_PERCENT\": metrics_dict.get('MAE_PERCENT'),\n",
|
300 |
+
" \"AUC_ROC\": metrics_dict.get('AUC_ROC'), \"AUC_PR\": metrics_dict.get('AUC_PR'),\n",
|
301 |
+
" \"LOG_LOSS\": metrics_dict.get('LOG_LOSS'), \"BRIER_SCORE\": metrics_dict.get('BRIER_SCORE'),\n",
|
302 |
+
" \"ACCURACY\": metrics_dict.get('ACCURACY'), \"AVG_Y_PRED\": avg_y_pred,\n",
|
303 |
+
" \"AVG_Y_TRUE\": metrics_dict.get('AVG_Y_TRUE'), \"MODEL_SOURCE\": model_source_tag,\n",
|
304 |
+
" \"MODEL_TYPE\": model_type, \"MODEL_YEAR\": model_year_tag, \"EVAL_TS\": datetime.utcnow()\n",
|
305 |
+
" }\n",
|
306 |
+
" # Round floats and handle NaNs for database compatibility\n",
|
307 |
+
" for key, value in full_metrics_payload.items():\n",
|
308 |
+
" if isinstance(value, (float, np.floating)):\n",
|
309 |
+
" full_metrics_payload[key] = round(value, 6) if not np.isnan(value) else None\n",
|
310 |
+
" \n",
|
311 |
+
" dfm = pd.DataFrame([full_metrics_payload])\n",
|
312 |
+
" ordered_cols = [\n",
|
313 |
+
" \"MODEL_RUN_ID\", \"MODEL_NAME\", \"TARGET_NAME\", \"R2\", \"MAE\", \"MSE\",\n",
|
314 |
+
" \"PRED_RATIO\", \"MAE_PERCENT\", \"AUC_ROC\", \"AUC_PR\", \"LOG_LOSS\",\n",
|
315 |
+
" \"BRIER_SCORE\", \"ACCURACY\", \"AVG_Y_PRED\", \"AVG_Y_TRUE\", \"MODEL_SOURCE\",\n",
|
316 |
+
" \"MODEL_TYPE\", \"MODEL_YEAR\", \"EVAL_TS\"\n",
|
317 |
+
" ]\n",
|
318 |
+
" dfm = dfm[ordered_cols]\n",
|
319 |
+
" session.create_dataframe(dfm).write.mode(\"append\").save_as_table(metrics_table)\n",
|
320 |
+
" print(f\"Logged metrics for {model_name} - {target_name} to {metrics_table}.\")\n",
|
321 |
+
"\n",
|
322 |
+
"def log_feature_importances_to_snowflake(session, model, feature_names, model_run_id, model_name, target_name, table_name):\n",
|
323 |
+
" \"\"\"Extracts, ranks, and logs feature importances to a Snowflake table.\"\"\"\n",
|
324 |
+
" if hasattr(model, 'feature_importances_'):\n",
|
325 |
+
" importances = model.feature_importances_\n",
|
326 |
+
" elif hasattr(model, 'coef_'):\n",
|
327 |
+
" # For multi-class logistic regression, average the absolute coefficients across classes\n",
|
328 |
+
" importances = np.mean(np.abs(model.coef_), axis=0) if model.coef_.ndim > 1 else np.abs(model.coef_[0])\n",
|
329 |
+
" else:\n",
|
330 |
+
" print(f\"Warning: Model type for '{model_name}' does not have 'feature_importances_' or 'coef_'. Skipping importance logging.\")\n",
|
331 |
+
" return\n",
|
332 |
+
"\n",
|
333 |
+
" importance_df = pd.DataFrame({'FEATURE_NAME': feature_names, 'IMPORTANCE_VALUE': importances})\n",
|
334 |
+
" importance_df = importance_df.sort_values(by='IMPORTANCE_VALUE', ascending=False).reset_index(drop=True)\n",
|
335 |
+
" importance_df['IMPORTANCE_RANK'] = importance_df.index + 1\n",
|
336 |
+
" importance_df['MODEL_RUN_ID'] = model_run_id\n",
|
337 |
+
" importance_df['MODEL_NAME'] = model_name\n",
|
338 |
+
" importance_df['TARGET_NAME'] = target_name\n",
|
339 |
+
" importance_df['EVAL_TS'] = datetime.utcnow()\n",
|
340 |
+
"\n",
|
341 |
+
" final_cols = ['MODEL_RUN_ID', 'MODEL_NAME', 'TARGET_NAME', 'FEATURE_NAME', 'IMPORTANCE_VALUE', 'IMPORTANCE_RANK', 'EVAL_TS']\n",
|
342 |
+
" importance_df = importance_df[final_cols]\n",
|
343 |
+
" \n",
|
344 |
+
" session.create_dataframe(importance_df).write.mode(\"append\").save_as_table(table_name)\n",
|
345 |
+
" print(f\"Logged {len(importance_df)} feature importances for {model_name} - {target_name} to {table_name}.\")\n",
|
346 |
+
"\n",
|
347 |
+
"def find_best_feature_subset(model, X_train, y_train, X_val, y_val, scoring_func, higher_is_better, model_name, feature_counts_to_test=None):\n",
|
348 |
+
" \"\"\"\n",
|
349 |
+
" Performs recursive feature elimination to find the most parsimonious feature set.\n",
|
350 |
+
"\n",
|
351 |
+
" This function first ranks all features by importance, then iteratively tests smaller\n",
|
352 |
+
" subsets of the top features. It selects the smallest feature set that performs\n",
|
353 |
+
" within a small tolerance of the absolute best-performing set.\n",
|
354 |
+
" \"\"\"\n",
|
355 |
+
" print(f\"\\n--- [{model_name}] Starting feature selection process ---\")\n",
|
356 |
+
" \n",
|
357 |
+
" # Step 1: Rank all features by importance using the full training set.\n",
|
358 |
+
" print(\"Step 1: Ranking all features by importance...\")\n",
|
359 |
+
" ranker_model = clone(model)\n",
|
360 |
+
" ranker_model.fit(X_train, y_train)\n",
|
361 |
+
"\n",
|
362 |
+
" if hasattr(ranker_model, 'feature_importances_'):\n",
|
363 |
+
" importances = ranker_model.feature_importances_\n",
|
364 |
+
" elif hasattr(ranker_model, 'coef_'):\n",
|
365 |
+
" importances = np.mean(np.abs(ranker_model.coef_), axis=0) if ranker_model.coef_.ndim > 1 else np.abs(ranker_model.coef_[0])\n",
|
366 |
+
" else:\n",
|
367 |
+
" raise TypeError(\"Model type not supported for feature importance extraction.\")\n",
|
368 |
+
"\n",
|
369 |
+
" feature_importance_df = pd.DataFrame({'feature': X_train.columns, 'importance': importances}).sort_values('importance', ascending=False)\n",
|
370 |
+
" ranked_features = feature_importance_df['feature'].tolist()\n",
|
371 |
+
" \n",
|
372 |
+
" # Step 2: Evaluate model performance on different feature subset sizes.\n",
|
373 |
+
" print(\"Step 2: Evaluating model performance on different feature subset sizes...\")\n",
|
374 |
+
" if feature_counts_to_test is None:\n",
|
375 |
+
" n_total_features = len(ranked_features)\n",
|
376 |
+
" # Define a dynamic set of feature counts to test.\n",
|
377 |
+
" fractions = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.15, 0.1, 0.05]\n",
|
378 |
+
" fractional_counts = [int(n_total_features * f) for f in fractions]\n",
|
379 |
+
" absolute_counts = [250, 200, 150, 100, 75, 50, 25, 10]\n",
|
380 |
+
" counts = set([n_total_features] + fractional_counts + absolute_counts)\n",
|
381 |
+
" feature_counts_to_test = sorted([n for n in counts if 0 < n <= n_total_features], reverse=True)\n",
|
382 |
+
" print(f\"Generated feature counts to test: {feature_counts_to_test}\")\n",
|
383 |
+
" \n",
|
384 |
+
" val_scores = []\n",
|
385 |
+
" for n_features in feature_counts_to_test:\n",
|
386 |
+
" subset_features = ranked_features[:n_features]\n",
|
387 |
+
" loop_model = clone(model)\n",
|
388 |
+
" loop_model.fit(X_train[subset_features], y_train)\n",
|
389 |
+
" \n",
|
390 |
+
" # Predict based on model type (regressor vs. classifier).\n",
|
391 |
+
" if hasattr(loop_model, 'predict_proba'):\n",
|
392 |
+
" val_preds = loop_model.predict_proba(X_val[subset_features])\n",
|
393 |
+
" # For binary classification, use the probability of the positive class.\n",
|
394 |
+
" if len(np.unique(y_train)) == 2 and val_preds.shape[1] == 2:\n",
|
395 |
+
" val_preds = val_preds[:, 1]\n",
|
396 |
+
" else:\n",
|
397 |
+
" val_preds = loop_model.predict(X_val[subset_features])\n",
|
398 |
+
"\n",
|
399 |
+
" val_score = scoring_func(y_val, val_preds)\n",
|
400 |
+
" val_scores.append(val_score)\n",
|
401 |
+
" print(f\" - Tested with {n_features:4} features: Validation Score = {val_score:.4f}\")\n",
|
402 |
+
"\n",
|
403 |
+
" # Step 3: Select the most parsimonious model within a tolerance of the best score.\n",
|
404 |
+
" TOLERANCE = 0.01 # 1% tolerance\n",
|
405 |
+
" if higher_is_better:\n",
|
406 |
+
" best_val_score = np.max(val_scores)\n",
|
407 |
+
" score_threshold = best_val_score * (1 - TOLERANCE)\n",
|
408 |
+
" candidate_indices = np.where(val_scores >= score_threshold)[0]\n",
|
409 |
+
" else: # lower is better (e.g., for MSE or LogLoss)\n",
|
410 |
+
" best_val_score = np.min(val_scores)\n",
|
411 |
+
" score_threshold = best_val_score * (1 + TOLERANCE)\n",
|
412 |
+
" candidate_indices = np.where(val_scores <= score_threshold)[0]\n",
|
413 |
+
" \n",
|
414 |
+
" # The last index in the candidates corresponds to the smallest model (most parsimonious).\n",
|
415 |
+
" best_parsimonious_idx = candidate_indices[-1]\n",
|
416 |
+
" best_score_idx = np.argmax(val_scores) if higher_is_better else np.argmin(val_scores)\n",
|
417 |
+
" optimal_n_features = feature_counts_to_test[best_parsimonious_idx]\n",
|
418 |
+
" best_features = ranked_features[:optimal_n_features]\n",
|
419 |
+
" \n",
|
420 |
+
" print(f\"\\nStep 3: Found optimal feature set using the parsimony principle.\")\n",
|
421 |
+
" print(f\" - Absolute best validation score: {val_scores[best_score_idx]:.4f} with {feature_counts_to_test[best_score_idx]} features.\")\n",
|
422 |
+
" print(f\" - Score threshold (with {TOLERANCE*100}% tolerance): {score_threshold:.4f}\")\n",
|
423 |
+
" print(f\" - Chosen parsimonious model: {val_scores[best_parsimonious_idx]:.4f} with {optimal_n_features} features.\")\n",
|
424 |
+
" \n",
|
425 |
+
" # Plot the results for visualization.\n",
|
426 |
+
" plt.figure(figsize=(12, 7))\n",
|
427 |
+
" plt.plot(feature_counts_to_test, val_scores, 'o-', label=f'Validation Set Score ({scoring_func.__name__})')\n",
|
428 |
+
" plt.axvline(x=feature_counts_to_test[best_score_idx], color='grey', linestyle=':', label=f'Absolute Best Score ({feature_counts_to_test[best_score_idx]} features)')\n",
|
429 |
+
" plt.axvline(x=optimal_n_features, color='r', linestyle='--', label=f'Chosen Parsimonious Model ({optimal_n_features} features)')\n",
|
430 |
+
" plt.title(f'[{model_name}] Performance vs. Number of Features')\n",
|
431 |
+
" plt.xlabel('Number of Top Features Used')\n",
|
432 |
+
" plt.ylabel('Score')\n",
|
433 |
+
" plt.legend()\n",
|
434 |
+
" plt.grid(True, which='both', linestyle='--', linewidth=0.5)\n",
|
435 |
+
" plt.gca().invert_xaxis()\n",
|
436 |
+
" plt.show()\n",
|
437 |
+
" \n",
|
438 |
+
" return best_features\n",
|
439 |
+
"\n",
|
440 |
+
"# Ensure logging tables exist before training begins.\n",
|
441 |
+
"create_metrics_table_if_not_exists(session, METRICS_TABLE_NAME)\n",
|
442 |
+
"create_feature_importance_table_if_not_exists(session, FEATURE_IMPORTANCE_TABLE_NAME)\n",
|
443 |
+
"\n",
|
444 |
+
"# =============================================================================\n",
|
445 |
+
"# 4. MODEL TRAINING & EVALUATION\n",
|
446 |
+
"# =============================================================================\n",
|
447 |
+
"\n",
|
448 |
+
"# --- Define dynamic model name suffixes based on configuration ---\n",
|
449 |
+
"MODEL_NAME_SUFFIX = \"Fast\" if FAST_MODE else \"FeatureSelected\"\n",
|
450 |
+
"if EXCLUDE_FEATURE_PREFIXES:\n",
|
451 |
+
" cleaned_prefixes = [p.strip('_') for p in EXCLUDE_FEATURE_PREFIXES]\n",
|
452 |
+
" exclusion_tag = \"-\".join(cleaned_prefixes)\n",
|
453 |
+
" EXCLUSION_SUFFIX = f\"_Excl-{exclusion_tag}\"\n",
|
454 |
+
"else:\n",
|
455 |
+
" EXCLUSION_SUFFIX = \"\"\n",
|
456 |
+
"\n",
|
457 |
+
"# --- 4.1 Model 1: Predicting Length of Stay (Regression) ---\n",
|
458 |
+
"print(\"\\n\" + \"=\"*80)\n",
|
459 |
+
"print(\"--- Training Model 1: Length of Stay (LOS) with Gamma Objective ---\")\n",
|
460 |
+
"print(\"=\"*80)\n",
|
461 |
+
"TARGET_LOS = 'length_of_stay'\n",
|
462 |
+
"los_model, best_los_features = None, None\n",
|
463 |
+
"\n",
|
464 |
+
"if TARGET_LOS not in df_pd.columns:\n",
|
465 |
+
" print(f\"Error: Target column '{TARGET_LOS}' not found. Skipping LOS model.\")\n",
|
466 |
+
"else:\n",
|
467 |
+
" y_los = df_pd[TARGET_LOS].astype(float)\n",
|
468 |
+
" # The Gamma objective requires positive target values.\n",
|
469 |
+
" if (y_los <= 0).any():\n",
|
470 |
+
" print(f\"Warning: Found {(y_los <= 0).sum()} non-positive values in '{TARGET_LOS}'. Clamping to a small positive number.\")\n",
|
471 |
+
" y_los = y_los.clip(lower=0.001)\n",
|
472 |
+
"\n",
|
473 |
+
" X_train_los, X_test_los, y_train_los, y_test_los = train_test_split(X, y_los, test_size=0.2, random_state=42)\n",
|
474 |
+
" X_train_fs_los, X_val_fs_los, y_train_fs_los, y_val_fs_los = train_test_split(X_train_los, y_train_los, test_size=0.25, random_state=42)\n",
|
475 |
+
"\n",
|
476 |
+
" base_los_model = xgb.XGBRegressor(objective='reg:gamma', random_state=42, n_estimators=1000, learning_rate=0.05, max_depth=7, subsample=0.8, colsample_bytree=0.8, eval_metric='gamma-deviance')\n",
|
477 |
+
" \n",
|
478 |
+
" # Determine the feature set to use.\n",
|
479 |
+
" if FAST_MODE:\n",
|
480 |
+
" print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n",
|
481 |
+
" best_los_features = X.columns.tolist()\n",
|
482 |
+
" else:\n",
|
483 |
+
" best_los_features = find_best_feature_subset(\n",
|
484 |
+
" model=base_los_model,\n",
|
485 |
+
" X_train=X_train_fs_los, y_train=y_train_fs_los,\n",
|
486 |
+
" X_val=X_val_fs_los, y_val=y_val_fs_los,\n",
|
487 |
+
" scoring_func=mean_squared_error,\n",
|
488 |
+
" higher_is_better=False,\n",
|
489 |
+
" model_name=\"Length of Stay (XGBoost)\"\n",
|
490 |
+
" )\n",
|
491 |
+
"\n",
|
492 |
+
" print(f\"\\nTraining final LOS model using {len(best_los_features)} features...\")\n",
|
493 |
+
" los_model = clone(base_los_model)\n",
|
494 |
+
" los_model.set_params(early_stopping_rounds=50)\n",
|
495 |
+
" eval_set = [(X_val_fs_los[best_los_features], y_val_fs_los)]\n",
|
496 |
+
" los_model.fit(X_train_los[best_los_features], y_train_los, eval_set=eval_set, verbose=False)\n",
|
497 |
+
" print(f\"Optimal number of trees found via early stopping: {los_model.best_iteration}\")\n",
|
498 |
+
"\n",
|
499 |
+
" y_pred_los = los_model.predict(X_test_los[best_los_features])\n",
|
500 |
+
"\n",
|
501 |
+
" print(\"\\nLOS Model - Test Set Evaluation:\")\n",
|
502 |
+
" los_metrics = calculate_regression_metrics(y_test_los, y_pred_los)\n",
|
503 |
+
" for k, v in los_metrics.items(): print(f\" {k}: {v:.4f}\")\n",
|
504 |
+
"\n",
|
505 |
+
" # Log metrics and feature importances.\n",
|
506 |
+
" los_model_name = f\"Inpatient_LOS_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
|
507 |
+
" log_model_metrics_to_snowflake(session, MODEL_RUN_ID, los_model_name, TARGET_LOS, los_metrics, \"Regression\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n",
|
508 |
+
" log_feature_importances_to_snowflake(session, los_model, best_los_features, MODEL_RUN_ID, los_model_name, TARGET_LOS, FEATURE_IMPORTANCE_TABLE_NAME)\n",
|
509 |
+
"\n",
|
510 |
+
"# --- 4.2 Model 2: Predicting Readmission (Binary Classification) ---\n",
|
511 |
+
"print(\"\\n\" + \"=\"*80)\n",
|
512 |
+
"print(\"--- Training Model 2: Calibrated Readmission Probability ---\")\n",
|
513 |
+
"print(\"=\"*80)\n",
|
514 |
+
"TARGET_READMISSION = 'readmission_numerator'\n",
|
515 |
+
"DENOMINATOR_COL = 'readmission_denominator'\n",
|
516 |
+
"calibrated_readmission_model, best_readmission_features = None, None\n",
|
517 |
+
"\n",
|
518 |
+
"if TARGET_READMISSION not in df_pd.columns or DENOMINATOR_COL not in df_pd.columns:\n",
|
519 |
+
" print(f\"Error: Required columns '{TARGET_READMISSION}' or '{DENOMINATOR_COL}' not found. Skipping Readmission model.\")\n",
|
520 |
+
"else:\n",
|
521 |
+
" # Filter data to only include encounters eligible for readmission.\n",
|
522 |
+
" readmission_filter_mask = df_pd[DENOMINATOR_COL] == 1\n",
|
523 |
+
" if readmission_filter_mask.sum() == 0:\n",
|
524 |
+
" print(\"Error: The filter condition resulted in zero encounters. Skipping Readmission model.\")\n",
|
525 |
+
" else:\n",
|
526 |
+
" print(f\"Filtering data for Readmission Model where '{DENOMINATOR_COL}' = 1 ({readmission_filter_mask.sum()} rows).\")\n",
|
527 |
+
" X_readmission = X.loc[readmission_filter_mask].reset_index(drop=True)\n",
|
528 |
+
" y_readmission = df_pd.loc[readmission_filter_mask, TARGET_READMISSION].reset_index(drop=True)\n",
|
529 |
+
" \n",
|
530 |
+
" # Split data: 60% base train, 20% calibration, 20% test\n",
|
531 |
+
" stratify_readmission = y_readmission if len(np.unique(y_readmission)) > 1 else None\n",
|
532 |
+
" X_train_full, X_test_read, y_train_full, y_test_read = train_test_split(X_readmission, y_readmission, test_size=0.2, random_state=42, stratify=stratify_readmission)\n",
|
533 |
+
" stratify_y_train_full = y_train_full if len(np.unique(y_train_full)) > 1 else None\n",
|
534 |
+
" X_train_base, X_calib_read, y_train_base, y_calib_read = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42, stratify=stratify_y_train_full)\n",
|
535 |
+
" print(f\"Data split for readmission: Base train: {X_train_base.shape[0]}, Calibration: {X_calib_read.shape[0]}, Test: {X_test_read.shape[0]}\")\n",
|
536 |
+
" \n",
|
537 |
+
" base_readmit_model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000, solver='liblinear')\n",
|
538 |
+
" \n",
|
539 |
+
" # Determine the feature set to use.\n",
|
540 |
+
" if FAST_MODE:\n",
|
541 |
+
" print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n",
|
542 |
+
" best_readmission_features = X_train_base.columns.tolist()\n",
|
543 |
+
" else:\n",
|
544 |
+
" best_readmission_features = find_best_feature_subset(\n",
|
545 |
+
" model=base_readmit_model, X_train=X_train_base, y_train=y_train_base, X_val=X_calib_read, y_val=y_calib_read,\n",
|
546 |
+
" scoring_func=roc_auc_score, higher_is_better=True, model_name=\"Readmission (Logistic Regression)\"\n",
|
547 |
+
" )\n",
|
548 |
+
"\n",
|
549 |
+
" print(f\"\\nTraining final Readmission model pipeline using {len(best_readmission_features)} features...\")\n",
|
550 |
+
" base_model_for_calib = clone(base_readmit_model)\n",
|
551 |
+
" base_model_for_calib.fit(X_train_base[best_readmission_features], y_train_base)\n",
|
552 |
+
" \n",
|
553 |
+
" # Log feature importances from the base (uncalibrated) model.\n",
|
554 |
+
" uncal_read_model_name = f\"Inpatient_Readmission_Base_Uncalibrated_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
|
555 |
+
" log_feature_importances_to_snowflake(session, base_model_for_calib, best_readmission_features, MODEL_RUN_ID, uncal_read_model_name, TARGET_READMISSION, FEATURE_IMPORTANCE_TABLE_NAME)\n",
|
556 |
+
" \n",
|
557 |
+
" # Evaluate and log metrics for the uncalibrated model for comparison.\n",
|
558 |
+
" y_pred_proba_uncal = base_model_for_calib.predict_proba(X_test_read[best_readmission_features])[:, 1]\n",
|
559 |
+
" uncalibrated_metrics = calculate_binary_classification_proba_metrics(y_test_read, y_pred_proba_uncal)\n",
|
560 |
+
" log_model_metrics_to_snowflake(session, MODEL_RUN_ID, uncal_read_model_name, TARGET_READMISSION + \"_Probability\", uncalibrated_metrics, \"Binary_Uncalibrated\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n",
|
561 |
+
" \n",
|
562 |
+
" # Calibrate the model on the held-out calibration set.\n",
|
563 |
+
" calibrated_readmission_model = CalibratedClassifierCV(base_model_for_calib, method='isotonic', cv='prefit')\n",
|
564 |
+
" calibrated_readmission_model.fit(X_calib_read[best_readmission_features], y_calib_read)\n",
|
565 |
+
" y_pred_proba_cal = calibrated_readmission_model.predict_proba(X_test_read[best_readmission_features])[:, 1]\n",
|
566 |
+
"\n",
|
567 |
+
" print(\"\\nCalibrated Readmission Model - Test Set Evaluation:\")\n",
|
568 |
+
" calibrated_proba_metrics = calculate_binary_classification_proba_metrics(y_test_read, y_pred_proba_cal)\n",
|
569 |
+
" for k, v in calibrated_proba_metrics.items(): print(f\" {k}: {v:.4f}\")\n",
|
570 |
+
" \n",
|
571 |
+
" cal_read_model_name = f\"Inpatient_Readmission_Calibrated_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
|
572 |
+
" log_model_metrics_to_snowflake(session, MODEL_RUN_ID, cal_read_model_name, TARGET_READMISSION + \"_Probability\", calibrated_proba_metrics, \"Binary_Calibrated\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n",
|
573 |
+
"\n",
|
574 |
+
"# --- 4.3 Model 3: Predicting Discharge Location (Multiclass Classification) ---\n",
|
575 |
+
"print(\"\\n\" + \"=\"*80)\n",
|
576 |
+
"print(\"--- Training Model 3: Calibrated Discharge Location ---\")\n",
|
577 |
+
"print(\"=\"*80)\n",
|
578 |
+
"TARGET_DISCHARGE = 'discharge_location'\n",
|
579 |
+
"calibrated_discharge_model, le_discharge, best_discharge_features = None, None, None\n",
|
580 |
+
"\n",
|
581 |
+
"if TARGET_DISCHARGE not in df_pd.columns:\n",
|
582 |
+
" print(f\"Error: Target column '{TARGET_DISCHARGE}' not found. Skipping Discharge Location model.\")\n",
|
583 |
+
"else:\n",
|
584 |
+
" le_discharge = LabelEncoder()\n",
|
585 |
+
" y_discharge_encoded = le_discharge.fit_transform(df_pd[TARGET_DISCHARGE])\n",
|
586 |
+
" num_classes_discharge = len(le_discharge.classes_)\n",
|
587 |
+
" print(f\"Discharge Location: {num_classes_discharge} classes found: {le_discharge.classes_}\")\n",
|
588 |
+
" \n",
|
589 |
+
" # Split data: 60% base train, 20% calibration, 20% test\n",
|
590 |
+
" stratify_discharge = y_discharge_encoded if num_classes_discharge > 1 else None\n",
|
591 |
+
" X_train_full_disc, X_test_disc, y_train_full_disc_enc, y_test_disc_enc = train_test_split(X, y_discharge_encoded, test_size=0.2, random_state=42, stratify=stratify_discharge)\n",
|
592 |
+
" X_train_base_disc, X_calib_disc, y_train_base_disc_enc, y_calib_disc_enc = train_test_split(X_train_full_disc, y_train_full_disc_enc, test_size=0.25, random_state=42, stratify=y_train_full_disc_enc if num_classes_discharge > 1 else None)\n",
|
593 |
+
" print(f\"Data split for discharge: Base train: {X_train_base_disc.shape[0]}, Calibration: {X_calib_disc.shape[0]}, Test: {X_test_disc.shape[0]}\")\n",
|
594 |
+
" \n",
|
595 |
+
" base_discharge_model = LogisticRegression(random_state=42, max_iter=1000, solver='lbfgs', multi_class='multinomial', class_weight='balanced')\n",
|
596 |
+
" \n",
|
597 |
+
" # Determine the feature set to use.\n",
|
598 |
+
" if FAST_MODE:\n",
|
599 |
+
" print(\"\\n[FAST MODE] Skipping feature selection. Using all available features.\")\n",
|
600 |
+
" best_discharge_features = X_train_base_disc.columns.tolist()\n",
|
601 |
+
" else:\n",
|
602 |
+
" best_discharge_features = find_best_feature_subset(\n",
|
603 |
+
" model=base_discharge_model, X_train=X_train_base_disc, y_train=y_train_base_disc_enc, X_val=X_calib_disc, y_val=y_calib_disc_enc,\n",
|
604 |
+
" scoring_func=log_loss, higher_is_better=False, model_name=\"Discharge Location (Multinomial Regression)\"\n",
|
605 |
+
" )\n",
|
606 |
+
"\n",
|
607 |
+
" print(f\"\\nTraining final Discharge Location model pipeline using {len(best_discharge_features)} features...\")\n",
|
608 |
+
" base_model_for_calib_disc = clone(base_discharge_model)\n",
|
609 |
+
" base_model_for_calib_disc.fit(X_train_base_disc[best_discharge_features], y_train_base_disc_enc)\n",
|
610 |
+
" \n",
|
611 |
+
" discharge_model_name = f\"Inpatient_Discharge_Cal_Overall_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
|
612 |
+
" log_feature_importances_to_snowflake(session, base_model_for_calib_disc, best_discharge_features, MODEL_RUN_ID, discharge_model_name, TARGET_DISCHARGE, FEATURE_IMPORTANCE_TABLE_NAME)\n",
|
613 |
+
" \n",
|
614 |
+
" # Calibrate the model. 'sigmoid' is used for one-vs-rest calibration, suitable for multiclass.\n",
|
615 |
+
" calibrated_discharge_model = CalibratedClassifierCV(base_model_for_calib_disc, method='sigmoid', cv='prefit')\n",
|
616 |
+
" calibrated_discharge_model.fit(X_calib_disc[best_discharge_features], y_calib_disc_enc)\n",
|
617 |
+
" y_pred_proba_discharge_calibrated = calibrated_discharge_model.predict_proba(X_test_disc[best_discharge_features])\n",
|
618 |
+
" y_pred_labels_discharge_calibrated = calibrated_discharge_model.predict(X_test_disc[best_discharge_features])\n",
|
619 |
+
" \n",
|
620 |
+
" print(\"\\nCalibrated Discharge Model - Test Set Evaluation:\")\n",
|
621 |
+
" calibrated_disc_metrics = calculate_multiclass_classification_metrics(y_test_disc_enc, y_pred_labels_discharge_calibrated, y_pred_proba_discharge_calibrated, le_discharge.classes_)\n",
|
622 |
+
" \n",
|
623 |
+
" # Log the overall multiclass metrics.\n",
|
624 |
+
" overall_cal_metrics_to_log = {k: v for k, v in calibrated_disc_metrics.items() if k != 'per_class_details'}\n",
|
625 |
+
" overall_cal_metrics_to_log['BRIER_SCORE'] = calibrated_disc_metrics.get('BRIER_SCORE_MACRO_AVG')\n",
|
626 |
+
" log_model_metrics_to_snowflake(session, MODEL_RUN_ID, discharge_model_name, TARGET_DISCHARGE, overall_cal_metrics_to_log, \"Multiclass_Cal_Overall\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG)\n",
|
627 |
+
" \n",
|
628 |
+
" # --- FIX: Log the per-class metrics by mapping keys correctly ---\n",
|
629 |
+
" discharge_class_model_name = f\"Inpatient_Discharge_Cal_Class_{MODEL_NAME_SUFFIX}{EXCLUSION_SUFFIX}\"\n",
|
630 |
+
" for class_detail in calibrated_disc_metrics.get('per_class_details', []):\n",
|
631 |
+
" # Create a new dict with keys the logging function expects.\n",
|
632 |
+
" per_class_metrics_to_log = {\n",
|
633 |
+
" 'BRIER_SCORE': class_detail.get('brier_score'),\n",
|
634 |
+
" 'AVG_Y_PRED': class_detail.get('avg_pred_proba'),\n",
|
635 |
+
" 'AVG_Y_TRUE': class_detail.get('true_proportion'),\n",
|
636 |
+
" 'PRED_RATIO': class_detail.get('proba_ratio'),\n",
|
637 |
+
" }\n",
|
638 |
+
" log_model_metrics_to_snowflake(\n",
|
639 |
+
" session, MODEL_RUN_ID, discharge_class_model_name,\n",
|
640 |
+
" f\"{TARGET_DISCHARGE}_Class_{class_detail['class_name']}\",\n",
|
641 |
+
" per_class_metrics_to_log, # Use the correctly mapped dictionary\n",
|
642 |
+
" \"Multiclass_Cal_ClassDetail\", METRICS_TABLE_NAME, MODEL_SOURCE_TAG, MODEL_YEAR_TAG\n",
|
643 |
+
" )\n",
|
644 |
+
"\n",
|
645 |
+
" print(\"\\nCalibrated Classification Report:\\n\", classification_report(y_test_disc_enc, y_pred_labels_discharge_calibrated, target_names=le_discharge.classes_.astype(str), zero_division=0, digits=4))\n",
|
646 |
+
"\n",
|
647 |
+
"\n",
|
648 |
+
"# =============================================================================\n",
|
649 |
+
"# 5. MODEL SAVING\n",
|
650 |
+
"# =============================================================================\n",
|
651 |
+
"print(\"\\n\" + \"=\"*80)\n",
|
652 |
+
"print(\"--- Saving Models and Artifacts ---\")\n",
|
653 |
+
"print(\"=\"*80)\n",
|
654 |
+
"\n",
|
655 |
+
"# Bundle all necessary objects for deployment into a single dictionary.\n",
|
656 |
+
"inpatient_models_bundle = {\n",
|
657 |
+
" 'los_model': los_model,\n",
|
658 |
+
" 'readmission_model': calibrated_readmission_model,\n",
|
659 |
+
" 'discharge_model': calibrated_discharge_model,\n",
|
660 |
+
" 'feature_columns_los': best_los_features,\n",
|
661 |
+
" 'feature_columns_readmission': best_readmission_features,\n",
|
662 |
+
" 'feature_columns_discharge': best_discharge_features,\n",
|
663 |
+
" 'le_discharge': le_discharge,\n",
|
664 |
+
" 'model_run_id': MODEL_RUN_ID,\n",
|
665 |
+
" 'fast_mode': FAST_MODE,\n",
|
666 |
+
" 'excluded_feature_prefixes': EXCLUDE_FEATURE_PREFIXES\n",
|
667 |
+
"}\n",
|
668 |
+
"\n",
|
669 |
+
"# Create a descriptive file name for the bundle.\n",
|
670 |
+
"BUNDLE_SUFFIX = \"fast\" if FAST_MODE else \"fs\"\n",
|
671 |
+
"EXCLUSION_FILE_TAG = f\"_excl_{'-'.join([p.strip('_').lower() for p in EXCLUDE_FEATURE_PREFIXES])}\" if EXCLUDE_FEATURE_PREFIXES else \"\"\n",
|
672 |
+
"BUNDLE_FILE_NAME = f'inpatient_models_bundle_{MODEL_SOURCE_TAG}_{MODEL_YEAR_TAG}_{BUNDLE_SUFFIX}{EXCLUSION_FILE_TAG}.pkl'\n",
|
673 |
+
"\n",
|
674 |
+
"# Save the bundle locally using pickle.\n",
|
675 |
+
"with open(BUNDLE_FILE_NAME, 'wb') as f:\n",
|
676 |
+
" pickle.dump(inpatient_models_bundle, f)\n",
|
677 |
+
"print(f\"Models bundled and saved locally to: {BUNDLE_FILE_NAME}\")\n",
|
678 |
+
"\n",
|
679 |
+
"# Upload the local bundle file to the specified Snowflake stage.\n",
|
680 |
+
"put_result = session.file.put(BUNDLE_FILE_NAME, SNOWFLAKE_STAGE_NAME, overwrite=True)\n",
|
681 |
+
"if put_result[0].status == 'UPLOADED':\n",
|
682 |
+
" print(f\"Model bundle successfully uploaded to Snowflake stage: {SNOWFLAKE_STAGE_NAME}\")\n",
|
683 |
+
"else:\n",
|
684 |
+
" print(f\"Error uploading model bundle. Status: {put_result[0].status}, Message: {put_result[0].message}\")\n",
|
685 |
+
"\n",
|
686 |
+
"file_size_mb = os.path.getsize(BUNDLE_FILE_NAME) / (1024 * 1024)\n",
|
687 |
+
"print(f\"Saved local bundle file size: {file_size_mb:.2f} MB\")\n",
|
688 |
+
"\n",
|
689 |
+
"print(f\"\\n✅ Script finished ({'FAST MODE' if FAST_MODE else 'FULL MODE'}).\")"
|
690 |
+
]
|
691 |
+
}
|
692 |
+
],
|
693 |
"metadata": {
|
694 |
"kernelspec": {
|
695 |
"display_name": "Streamlit Notebook",
|
696 |
"name": "streamlit"
|
697 |
},
|
698 |
"lastEditStatus": {
|
699 |
+
"authorEmail": "[email protected]",
|
700 |
"authorId": "374530764978",
|
701 |
"authorName": "BRAD",
|
702 |
+
"lastEditTime": 1750870004305,
|
703 |
+
"notebookId": "6rovstl42ft2p5id6gwo",
|
704 |
+
"sessionId": "65561efa-4d18-4072-8f4d-10240cb902ba"
|
705 |
}
|
706 |
},
|
|
|
707 |
"nbformat": 4,
|
708 |
+
"nbformat_minor": 5
|
709 |
+
}
|
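Two short sketches follow for readers of this diff; neither is part of the commit.

First, the `find_best_feature_subset` helper in the restored source keeps the smallest feature subset whose validation score lands within a 1% tolerance of the best score. A standalone toy run of that rule, with made-up numbers:

```python
# Toy illustration (made-up numbers) of the parsimony rule in
# find_best_feature_subset: among all subset sizes whose validation score
# is within TOLERANCE of the best, keep the smallest one.
import numpy as np

feature_counts = [500, 250, 100, 50, 10]          # tested largest -> smallest
val_scores = [0.810, 0.820, 0.815, 0.800, 0.700]  # e.g. ROC AUC (higher is better)

TOLERANCE = 0.01
best_score = np.max(val_scores)
threshold = best_score * (1 - TOLERANCE)          # 0.82 * 0.99 = 0.8118
candidates = np.where(np.array(val_scores) >= threshold)[0]

# Counts are sorted descending, so the last candidate is the smallest set.
chosen = feature_counts[candidates[-1]]
print(chosen)  # -> 100 features, not the 250 that scored best
```

Second, a minimal consumer-side sketch of how the pickle bundle the script uploads might be pulled back out of the stage and used for scoring. The file name assumes the committed defaults (`FAST_MODE = False`, `EXCLUDE_FEATURE_PREFIXES = ["hcc_"]`), and note that `session.file.put` gzip-compresses by default, so the staged copy is typically `<name>.pkl.gz`:

```python
# Hypothetical inference-side loader; adjust names to match your run.
import gzip
import pickle

from snowflake.snowpark.context import get_active_session

session = get_active_session()

BUNDLE_FILE_NAME = "inpatient_models_bundle_medicare_lds_2023_fs_excl_hcc.pkl"
STAGE = "@BENCHMARKS.BENCHMARK_STAGE"

# Download the (gzip-compressed) bundle from the stage to a local directory.
session.file.get(f"{STAGE}/{BUNDLE_FILE_NAME}.gz", "/tmp")

with gzip.open(f"/tmp/{BUNDLE_FILE_NAME}.gz", "rb") as f:
    bundle = pickle.load(f)

def score(new_X):
    """new_X must be prepared exactly like X in the training script
    (lowercased columns, same one-hot encoding)."""
    los = bundle["los_model"].predict(new_X[bundle["feature_columns_los"]])
    readmit_proba = bundle["readmission_model"].predict_proba(
        new_X[bundle["feature_columns_readmission"]]
    )[:, 1]
    discharge = bundle["le_discharge"].inverse_transform(
        bundle["discharge_model"].predict(new_X[bundle["feature_columns_discharge"]])
    )
    return los, readmit_proba, discharge
```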
inpatient_feature_importance.csv
CHANGED
@@ -2091,4 +2091,4 @@ MODEL_RUN_ID,MODEL_NAME,TARGET_NAME,FEATURE_NAME,IMPORTANCE_VALUE,IMPORTANCE_RANK,EVAL_TS
|
|
2091 |
03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_85,0.04926689992,745,2025-06-18 21:11:35.095
|
2092 |
03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_107,0.03908581065,746,2025-06-18 21:11:35.095
|
2093 |
03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_108,0.03781863024,747,2025-06-18 21:11:35.095
|
2094 |
-
03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,cms_ischemic_heart_disease,0.02849259634,748,2025-06-18 21:11:35.095
|
|
|
2091 |
03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_85,0.04926689992,745,2025-06-18 21:11:35.095
|
2092 |
03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_107,0.03908581065,746,2025-06-18 21:11:35.095
|
2093 |
03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,hcc_108,0.03781863024,747,2025-06-18 21:11:35.095
|
2094 |
+
03daf6f5-4a7a-44b9-a670-e0520ec6772f,Inpatient_Discharge_Cal_Overall_FeatureSelected,discharge_location,cms_ischemic_heart_disease,0.02849259634,748,2025-06-18 21:11:35.095
|
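For reference, this CSV mirrors the MODEL_FEATURE_IMPORTANCE_INPATIENT table the notebook writes to. A small sketch (assuming the file has been downloaded locally) of how it can be inspected, e.g. to list the top-ranked features per logged model:

```python
# Hypothetical local inspection of the exported importance table.
import pandas as pd

df = pd.read_csv("inpatient_feature_importance.csv")

# Lowest IMPORTANCE_RANK = most important; take the top 10 per model.
top10 = (
    df.sort_values("IMPORTANCE_RANK")
      .groupby(["MODEL_RUN_ID", "MODEL_NAME"], as_index=False)
      .head(10)
)
print(top10[["MODEL_NAME", "FEATURE_NAME", "IMPORTANCE_VALUE", "IMPORTANCE_RANK"]])
```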