Update model.py
Browse files
model.py
CHANGED
@@ -1,88 +1,88 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
from sklearn.ensemble import HistGradientBoostingClassifier
|
3 |
-
from sklearn.model_selection import train_test_split
|
4 |
-
from sklearn.metrics import classification_report, confusion_matrix
|
5 |
-
import joblib
|
6 |
-
from lime.lime_tabular import LimeTabularExplainer
|
7 |
-
|
8 |
-
# Load data
# NOTE(review): the path literal on this line was cut off after its opening
# quote, leaving an unterminated string. 'card_transdata.csv' is the path the
# revised version of this file reads -- confirm it is the intended source.
data = pd.read_csv('card_transdata.csv')

# Features and target: 'fraud' is the label column, every other column is a
# predictor.
X = data.drop(columns=['fraud'])
y = data['fraud']

# Train/test split: hold out 30% of the rows, stratified on the label, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42
)

# Initialize a gradient-boosting classifier with class imbalance handling
# (class_weight="balanced").
model = HistGradientBoostingClassifier(
    loss="log_loss",
    class_weight="balanced",
    learning_rate=0.05,
    max_iter=200,
    max_depth=8,
    random_state=42
)

# Train on the training set
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)
|
38 |
-
|
39 |
-
"""Comment out the following lines to skip evaluation in prod"""
|
40 |
-
# print("\nTesting with all transactions...")
|
41 |
-
|
42 |
-
# # Evaluate
|
43 |
-
# print("Classification Report:")
|
44 |
-
# print(classification_report(y_test, y_pred, digits=4))
|
45 |
-
|
46 |
-
# print("\nConfusion Matrix:")
|
47 |
-
# print(confusion_matrix(y_test, y_pred))
|
48 |
-
|
49 |
-
|
50 |
-
# # Save and load the model using joblib
|
51 |
-
def save_model(model, filename='fraud_model.pkl'):
    """Serialize ``model`` to ``filename`` with joblib.

    Args:
        model: The object to write (here, the trained classifier).
        filename: Destination path; defaults to 'fraud_model.pkl'.
    """
    joblib.dump(value=model, filename=filename)
|
55 |
-
|
56 |
-
# Write the trained model to save_model's default path whenever this module
# is executed.
save_model(model)
|
57 |
-
|
58 |
-
def load_model(filename='fraud_model.pkl'):
    """Read back an object previously written with joblib.

    Args:
        filename: Path of the joblib file to read; defaults to
            'fraud_model.pkl'.

    Returns:
        The deserialized object stored in ``filename``.
    """
    # NOTE: joblib files are pickle-based; only load files from a trusted
    # source.
    return joblib.load(filename)
|
63 |
-
|
64 |
-
|
65 |
-
# Initialize the LIME explainer on the training data, so its feature
# statistics come from the same rows the model was fitted on.
explainer = LimeTabularExplainer(
    training_data=X_train.values,
    feature_names=X_train.columns.tolist(),
    class_names=['not_fraud', 'fraud'],
    mode='classification',
    # Seed LIME's sampling with the same seed used for the split and the
    # model, so explanations are reproducible from one run of the program
    # to the next.
    random_state=42,
)
|
72 |
-
|
73 |
-
def extract_top_features(single_row_df, top_n=3):
    """Summarize the LIME features that weigh most on the fraud prediction.

    Args:
        single_row_df: DataFrame holding the transaction to explain; only
            its first row is used.
        top_n: Number of features to request from LIME.

    Returns:
        A multi-line string: a header line followed by one
        " - <feature>: weight <w>" line per feature, with the weight
        formatted to four decimals.
    """
    column_names = X_train.columns.tolist()

    def predict_fraud_proba(samples):
        # LIME hands over a bare array; rebuild a DataFrame with the
        # training column names before scoring it with the model.
        return model.predict_proba(pd.DataFrame(samples, columns=column_names))

    instance = single_row_df.values[0]
    explanation = explainer.explain_instance(
        instance,
        predict_fraud_proba,
        num_features=top_n,
    )

    # (feature, weight) pairs for class index 1, i.e. 'fraud'.
    lines = [
        f" - {feat}: weight {weight:.4f}"
        for feat, weight in explanation.as_list(label=1)
    ]
    return "Transaction's top features:\n" + "\n".join(lines)
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from sklearn.ensemble import HistGradientBoostingClassifier
|
3 |
+
from sklearn.model_selection import train_test_split
|
4 |
+
from sklearn.metrics import classification_report, confusion_matrix
|
5 |
+
import joblib
|
6 |
+
from lime.lime_tabular import LimeTabularExplainer
|
7 |
+
|
8 |
+
# Read the transaction dataset from the working directory.
data = pd.read_csv("card_transdata.csv")

# 'fraud' is the label column; every other column is a predictor.
X = data.drop(columns=["fraud"])
y = data["fraud"]

# Hold out 30% of the rows for testing, stratified on the label, with a fixed
# seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Gradient-boosted trees; class_weight="balanced" is the class-imbalance
# handling.
model = HistGradientBoostingClassifier(
    random_state=42,
    max_depth=8,
    max_iter=200,
    learning_rate=0.05,
    class_weight="balanced",
    loss="log_loss",
)

# Fit on the training split only.
model.fit(X_train, y_train)

# Predictions for the held-out rows (consumed by the evaluation code).
y_pred = model.predict(X_test)
|
38 |
+
|
39 |
+
"""Comment out the following lines to skip evaluation in prod"""
|
40 |
+
# print("\nTesting with all transactions...")
|
41 |
+
|
42 |
+
# # Evaluate
|
43 |
+
# print("Classification Report:")
|
44 |
+
# print(classification_report(y_test, y_pred, digits=4))
|
45 |
+
|
46 |
+
# print("\nConfusion Matrix:")
|
47 |
+
# print(confusion_matrix(y_test, y_pred))
|
48 |
+
|
49 |
+
|
50 |
+
# # Save and load the model using joblib
|
51 |
+
def save_model(model, filename='fraud_model.pkl'):
    """Serialize ``model`` to ``filename`` with joblib.

    Args:
        model: The object to write (here, the trained classifier).
        filename: Destination path; defaults to 'fraud_model.pkl'.
    """
    joblib.dump(value=model, filename=filename)
|
55 |
+
|
56 |
+
# Write the trained model to save_model's default path whenever this module
# is executed.
save_model(model)
|
57 |
+
|
58 |
+
def load_model(filename='fraud_model.pkl'):
    """Read back an object previously written with joblib.

    Args:
        filename: Path of the joblib file to read; defaults to
            'fraud_model.pkl'.

    Returns:
        The deserialized object stored in ``filename``.
    """
    # NOTE: joblib files are pickle-based; only load files from a trusted
    # source.
    return joblib.load(filename)
|
63 |
+
|
64 |
+
|
65 |
+
# Initialize the LIME explainer on the training data, so its feature
# statistics come from the same rows the model was fitted on.
explainer = LimeTabularExplainer(
    training_data=X_train.values,
    feature_names=X_train.columns.tolist(),
    class_names=['not_fraud', 'fraud'],
    mode='classification',
    # Seed LIME's sampling with the same seed used for the split and the
    # model, so explanations are reproducible from one run of the program
    # to the next.
    random_state=42,
)
|
72 |
+
|
73 |
+
def extract_top_features(single_row_df, top_n=3):
    """Summarize the LIME features that weigh most on the fraud prediction.

    Args:
        single_row_df: DataFrame holding the transaction to explain; only
            its first row is used.
        top_n: Number of features to request from LIME.

    Returns:
        A multi-line string: a header line followed by one
        " - <feature>: weight <w>" line per feature, with the weight
        formatted to four decimals.
    """
    column_names = X_train.columns.tolist()

    def predict_fraud_proba(samples):
        # LIME hands over a bare array; rebuild a DataFrame with the
        # training column names before scoring it with the model.
        return model.predict_proba(pd.DataFrame(samples, columns=column_names))

    instance = single_row_df.values[0]
    explanation = explainer.explain_instance(
        instance,
        predict_fraud_proba,
        num_features=top_n,
    )

    # (feature, weight) pairs for class index 1, i.e. 'fraud'.
    lines = [
        f" - {feat}: weight {weight:.4f}"
        for feat, weight in explanation.as_list(label=1)
    ]
    return "Transaction's top features:\n" + "\n".join(lines)
|