OsamaMIT committed on
Commit
7da09ff
·
verified ·
1 Parent(s): 927eb52

Initial commit

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. app.py +198 -0
  3. card_transdata.csv +3 -0
  4. model.py +88 -0
  5. prompts.md +59 -0
  6. reason.py +104 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ card_transdata.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from reason import assess_fraud
3
+
4
+ css = """
5
+ .app-title {
6
+ margin: 1rem auto;
7
+ text-align: center;
8
+ }
9
+
10
+ .outer-container {
11
+ gap: 3rem;
12
+ }
13
+
14
+ .main-col-one, .main-col-two {
15
+ gap: 3rem;
16
+ }
17
+
18
+ .input-elem-row {
19
+ align-items: center;
20
+ gap: 2rem;
21
+ }
22
+
23
+ .input-elem-header p {
24
+ font-weight: 700;
25
+ font-size: 1.15rem;
26
+ }
27
+
28
+ .input-elem-desc p {
29
+ font-size: 0.9rem;
30
+ opacity: 0.6;
31
+ }
32
+
33
+ .input-elem-col-one {
34
+ gap: 0;
35
+ }
36
+
37
+ .custom-input-elem-one span {
38
+ display: none;
39
+ }
40
+
41
+ .custom-input-elem-one input {
42
+ border-radius: 6px !important;
43
+ }
44
+
45
+ .custom-input-elem-one input::-webkit-outer-spin-button,
46
+ .custom-input-elem-one input::-webkit-inner-spin-button {
47
+ -webkit-appearance: none;
48
+ margin: 0;
49
+ }
50
+
51
+ .custom-input-elem-one input[type=number] {
52
+ -moz-appearance: textfield;
53
+ }
54
+
55
+ div:has(.custom-input-elem-one), div:has(.custom-input-two), .custom-input-elem-one, .custom-input-two {
56
+ padding: 0;
57
+ margin: 0;
58
+ border: none;
59
+ background: none;
60
+ }
61
+
62
+ .custom-input-two {
63
+ display: flex;
64
+ justify-content: center;
65
+ }
66
+
67
+ .custom-input-two input[type=checkbox] {
68
+ height: 1.5rem;
69
+ width: 1.5rem;
70
+ border-width: 2px;
71
+ }
72
+
73
+ .button-row {
74
+ margin: 3rem auto;
75
+ }
76
+
77
+ .fraud-button {
78
+ font-weight: 700;
79
+ border: none;
80
+ padding: 0.5rem 1rem;
81
+ border-radius: 10px;
82
+ font-size: 1.15rem;
83
+ width: 100%;
84
+ max-width: 500px;
85
+ display: block;
86
+ margin: 0 auto;
87
+ transition: 0.3s ease;
88
+ }
89
+
90
+ .fraud-button:hover,
91
+ .fraud-button:focus {
92
+ outline: none;
93
+ box-shadow: rgba(100, 100, 111, 0.2) 0px 7px 29px 0px;
94
+ }
95
+
96
+ @media screen and (max-width: 600px) {
97
+ .fraud-button {
98
+ width: 100%;
99
+ max-width: 100%;
100
+ }
101
+
102
+ .custom-input-two {
103
+ justify-content: flex-start !important;
104
+ }
105
+ }
106
+
107
+ .output-box textarea {
108
+ font-size: 1rem;
109
+ }
110
+
111
+ """
112
+
113
+
114
# Build the Gradio UI. The layout is two columns of inputs (numeric fields on
# the left, yes/no checkboxes on the right), followed by a submit button and a
# text box that shows the model's explanation.
with gr.Blocks(theme=gr.themes.Base(font=[gr.themes.GoogleFont("Rubik"), "Arial", "sans-serif"]), css=css) as demo:
    gr.Markdown("# AI-Powered Fraud Detection for Merchants & Analysts", elem_classes="app-title")
    with gr.Row(elem_classes="outer-container"):
        # Left column: numeric transaction details.
        with gr.Column(elem_classes="main-col-one"):
            with gr.Row(elem_classes="input-elem-row"):
                with gr.Column(elem_classes="input-elem-col-one"):
                    gr.Markdown("Transaction Amount ($)", elem_classes="input-elem-header")
                    gr.Markdown("The total amount of the transaction in US dollars", elem_classes="input-elem-desc")
                with gr.Column():
                    transactionAmount = gr.Number(value=None, elem_classes="custom-input-elem-one")

            with gr.Row(elem_classes="input-elem-row"):
                with gr.Column(elem_classes="input-elem-col-one"):
                    gr.Markdown("Customer Median Spend ($)", elem_classes="input-elem-header")
                    gr.Markdown("This customer’s typical (median) purchase amount. Used to detect unusual spending.", elem_classes="input-elem-desc")
                with gr.Column():
                    customerMedianSpend = gr.Number(value=None, elem_classes="custom-input-elem-one")

            with gr.Row(elem_classes="input-elem-row"):
                with gr.Column(elem_classes="input-elem-col-one"):
                    gr.Markdown("Distance From Home (km)", elem_classes="input-elem-header")
                    gr.Markdown("How far the customer was from their registered address when the transaction occurred.", elem_classes="input-elem-desc")
                with gr.Column():
                    distanceFromHome = gr.Number(value=None, elem_classes="custom-input-elem-one")

            with gr.Row(elem_classes="input-elem-row"):
                with gr.Column(elem_classes="input-elem-col-one"):
                    gr.Markdown("Distance From Last Transaction (km)", elem_classes="input-elem-header")
                    gr.Markdown("Distance between this transaction and the customer's previous one, in kilometers. Helps detect impossible travel.", elem_classes="input-elem-desc")
                with gr.Column():
                    distanceFromLastTransaction = gr.Number(value=None, elem_classes="custom-input-elem-one")

        # Right column: boolean transaction flags.
        with gr.Column(elem_classes="main-col-two"):
            with gr.Row(elem_classes="input-elem-row"):
                with gr.Column(elem_classes="input-elem-col-one"):
                    gr.Markdown("Repeat Retailer", elem_classes="input-elem-header")
                    gr.Markdown("Has the customer made purchases from this merchant before?", elem_classes="input-elem-desc")
                with gr.Column():
                    repeatRetailer = gr.Checkbox(label="", elem_classes="custom-input-two")

            with gr.Row(elem_classes="input-elem-row"):
                with gr.Column(elem_classes="input-elem-col-one"):
                    gr.Markdown("Used Chip", elem_classes="input-elem-header")
                    gr.Markdown("Was the transaction done using the credit card's chip (EMV) instead of swipe or manual entry?", elem_classes="input-elem-desc")
                with gr.Column():
                    usedChip = gr.Checkbox(label="", elem_classes="custom-input-two")

            with gr.Row(elem_classes="input-elem-row"):
                with gr.Column(elem_classes="input-elem-col-one"):
                    gr.Markdown("Used PIN", elem_classes="input-elem-header")
                    gr.Markdown("Was a PIN number entered during the transaction?", elem_classes="input-elem-desc")
                with gr.Column():
                    usedPin = gr.Checkbox(label="", elem_classes="custom-input-two")

            with gr.Row(elem_classes="input-elem-row"):
                with gr.Column(elem_classes="input-elem-col-one"):
                    gr.Markdown("Online Order", elem_classes="input-elem-header")
                    gr.Markdown("Was this transaction placed through an online store (e.g. e-commerce, app)?", elem_classes="input-elem-desc")
                with gr.Column():
                    onlineOrder = gr.Checkbox(label="", elem_classes="custom-input-two")

    with gr.Row(elem_classes="button-row"):
        checkFraud = gr.Button("Check for Fraud", elem_classes="fraud-button")

    with gr.Row():
        output_box = gr.Textbox(label="Output", lines=3, elem_classes="output-box")

    # Component values are handed to `fn` positionally, in list order, so this
    # list must mirror the parameter order of reason.assess_fraud:
    #   (distanceFromHome, distanceFromLastTransaction, transactionAmount,
    #    customerMedianSpend, repeatRetailer, usedChip, usedPin, onlineOrder)
    # Listing the amount fields first would silently swap money and distance.
    checkFraud.click(
        fn=assess_fraud,
        inputs=[
            distanceFromHome,
            distanceFromLastTransaction,
            transactionAmount,
            customerMedianSpend,
            repeatRetailer,
            usedChip,
            usedPin,
            onlineOrder,
        ],
        outputs=output_box,
    )


if __name__ == "__main__":
    demo.launch()
card_transdata.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7013c329bae9ef0ef32d65dbeb095694f0c7cd6c00ff74b2d0087fa1c67b8717
3
+ size 76277977
model.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.ensemble import HistGradientBoostingClassifier
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import classification_report, confusion_matrix
5
+ import joblib
6
+ from lime.lime_tabular import LimeTabularExplainer
7
+
8
# Load data
# NOTE: everything in this section runs at import time, so any module that
# does `from model import ...` triggers a full read of the CSV and a retrain.
from pathlib import Path

# Keep the historical 'src/' location (relative to the working directory) when
# it exists; otherwise fall back to a CSV sitting next to this module, so the
# import does not fail when the files live side by side.
_data_path = Path('src/card_transdata.csv')
if not _data_path.exists():
    _data_path = Path(__file__).resolve().parent / 'card_transdata.csv'
data = pd.read_csv(_data_path)

# Features and target: every column except 'fraud' is a model input.
X = data.drop(columns=['fraud'])
y = data['fraud']

# Train/test split: 70/30, stratified on the label, fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Initialize a gradient-boosting classifier with class imbalance handling
model = HistGradientBoostingClassifier(
    loss="log_loss",
    class_weight="balanced",
    learning_rate=0.05,
    max_iter=200,
    max_depth=8,
    random_state=42,
)

# Train on the training set
model.fit(X_train, y_train)

# Predict on the test set; only consumed by the (optional) evaluation printout.
y_pred = model.predict(X_test)
38
+
39
+ """Comment out the following lines to skip evaluation in prod"""
40
+ # print("\nTesting with all transactions...")
41
+
42
+ # # Evaluate
43
+ # print("Classification Report:")
44
+ # print(classification_report(y_test, y_pred, digits=4))
45
+
46
+ # print("\nConfusion Matrix:")
47
+ # print(confusion_matrix(y_test, y_pred))
48
+
49
+
50
+ # # Save and load the model using joblib
51
def save_model(model, filename='fraud_model.pkl'):
    """Persist a model object to disk with joblib.

    Args:
        model: The object to serialize.
        filename: Destination path; a relative path is resolved against the
            current working directory.
    """
    joblib.dump(value=model, filename=filename)


# Write out the classifier trained above every time this module is imported.
save_model(model)
57
+
58
def load_model(filename='fraud_model.pkl'):
    """Read a previously saved model back from disk.

    Args:
        filename: Path of the joblib file to read.

    Returns:
        The deserialized object stored in `filename`.
    """
    # NOTE(review): joblib.load unpickles the file, so only point this at
    # artifacts produced by trusted code.
    return joblib.load(filename)
63
+
64
+
65
+ # Initialize LIME explainer on training data
66
# Module-level LIME explainer, built once from the training split so that
# extract_top_features() can reuse it for every request.
explainer = LimeTabularExplainer(
    mode='classification',
    class_names=['not_fraud', 'fraud'],
    feature_names=X_train.columns.tolist(),
    training_data=X_train.values,
)
72
+
73
def extract_top_features(single_row_df, top_n=3):
    """Describe the features LIME ranks as most influential for one transaction.

    Args:
        single_row_df: DataFrame whose first row is the transaction to explain.
        top_n: How many features to include in the explanation.

    Returns:
        A multi-line string: a header followed by one " - feature: weight"
        line per feature, with weights taken for the 'fraud' class (label 1).
    """
    def predict_proba_from_array(arr):
        # Rebuild a frame with the training column names around the array the
        # explainer passes in, then score it with the classifier.
        frame = pd.DataFrame(arr, columns=X_train.columns.tolist())
        return model.predict_proba(frame)

    exp = explainer.explain_instance(
        single_row_df.values[0],
        predict_proba_from_array,
        num_features=top_n,
    )

    # (feature, weight) pairs for the fraud prediction, one bullet per pair.
    bullet_lines = [
        f" - {feat}: weight {weight:.4f}"
        for feat, weight in exp.as_list(label=1)
    ]
    return "Transaction's top features:\n" + "\n".join(bullet_lines)
prompts.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Fraud Detection Prompts
2
+
3
+
4
+ ### Placeholder Prompt
5
+ ```
6
+ f"Transaction classified as **{status}**.\n"
7
+ f"Top contributing factors according to the LIME module:\n{feat_str}\n\n"
8
+ "Please explain why and recommend next investigative steps."
9
+ ```
10
+
11
+ ### Prompt V1
12
+ ```
13
+ "You are a professional fraud analyst assisting in reviewing a flagged transaction.\n"
14
+ f"The transaction is classified as **{status}**.\n"
15
+ f"The top contributing factors according to the LIME module:\n{feat_str}\n\n"
16
+ "Briefly explain why this transaction was flagged as such based on the top contributing features.\n"
17
+ "Assess the likelihood of fraud based on the features and their influence\n"
18
+ "Recommend next investigative steps that a business user or fraud team should take.\n"
19
+ "Respond in a formal but concise tone. Your explanation should be understandable to both technical and non-technical users.\n"
20
+ ```
21
+
22
+ ### Prompt V2
23
+ *implements chain of thought lightly*
24
+ ```
25
+ "You are a professional fraud analyst assisting in reviewing a flagged transaction.\n"
26
+ f"The transaction is classified as **{status}**.\n"
27
+ f"The top contributing factors according to the LIME module:\n{feat_str}\n\n"
28
+ "Think step-by-step through the features and their weights to understand the model's reasoning.\n"
29
+ "Then:\n"
30
+ "Briefly explain why this transaction was flagged as such based on the top contributing features.\n"
31
+ "Assess the likelihood of fraud based on the features and their influence\n"
32
+ "Recommend next investigative steps that a business user or fraud team should take.\n"
33
+ "Respond in a formal but concise tone. Your explanation should be understandable to both technical and non-technical users.\n"
34
+ ```
35
+
36
+ ### Explicit Chain of Thought
37
+ *add this to the prompt to perform verbose reasoning before making a decision*
38
+ ```
39
+ "Walk through your reasoning step-by-step before reaching your conclusions. Show how each feature contributes to your fraud assessment.\n"
40
+ ```
41
+
42
+ ### Optional Guidance For Output Formatting
43
+ *add this to the prompt for formatting the output from the LLM*
44
+ ```
45
+ Format your response using **Markdown** as follows:
46
+
47
+ **Prediction**: FRAUD
48
+ **Likelihood of Fraud**: (Low / Moderate / High)
49
+
50
+ **Reasoning**:
51
+ - Bullet point 1
52
+ - Bullet point 2
53
+ - Bullet point 3
54
+
55
+ **Recommended Next Steps**:
56
+ - Step 1
57
+ - Step 2
58
+ - Step 3
59
+ ```
reason.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_cpp import Llama
2
+ import pandas as pd
3
+ from model import load_model, extract_top_features
4
+
5
+ # load fraud classifier from src/model.py
6
# Cached classifier instance; stays None until the first request needs it.
fraud_model = None


def get_fraud_model():
    """Return the shared fraud classifier, loading it on the first call only."""
    global fraud_model
    if fraud_model is not None:
        return fraud_model
    fraud_model = load_model()
    return fraud_model
12
+
13
+
14
+ # initialize the LLM and tokenizer
15
+ # using the model from Hugging Face
16
# Module-level LLM handle, created once at import time and reused by
# llm_reason() for every request.
llm = Llama.from_pretrained(
    filename="Nemotron-Research-Reasoning-Qwen-1.5B-Q4_K_M.gguf",
    repo_id="lmstudio-community/Nemotron-Research-Reasoning-Qwen-1.5B-GGUF",
    n_ctx=131072,  # Match training context length
    verbose=False,
    # n_gpu_layers=24  ## Optional for GPU acceleration
)
23
+
24
+ # runs the LLM reasoning
25
def llm_reason(prompt: str) -> str:
    """Send `prompt` to the LLM as a single user message and return its reply.

    Args:
        prompt: Full prompt text for the model.

    Returns:
        The message content of the first choice in the chat completion.
    """
    user_message = {"role": "user", "content": prompt}
    completion = llm.create_chat_completion(messages=[user_message])
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]
36
+
37
+ # wrapper to build context and call the LLM
38
def build_and_call_llm(transaction_df: pd.DataFrame) -> str:
    """Classify one transaction, explain it with LIME, and ask the LLM to report.

    Args:
        transaction_df: Single-row DataFrame holding the classifier's features.

    Returns:
        The raw text returned by the LLM for the assembled prompt.
    """
    # 1) get a fraud prediction + top features
    model = get_fraud_model()
    pred = model.predict(transaction_df)[0]
    feature_contributions = extract_top_features(transaction_df, top_n=3)

    # 2) assemble the prompt
    status = "FRAUD" if pred == 1 else "NORMAL"
    prompt = (
        "You are a professional fraud analyst assisting in reviewing a flagged transaction.\n"
        f"The transaction is classified as **{status}**.\n"
        f"The top contributing factors and their weights:\n{feature_contributions}\n\n"
        "Think step-by-step through the features and their weights to understand the classifier's reasoning.\n"
        "Then:\n"
        "Explain why this transaction was flagged by each conrtributing factor, explaining why and how the feature contributes to the classification.\n"
        "Assess the likelihood of fraud based on the features and their influence, including why their influence is drastic in terms of cause and effect.\n"
        "Recommend the specific and impactful next investigative steps that a business user or fraud team should take (with ample detail) in real life, independent of the features.\n"
        "Respond in a formal and explainatory tone (don't be too concise). Your explanation should be understandable to both technical and non-technical users.\n"
        " Format your response using **Markdown** as follows:\n"
        "\n"
        # The template echoes the classifier's actual verdict; a hard-coded
        # "FRAUD" here would steer the model to label NORMAL transactions as fraud.
        f" **Prediction**: {status} \n"
        " **Likelihood of Fraud**: (Low / Moderate / High) \n"
        "\n"
        " **Reasoning**: \n"
        " - Bullet point 1 \n"
        " - Bullet point 2 \n"
        " - Bullet point 3 \n"
        "\n"
        " **Recommended Next Steps**: \n"
        " - Step 1 \n"
        " - Step 2 \n"
        " - Step 3\n"
    )

    return llm_reason(prompt)
73
+
74
+
75
+
76
+ # ─── ENTRYPOINT ───────────────────────────────────────────────────────────────—
77
def assess_fraud(distanceFromHome, distanceFromLastTransaction, transactionAmount, customerMedianSpend, repeatRetailer, usedChip, usedPin, onlineOrder):
    """Score one transaction and return the LLM's written assessment.

    Args:
        distanceFromHome: Distance from the customer's home.
        distanceFromLastTransaction: Distance from the previous transaction.
        transactionAmount: Amount of this transaction.
        customerMedianSpend: Customer's median purchase amount; must be > 0
            because the model feature is transactionAmount / customerMedianSpend.
        repeatRetailer: Whether the customer has bought from this retailer before.
        usedChip: Whether the card chip was used.
        usedPin: Whether a PIN was entered.
        onlineOrder: Whether the order was placed online.

    Returns:
        The LLM's answer with any leading "<think>...</think>" section removed,
        or a short validation message when the inputs cannot be scored.
    """
    # The UI creates its number fields with value=None, so unfilled fields may
    # arrive as None; fail with a readable message instead of a TypeError.
    numeric_inputs = (
        distanceFromHome,
        distanceFromLastTransaction,
        transactionAmount,
        customerMedianSpend,
    )
    if any(value is None for value in numeric_inputs):
        return "Please fill in all numeric fields before checking for fraud."

    # Guard the ratio below against division by zero / a meaningless sign.
    if customerMedianSpend <= 0:
        return "Customer Median Spend must be greater than zero."

    data = {
        "distance_from_home": distanceFromHome,
        "distance_from_last_transaction": distanceFromLastTransaction,
        # Ratio of this transaction's price to the median purchase price
        "ratio_to_median_purchase_price": transactionAmount / customerMedianSpend,
        # Booleans are converted to float to match the training dataset
        "repeat_retailer": float(repeatRetailer),
        "used_chip": float(usedChip),
        "used_pin_number": float(usedPin),
        "online_order": float(onlineOrder),
    }
    df_row = pd.DataFrame([data])

    # build context and wait for the LLM's explanation
    explanation = build_and_call_llm(df_row)

    # Keep only what follows the reasoning section. If the model emitted no
    # closing tag, show its full reply rather than discarding it.
    _, closing_tag, answer = explanation.partition('</think>')
    if closing_tag:
        return answer.strip()
    return explanation.strip()
101
+
102
+ # if __name__ == "__main__":
103
+ # df = pd.read_csv('src/card_transdata.csv').drop(columns=['fraud']).iloc[0:1] ## Data for testing
104
+ # assess_fraud(df)