Spaces:

Agents-MCP-Hackathon
/

credit-card-fraud-detection

Sleeping

App Files Files Community

OsamaMIT committed on Jun 10

Commit

7da09ff

verified ·

1 Parent(s): 927eb52

Initial commit

Browse files

Files changed (6) hide show

.gitattributes +1 -0
app.py +198 -0
card_transdata.csv +3 -0
model.py +88 -0
prompts.md +59 -0
reason.py +104 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+card_transdata.csv filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,198 @@

+import gradio as gr
+from reason import assess_fraud
+# Custom stylesheet handed to gr.Blocks(css=...) below. The selectors target the
+# elem_classes assigned to the layout components (title, input rows, number
+# fields, checkboxes, submit button and output box).
+css = """
+.app-title {
+    margin: 1rem auto;
+    text-align: center;
+}
+.outer-container {
+    gap: 3rem;
+}
+.main-col-one, .main-col-two {
+    gap: 3rem;
+}
+.input-elem-row {
+    align-items: center;
+    gap: 2rem;
+}
+.input-elem-header p {
+    font-weight: 700;
+    font-size: 1.15rem;
+}
+.input-elem-desc p {
+    font-size: 0.9rem;
+    opacity: 0.6;
+}
+.input-elem-col-one {
+    gap: 0;
+}
+.custom-input-elem-one span {
+    display: none;
+}
+.custom-input-elem-one input {
+    border-radius: 6px !important;
+}
+.custom-input-elem-one input::-webkit-outer-spin-button,
+.custom-input-elem-one input::-webkit-inner-spin-button {
+    -webkit-appearance: none;
+    margin: 0;
+}
+.custom-input-elem-one input[type=number] {
+    -moz-appearance: textfield;
+}
+div:has(.custom-input-elem-one), div:has(.custom-input-two), .custom-input-elem-one, .custom-input-two {
+    padding: 0;
+    margin: 0;
+    border: none;
+    background: none;
+}
+.custom-input-two {
+    display: flex;
+    justify-content: center;
+}
+.custom-input-two input[type=checkbox] {
+    height: 1.5rem;
+    width: 1.5rem;
+    border-width: 2px;
+}
+.button-row {
+    margin: 3rem auto;
+}
+.fraud-button {
+    font-weight: 700;
+    border: none;
+    padding: 0.5rem 1rem;
+    border-radius: 10px;
+    font-size: 1.15rem;
+    width: 100%;
+    max-width: 500px;
+    display: block;
+    margin: 0 auto;
+    transition: 0.3s ease;
+}
+.fraud-button:hover,
+.fraud-button:focus {
+    outline: none;
+    box-shadow: rgba(100, 100, 111, 0.2) 0px 7px 29px 0px;
+}
+@media screen and (max-width: 600px) {
+    .fraud-button {
+        width: 100%;
+        max-width: 100%;
+    }
+    .custom-input-two {
+        justify-content: flex-start !important;
+    }
+}
+.output-box textarea {
+    font-size: 1rem;
+}
+"""
+with gr.Blocks(theme=gr.themes.Base(font=[gr.themes.GoogleFont("Rubik"), "Arial", "sans-serif"]), css=css) as demo:
+    gr.Markdown("# AI-Powered Fraud Detection for Merchants & Analysts", elem_classes="app-title")
+    with gr.Row(elem_classes="outer-container"):
+        with gr.Column(elem_classes="main-col-one"):
+            with gr.Row(elem_classes="input-elem-row"):
+                with gr.Column(elem_classes="input-elem-col-one"):
+                    gr.Markdown("Transaction Amount ($)", elem_classes="input-elem-header")
+                    gr.Markdown("The total amount of the transaction in US dollars", elem_classes="input-elem-desc")
+                with gr.Column():
+                    transactionAmount = gr.Number(value=None, elem_classes="custom-input-elem-one")
+            with gr.Row(elem_classes="input-elem-row"):
+                with gr.Column(elem_classes="input-elem-col-one"):
+                    gr.Markdown("Customer Median Spend ($)", elem_classes="input-elem-header")
+                    gr.Markdown("This customer’s typical (median) purchase amount. Used to detect unusual spending.", elem_classes="input-elem-desc")
+                with gr.Column():
+                    customerMedianSpend = gr.Number(value=None, elem_classes="custom-input-elem-one")
+            with gr.Row(elem_classes="input-elem-row"):
+                with gr.Column(elem_classes="input-elem-col-one"):
+                    gr.Markdown("Distance From Home (km)", elem_classes="input-elem-header")
+                    gr.Markdown("How far the customer was from their registered address when the transaction occurred.", elem_classes="input-elem-desc")
+                with gr.Column():
+                    distanceFromHome = gr.Number(value=None, elem_classes="custom-input-elem-one")
+            with gr.Row(elem_classes="input-elem-row"):
+                with gr.Column(elem_classes="input-elem-col-one"):
+                    gr.Markdown("Distance From Last Transaction (km)", elem_classes="input-elem-header")
+                    gr.Markdown("Distance between this transaction and the customer's previous one, in kilometers. Helps detect impossible travel.", elem_classes="input-elem-desc")
+                with gr.Column():
+                    distanceFromLastTransaction = gr.Number(value=None, elem_classes="custom-input-elem-one")
+        with gr.Column(elem_classes="main-col-two"):
+            with gr.Row(elem_classes="input-elem-row"):
+                with gr.Column(elem_classes="input-elem-col-one"):
+                    gr.Markdown("Repeat Retailer", elem_classes="input-elem-header")
+                    gr.Markdown("Has the customer made purchases from this merchant before?", elem_classes="input-elem-desc")
+                with gr.Column():
+                    repeatRetailer = gr.Checkbox(label="", elem_classes="custom-input-two")
+            with gr.Row(elem_classes="input-elem-row"):
+                with gr.Column(elem_classes="input-elem-col-one"):
+                    gr.Markdown("Used Chip", elem_classes="input-elem-header")
+                    gr.Markdown("Was the transaction done using the credit card's chip (EMV) instead of swipe or manual entry?", elem_classes="input-elem-desc")
+                with gr.Column():
+                    usedChip = gr.Checkbox(label="", elem_classes="custom-input-two")
+            with gr.Row(elem_classes="input-elem-row"):
+                with gr.Column(elem_classes="input-elem-col-one"):
+                    gr.Markdown("Used PIN", elem_classes="input-elem-header")
+                    gr.Markdown("Was a PIN number entered during the transaction?", elem_classes="input-elem-desc")
+                with gr.Column():
+                    usedPin = gr.Checkbox(label="", elem_classes="custom-input-two")
+            with gr.Row(elem_classes="input-elem-row"):
+                with gr.Column(elem_classes="input-elem-col-one"):
+                    gr.Markdown("Online Order", elem_classes="input-elem-header")
+                    gr.Markdown("Was this transaction placed through an online store (e.g. e-commerce, app)?", elem_classes="input-elem-desc")
+                with gr.Column():
+                    onlineOrder = gr.Checkbox(label="", elem_classes="custom-input-two")
+    with gr.Row(elem_classes="button-row"):
+                checkFraud = gr.Button("Check for Fraud", elem_classes="fraud-button")
+    with gr.Row():
+        output_box = gr.Textbox(label="Output", lines=3, elem_classes="output-box")
+        checkFraud.click(
+                    fn=assess_fraud,
+                    inputs=[
+                        transactionAmount,
+                        customerMedianSpend,
+                        distanceFromHome,
+                        distanceFromLastTransaction,
+                        repeatRetailer,
+                        usedChip,
+                        usedPin,
+                        onlineOrder
+                    ],
+                    outputs=output_box
+        )
+if __name__ == "__main__":
+    demo.launch()

card_transdata.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7013c329bae9ef0ef32d65dbeb095694f0c7cd6c00ff74b2d0087fa1c67b8717
+size 76277977

model.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import pandas as pd
+from sklearn.ensemble import HistGradientBoostingClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, confusion_matrix
+import joblib
+from lime.lime_tabular import LimeTabularExplainer
+# Everything below runs at import time: the CSV is read, the classifier is
+# trained and (further down) saved every time this module is imported.
+# Load data
+# NOTE(review): the path is relative to the working directory and points into
+# "src/"; confirm it matches where card_transdata.csv lives at deploy time.
+data = pd.read_csv('src/card_transdata.csv')
+# Features and target: every column except 'fraud' is a feature
+X = data.drop(columns=['fraud'])
+y = data['fraud']
+# Train/test split: 70/30, stratified on the label, fixed seed for repeatability
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y,
+    test_size=0.3,
+    stratify=y,
+    random_state=42
+)
+# Initialize a gradient-boosting classifier with class imbalance handling
+# (class_weight="balanced")
+model = HistGradientBoostingClassifier(
+    loss="log_loss",
+    class_weight="balanced",
+    learning_rate=0.05,
+    max_iter=200,
+    max_depth=8,
+    random_state=42
+)
+# Train on the training set
+model.fit(X_train, y_train)
+# Predict on the test set; y_pred is only read by the commented-out
+# evaluation code below
+y_pred = model.predict(X_test)
+"""Comment out the following lines to skip evaluation in prod"""
+# print("\nTesting with all transactions...")
+# # Evaluate
+# print("Classification Report:")
+# print(classification_report(y_test, y_pred, digits=4))
+# print("\nConfusion Matrix:")
+# print(confusion_matrix(y_test, y_pred))
+# # Save and load the model using joblib
+def save_model(model, filename='fraud_model.pkl'):
+    """Saves the trained model to a file."""
+    joblib.dump(model, filename)
+    #print(f"Model saved to {filename}")
+save_model(model)
+def load_model(filename='fraud_model.pkl'):
+    """Loads the saved model from a file."""
+    model = joblib.load(filename)
+    #print(f"Model loaded from {filename}")
+    return model
+# Initialize LIME explainer on training data. It is built once at import time
+# and reused by extract_top_features; class index 1 is labelled 'fraud'.
+explainer = LimeTabularExplainer(
+    training_data=X_train.values,
+    feature_names=X_train.columns.tolist(),
+    class_names=['not_fraud', 'fraud'],
+    mode='classification'
+)
+def extract_top_features(single_row_df, top_n=3):
+    # Generate explanation for the 'fraud' class (label=1)
+    exp = explainer.explain_instance(
+        single_row_df.values[0],
+        lambda arr: model.predict_proba(
+            pd.DataFrame(arr, columns=X_train.columns.tolist())
+        ),
+        num_features=top_n
+    )
+    # Get list of (feature, weight) for the fraud prediction
+    feature_weights = exp.as_list(label=1)
+    # Format the top features into a string
+    formatted = "Transaction's top features:\n"
+    formatted += "\n".join(f"  - {feat}: weight {weight:.4f}" for feat, weight in feature_weights)
+    return formatted

prompts.md ADDED Viewed

	@@ -0,0 +1,59 @@

+# Fraud Detection Prompts
+### Placeholder Prompt
+```
+f"Transaction classified as **{status}**.\n"
+f"Top contributing factors according to the LIME module:\n{feat_str}\n\n"
+"Please explain why and recommend next investigative steps."
+```
+### Prompt V1
+```
+"You are a professional fraud analyst assisting in reviewing a flagged transaction.\n"
+f"The transaction is classified as **{status}**.\n"
+f"The top contributing factors according to the LIME module:\n{feat_str}\n\n"
+"Briefly explain why this transaction was flagged as such based on the top contributing features.\n"
+"Assess the likelihood of fraud based on the features and their influence\n"
+"Recommend next investigative steps that a business user or fraud team should take.\n"
+"Respond in a formal but concise tone. Your explanation should be understandable to both technical and non-technical users.\n"
+```
+### Prompt V2
+*implements chain of thought lightly*
+```
+"You are a professional fraud analyst assisting in reviewing a flagged transaction.\n"
+f"The transaction is classified as **{status}**.\n"
+f"The top contributing factors according to the LIME module:\n{feat_str}\n\n"
+"Think step-by-step through the features and their weights to understand the model's reasoning.\n"
+"Then:\n"
+"Briefly explain why this transaction was flagged as such based on the top contributing features.\n"
+"Assess the likelihood of fraud based on the features and their influence\n"
+"Recommend next investigative steps that a business user or fraud team should take.\n"
+"Respond in a formal but concise tone. Your explanation should be understandable to both technical and non-technical users.\n"
+```
+### Explicit Chain of Thought
+*add this to the prompt to perform verbose reasoning before making a decision*
+```
+"Walk through your reasoning step-by-step before reaching your conclusions. Show how each feature contributes to your fraud assessment.\n"
+```
+### Optional Guidance For Output Formatting
+*add this to the prompt for formatting the output from the LLM*
+```
+Format your response using **Markdown** as follows:
+**Prediction**: FRAUD
+**Likelihood of Fraud**: (Low / Moderate / High)
+**Reasoning**:
+- Bullet point 1
+- Bullet point 2
+- Bullet point 3
+**Recommended Next Steps**:
+- Step 1
+- Step 2
+- Step 3
+```

reason.py ADDED Viewed

	@@ -0,0 +1,104 @@

+from llama_cpp import Llama
+import pandas as pd
+from model import load_model, extract_top_features
+# load fraud classifier from src/model.py
+fraud_model = None
+def get_fraud_model():
+    global fraud_model
+    if fraud_model is None:
+        fraud_model = load_model()
+    return fraud_model
+# Initialize the LLM once at import time from a Hugging Face repository
+# (repo_id) and a specific quantized GGUF file within it (filename).
+# NOTE(review): n_ctx=131072 requests a very large context window; this
+# presumably has a significant memory cost on CPU-only hosts — confirm it is needed.
+llm = Llama.from_pretrained(
+            repo_id="lmstudio-community/Nemotron-Research-Reasoning-Qwen-1.5B-GGUF",
+            filename="Nemotron-Research-Reasoning-Qwen-1.5B-Q4_K_M.gguf",
+            verbose=False,
+            n_ctx=131072,  # Match training context length
+            #n_gpu_layers=24 ## Optional for GPU acceleration
+            )
+# runs the LLM reasoning
+def llm_reason(prompt: str) -> str:
+    output = llm.create_chat_completion(
+    	messages = [
+    		{
+    			"role": "user",
+    			"content": prompt
+    		}
+    	]
+    )
+    return output["choices"][0]["message"]["content"]
+# wrapper to build context and call the LLM
+def build_and_call_llm(transaction_df: pd.DataFrame) -> str:
+    # 1) get a fraud prediction + top features
+    model = get_fraud_model()
+    pred = model.predict(transaction_df)[0]
+    feature_contributions = extract_top_features(transaction_df, top_n=3)
+    # 2) assemble a minimal prompt temporarily
+    status = "FRAUD" if pred == 1 else "NORMAL"
+    prompt = (
+        "You are a professional fraud analyst assisting in reviewing a flagged transaction.\n"
+        f"The transaction is classified as **{status}**.\n"
+        f"The top contributing factors and their weights:\n{feature_contributions}\n\n"
+        "Think step-by-step through the features and their weights to understand the classifier's reasoning.\n"
+        "Then:\n"
+        "Explain why this transaction was flagged by each conrtributing factor, explaining why and how the feature contributes to the classification.\n"
+        "Assess the likelihood of fraud based on the features and their influence, including why their influence is drastic in terms of cause and effect.\n"
+        "Recommend the specific and impactful next investigative steps that a business user or fraud team should take (with ample detail) in real life, independent of the features.\n"
+        "Respond in a formal and explainatory tone (don't be too concise). Your explanation should be understandable to both technical and non-technical users.\n"
+        "        Format your response using **Markdown** as follows:\n"
+        "\n"
+        "        **Prediction**: FRAUD  \n"
+        "        **Likelihood of Fraud**: (Low / Moderate / High)  \n"
+        "\n"
+        "        **Reasoning**:  \n"
+        "        - Bullet point 1  \n"
+        "        - Bullet point 2  \n"
+        "        - Bullet point 3  \n"
+        "\n"
+        "        **Recommended Next Steps**:  \n"
+        "        - Step 1  \n"
+        "        - Step 2  \n"
+        "        - Step 3\n"
+    )
+    return llm_reason(prompt)
+# ─── ENTRYPOINT ───────────────────────────────────────────────────────────────—
+def assess_fraud(distanceFromHome, distanceFromLastTransaction, transactionAmount, customerMedianSpend, repeatRetailer, usedChip, usedPin, onlineOrder):
+    data = {
+            "distance_from_home": distanceFromHome,
+            "distance_from_last_transaction": distanceFromLastTransaction,
+            "ratio_to_median_purchase_price": transactionAmount / customerMedianSpend, # Ratio of purchased price transaction to median purchase price
+            "repeat_retailer": float(repeatRetailer), # These variables are boolean and must be converted to float to match the training dataset
+            "used_chip": float(usedChip),
+            "used_pin_number": float(usedPin),
+            "online_order": float(onlineOrder),
+    }
+    df_row = pd.DataFrame([data])
+    # load data, build context, and await the LLM’s explanation
+    explanation = build_and_call_llm(df_row)
+    parts = explanation.split('</think>', 1)
+    if len(parts) > 1:
+        after_think = parts[1].strip()
+        return(after_think)
+    else:
+        return("No </think> tag found.")
+# if __name__ == "__main__":
+#     df = pd.read_csv('src/card_transdata.csv').drop(columns=['fraud']).iloc[0:1] ## Data for testing
+#     assess_fraud(df)