update piguard
Browse files- README.md +15 -10
- __pycache__/modeling_piguard.cpython-312.pyc +0 -0
- config.json +4 -4
- inference_examples.py +2 -2
- load_model.py +2 -2
- modeling_injecguard.py → modeling_piguard.py +6 -6
- save_model.py +6 -5
README.md
CHANGED
|
@@ -11,21 +11,21 @@ library_name: transformers
|
|
| 11 |
---
|
| 12 |
- Website: https://injecguard.github.io/
|
| 13 |
- Paper: https://arxiv.org/pdf/2410.22770
|
| 14 |
-
- Code Repo: https://github.com/leolee99/
|
| 15 |
|
| 16 |
## Abstract
|
| 17 |
|
| 18 |
-
Prompt injection attacks pose a critical threat to large language models (LLMs), enabling goal hijacking and data leakage. Prompt guard models, though effective in defense, suffer from over-defense—falsely flagging benign inputs as malicious due to trigger word bias. To address this issue, we introduce ***NotInject***, an evaluation dataset that systematically measures over-defense across various prompt guard models. NotInject contains 339 benign samples enriched with trigger words common in prompt injection attacks, enabling fine-grained evaluation. Our results show that state-of-the-art models suffer from over-defense issues, with accuracy dropping close to random guessing levels (60\%). To mitigate this, we propose ***
|
| 19 |
|
| 20 |
## How to Deploy
|
| 21 |
|
| 22 |
-
|
| 23 |
|
| 24 |
```
|
| 25 |
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
| 26 |
|
| 27 |
-
tokenizer = AutoTokenizer.from_pretrained("leolee99/
|
| 28 |
-
model = AutoModelForSequenceClassification.from_pretrained("leolee99/
|
| 29 |
|
| 30 |
classifier = pipeline(
|
| 31 |
"text-classification",
|
|
@@ -59,10 +59,15 @@ We have released an online demo; you can access it [here](https://injecguard.github.io/).
|
|
| 59 |
If you find this work useful in your research or applications, we would appreciate it if you could kindly cite:
|
| 60 |
|
| 61 |
```
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
}
|
|
|
|
| 68 |
```
|
|
|
|
| 11 |
---
|
| 12 |
- Website: https://injecguard.github.io/
|
| 13 |
- Paper: https://arxiv.org/pdf/2410.22770
|
| 14 |
+
- Code Repo: https://github.com/leolee99/PIGuard
|
| 15 |
|
| 16 |
## Abstract
|
| 17 |
|
| 18 |
+
Prompt injection attacks pose a critical threat to large language models (LLMs), enabling goal hijacking and data leakage. Prompt guard models, though effective in defense, suffer from over-defense—falsely flagging benign inputs as malicious due to trigger word bias. To address this issue, we introduce ***NotInject***, an evaluation dataset that systematically measures over-defense across various prompt guard models. NotInject contains 339 benign samples enriched with trigger words common in prompt injection attacks, enabling fine-grained evaluation. Our results show that state-of-the-art models suffer from over-defense issues, with accuracy dropping close to random guessing levels (60\%). To mitigate this, we propose ***PIGuard***, a novel prompt guard model that incorporates a new training strategy, *Mitigating Over-defense for Free* (MOF), which significantly reduces the bias on trigger words. PIGuard demonstrates state-of-the-art performance on diverse benchmarks including NotInject, surpassing the existing best model by 30.8\%, offering a robust and open-source solution for detecting prompt injection attacks.
|
| 19 |
|
| 20 |
## How to Deploy
|
| 21 |
|
| 22 |
+
PIGuard can be easily deployed by executing:
|
| 23 |
|
| 24 |
```
|
| 25 |
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
| 26 |
|
| 27 |
+
tokenizer = AutoTokenizer.from_pretrained("leolee99/PIGuard")
|
| 28 |
+
model = AutoModelForSequenceClassification.from_pretrained("leolee99/PIGuard", trust_remote_code=True)
|
| 29 |
|
| 30 |
classifier = pipeline(
|
| 31 |
"text-classification",
|
|
|
|
| 59 |
If you find this work useful in your research or applications, we would appreciate it if you could kindly cite:
|
| 60 |
|
| 61 |
```
|
| 62 |
+
```
|
| 63 |
+
@article{PIGuard,
|
| 64 |
+
title={PIGuard: Prompt Injection Guardrail via Mitigating Overdefense for Free},
|
| 65 |
+
author={Hao Li and
|
| 66 |
+
Xiaogeng Liu and
|
| 67 |
+
Ning Zhang and
|
| 68 |
+
Chaowei Xiao},
|
| 69 |
+
journal = {ACL},
|
| 70 |
+
year={2025}
|
| 71 |
}
|
| 72 |
+
```
|
| 73 |
```
|
__pycache__/modeling_piguard.cpython-312.pyc
ADDED
|
Binary file (1.79 kB). View file
|
|
|
config.json
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
{
|
| 2 |
"architectures": [
|
| 3 |
-
"
|
| 4 |
],
|
| 5 |
"attention_probs_dropout_prob": 0.1,
|
| 6 |
"auto_map": {
|
| 7 |
-
"AutoConfig": "
|
| 8 |
-
"AutoModelForSequenceClassification": "
|
| 9 |
},
|
| 10 |
"hidden_act": "gelu",
|
| 11 |
"hidden_dropout_prob": 0.1,
|
|
@@ -23,7 +23,7 @@
|
|
| 23 |
"layer_norm_eps": 1e-07,
|
| 24 |
"max_position_embeddings": 512,
|
| 25 |
"max_relative_positions": -1,
|
| 26 |
-
"model_type": "
|
| 27 |
"norm_rel_ebd": "layer_norm",
|
| 28 |
"num_attention_heads": 12,
|
| 29 |
"num_hidden_layers": 12,
|
|
|
|
| 1 |
{
|
| 2 |
"architectures": [
|
| 3 |
+
"PIGuard"
|
| 4 |
],
|
| 5 |
"attention_probs_dropout_prob": 0.1,
|
| 6 |
"auto_map": {
|
| 7 |
+
"AutoConfig": "modeling_piguard.PIGuardConfig",
|
| 8 |
+
"AutoModelForSequenceClassification": "modeling_piguard.PIGuard"
|
| 9 |
},
|
| 10 |
"hidden_act": "gelu",
|
| 11 |
"hidden_dropout_prob": 0.1,
|
|
|
|
| 23 |
"layer_norm_eps": 1e-07,
|
| 24 |
"max_position_embeddings": 512,
|
| 25 |
"max_relative_positions": -1,
|
| 26 |
+
"model_type": "piguard",
|
| 27 |
"norm_rel_ebd": "layer_norm",
|
| 28 |
"num_attention_heads": 12,
|
| 29 |
"num_hidden_layers": 12,
|
inference_examples.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
import torch
|
| 2 |
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
| 3 |
|
| 4 |
-
tokenizer = AutoTokenizer.from_pretrained("leolee99/
|
| 5 |
-
model = AutoModelForSequenceClassification.from_pretrained("leolee99/
|
| 6 |
|
| 7 |
classifier = pipeline(
|
| 8 |
"text-classification",
|
|
|
|
| 1 |
import torch
|
| 2 |
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
| 3 |
|
| 4 |
+
tokenizer = AutoTokenizer.from_pretrained("leolee99/PIGuard")
|
| 5 |
+
model = AutoModelForSequenceClassification.from_pretrained("leolee99/PIGuard", trust_remote_code=True)
|
| 6 |
|
| 7 |
classifier = pipeline(
|
| 8 |
"text-classification",
|
load_model.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
import torch
|
| 2 |
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
| 3 |
|
| 4 |
-
tokenizer = AutoTokenizer.from_pretrained("leolee99/
|
| 5 |
-
model = AutoModelForSequenceClassification.from_pretrained("leolee99/
|
| 6 |
|
| 7 |
classifier = pipeline(
|
| 8 |
"text-classification",
|
|
|
|
| 1 |
import torch
|
| 2 |
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
| 3 |
|
| 4 |
+
tokenizer = AutoTokenizer.from_pretrained("leolee99/PIGuard")
|
| 5 |
+
model = AutoModelForSequenceClassification.from_pretrained("leolee99/PIGuard", trust_remote_code=True)
|
| 6 |
|
| 7 |
classifier = pipeline(
|
| 8 |
"text-classification",
|
modeling_injecguard.py → modeling_piguard.py
RENAMED
|
@@ -3,13 +3,13 @@ from transformers import DebertaV2ForSequenceClassification, DebertaV2Config
|
|
| 3 |
from transformers.modeling_outputs import SequenceClassifierOutput
|
| 4 |
import torch
|
| 5 |
|
| 6 |
-
class
|
| 7 |
-
model_type = "
|
| 8 |
|
| 9 |
-
|
| 10 |
|
| 11 |
-
class
|
| 12 |
-
config_class =
|
| 13 |
|
| 14 |
def __init__(self, config):
|
| 15 |
super().__init__(config)
|
|
@@ -26,4 +26,4 @@ class InjecGuard(DebertaV2ForSequenceClassification):
|
|
| 26 |
logits = self.classifier(pooled_output)
|
| 27 |
return SequenceClassifierOutput(logits=logits)
|
| 28 |
|
| 29 |
-
|
|
|
|
| 3 |
from transformers.modeling_outputs import SequenceClassifierOutput
|
| 4 |
import torch
|
| 5 |
|
| 6 |
+
class PIGuardConfig(DebertaV2Config):
|
| 7 |
+
model_type = "piguard"
|
| 8 |
|
| 9 |
+
PIGuardConfig.register_for_auto_class()
|
| 10 |
|
| 11 |
+
class PIGuard(DebertaV2ForSequenceClassification):
|
| 12 |
+
config_class = PIGuardConfig
|
| 13 |
|
| 14 |
def __init__(self, config):
|
| 15 |
super().__init__(config)
|
|
|
|
| 26 |
logits = self.classifier(pooled_output)
|
| 27 |
return SequenceClassifierOutput(logits=logits)
|
| 28 |
|
| 29 |
+
PIGuard.register_for_auto_class("AutoModelForSequenceClassification")
|
save_model.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
import torch
|
| 2 |
-
from
|
| 3 |
-
|
| 4 |
-
config =
|
| 5 |
config.num_labels = 2
|
| 6 |
|
| 7 |
-
model =
|
| 8 |
|
| 9 |
state_dict = torch.load("/home/hao/epoch_1_600_model.pth")
|
| 10 |
-
|
|
|
|
| 11 |
model.save_pretrained("saves")
|
|
|
|
| 1 |
import torch
|
| 2 |
+
# from safetensors.torch import load_file
|
| 3 |
+
from PIGuard.modeling_piguard import PIGuard, PIGuardConfig
|
| 4 |
+
config = PIGuardConfig.from_pretrained("microsoft/deberta-v3-base")
|
| 5 |
config.num_labels = 2
|
| 6 |
|
| 7 |
+
model = PIGuard(config)
|
| 8 |
|
| 9 |
state_dict = torch.load("/home/hao/epoch_1_600_model.pth")
|
| 10 |
+
# state_dict = load_file("model.safetensors")
|
| 11 |
+
#model.load_state_dict(state_dict, strict=False)
|
| 12 |
model.save_pretrained("saves")
|