leolee99 commited on
Commit
5b65d96
·
1 Parent(s): ec04dac

update piguard

Browse files
README.md CHANGED
@@ -11,21 +11,21 @@ library_name: transformers
11
  ---
12
  - Website: https://injecguard.github.io/
13
  - Paper: https://arxiv.org/pdf/2410.22770
14
- - Code Repo: https://github.com/leolee99/InjecGuard
15
 
16
  ## Abstract
17
 
18
- Prompt injection attacks pose a critical threat to large language models (LLMs), enabling goal hijacking and data leakage. Prompt guard models, though effective in defense, suffer from over-defense—falsely flagging benign inputs as malicious due to trigger word bias. To address this issue, we introduce ***NotInject***, an evaluation dataset that systematically measures over-defense across various prompt guard models. NotInject contains 339 benign samples enriched with trigger words common in prompt injection attacks, enabling fine-grained evaluation. Our results show that state-of-the-art models suffer from over-defense issues, with accuracy dropping close to random guessing levels (60\%). To mitigate this, we propose ***InjecGuard***, a novel prompt guard model that incorporates a new training strategy, *Mitigating Over-defense for Free* (MOF), which significantly reduces the bias on trigger words. InjecGuard demonstrates state-of-the-art performance on diverse benchmarks including NotInject, surpassing the existing best model by 30.8\%, offering a robust and open-source solution for detecting prompt injection attacks.
19
 
20
  ## How to Deploy
21
 
22
- InjecGuard can be easily deployed by excuting:
23
 
24
  ```
25
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
26
 
27
- tokenizer = AutoTokenizer.from_pretrained("leolee99/InjecGuard")
28
- model = AutoModelForSequenceClassification.from_pretrained("leolee99/InjecGuard", trust_remote_code=True)
29
 
30
  classifier = pipeline(
31
  "text-classification",
@@ -59,10 +59,15 @@ We have released an online demo, you can access it [here](InjecGuard.github.io).
59
  If you find this work useful in your research or applications, we appreciate that if you can kindly cite:
60
 
61
  ```
62
- @articles{InjecGuard,
63
- title={InjecGuard: Benchmarking and Mitigating Over-defense in Prompt Injection Guardrail Models},
64
- author={Hao Li and Xiaogeng Liu},
65
- journal = {arXiv preprint arXiv:2410.22770},
66
- year={2024}
 
 
 
 
67
  }
 
68
  ```
 
11
  ---
12
  - Website: https://injecguard.github.io/
13
  - Paper: https://arxiv.org/pdf/2410.22770
14
+ - Code Repo: https://github.com/leolee99/PIGuard
15
 
16
  ## Abstract
17
 
18
+ Prompt injection attacks pose a critical threat to large language models (LLMs), enabling goal hijacking and data leakage. Prompt guard models, though effective in defense, suffer from over-defense—falsely flagging benign inputs as malicious due to trigger word bias. To address this issue, we introduce ***NotInject***, an evaluation dataset that systematically measures over-defense across various prompt guard models. NotInject contains 339 benign samples enriched with trigger words common in prompt injection attacks, enabling fine-grained evaluation. Our results show that state-of-the-art models suffer from over-defense issues, with accuracy dropping close to random guessing levels (60\%). To mitigate this, we propose ***PIGuard***, a novel prompt guard model that incorporates a new training strategy, *Mitigating Over-defense for Free* (MOF), which significantly reduces the bias on trigger words. InjecGuard demonstrates state-of-the-art performance on diverse benchmarks including NotInject, surpassing the existing best model by 30.8\%, offering a robust and open-source solution for detecting prompt injection attacks.
19
 
20
  ## How to Deploy
21
 
22
+ PIGuard can be easily deployed by excuting:
23
 
24
  ```
25
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
26
 
27
+ tokenizer = AutoTokenizer.from_pretrained("leolee99/PIGuard")
28
+ model = AutoModelForSequenceClassification.from_pretrained("leolee99/PIGuard", trust_remote_code=True)
29
 
30
  classifier = pipeline(
31
  "text-classification",
 
59
  If you find this work useful in your research or applications, we appreciate that if you can kindly cite:
60
 
61
  ```
62
+ ```
63
+ @articles{PIGuard,
64
+ title={PIGuard: Prompt Injection Guardrail via Mitigating Overdefense for Free},
65
+ author={Hao Li and
66
+ Xiaogeng Liu and
67
+ Ning Zhang and
68
+ Chaowei Xiao},
69
+ journal = {ACL},
70
+ year={2025}
71
  }
72
+ ```
73
  ```
__pycache__/modeling_piguard.cpython-312.pyc ADDED
Binary file (1.79 kB). View file
 
config.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
  "architectures": [
3
- "InjecGuard"
4
  ],
5
  "attention_probs_dropout_prob": 0.1,
6
  "auto_map": {
7
- "AutoConfig": "modeling_injecguard.InjecGuardConfig",
8
- "AutoModelForSequenceClassification": "modeling_injecguard.InjecGuard"
9
  },
10
  "hidden_act": "gelu",
11
  "hidden_dropout_prob": 0.1,
@@ -23,7 +23,7 @@
23
  "layer_norm_eps": 1e-07,
24
  "max_position_embeddings": 512,
25
  "max_relative_positions": -1,
26
- "model_type": "injecguard",
27
  "norm_rel_ebd": "layer_norm",
28
  "num_attention_heads": 12,
29
  "num_hidden_layers": 12,
 
1
  {
2
  "architectures": [
3
+ "PIGuard"
4
  ],
5
  "attention_probs_dropout_prob": 0.1,
6
  "auto_map": {
7
+ "AutoConfig": "modeling_piguard.PIGuardConfig",
8
+ "AutoModelForSequenceClassification": "modeling_piguard.PIGuard"
9
  },
10
  "hidden_act": "gelu",
11
  "hidden_dropout_prob": 0.1,
 
23
  "layer_norm_eps": 1e-07,
24
  "max_position_embeddings": 512,
25
  "max_relative_positions": -1,
26
+ "model_type": "piguard",
27
  "norm_rel_ebd": "layer_norm",
28
  "num_attention_heads": 12,
29
  "num_hidden_layers": 12,
inference_examples.py CHANGED
@@ -1,8 +1,8 @@
1
  import torch
2
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
3
 
4
- tokenizer = AutoTokenizer.from_pretrained("leolee99/InjecGuard")
5
- model = AutoModelForSequenceClassification.from_pretrained("leolee99/InjecGuard", trust_remote_code=True)
6
 
7
  classifier = pipeline(
8
  "text-classification",
 
1
  import torch
2
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
3
 
4
+ tokenizer = AutoTokenizer.from_pretrained("leolee99/PIGuard")
5
+ model = AutoModelForSequenceClassification.from_pretrained("leolee99/PIGuard", trust_remote_code=True)
6
 
7
  classifier = pipeline(
8
  "text-classification",
load_model.py CHANGED
@@ -1,8 +1,8 @@
1
  import torch
2
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
3
 
4
- tokenizer = AutoTokenizer.from_pretrained("leolee99/InjecGuard")
5
- model = AutoModelForSequenceClassification.from_pretrained("leolee99/InjecGuard", trust_remote_code=True)
6
 
7
  classifier = pipeline(
8
  "text-classification",
 
1
  import torch
2
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
3
 
4
+ tokenizer = AutoTokenizer.from_pretrained("leolee99/PIGuard")
5
+ model = AutoModelForSequenceClassification.from_pretrained("leolee99/PIGuard", trust_remote_code=True)
6
 
7
  classifier = pipeline(
8
  "text-classification",
modeling_injecguard.py → modeling_piguard.py RENAMED
@@ -3,13 +3,13 @@ from transformers import DebertaV2ForSequenceClassification, DebertaV2Config
3
  from transformers.modeling_outputs import SequenceClassifierOutput
4
  import torch
5
 
6
- class InjecGuardConfig(DebertaV2Config):
7
- model_type = "injecguard"
8
 
9
- InjecGuardConfig.register_for_auto_class()
10
 
11
- class InjecGuard(DebertaV2ForSequenceClassification):
12
- config_class = InjecGuardConfig
13
 
14
  def __init__(self, config):
15
  super().__init__(config)
@@ -26,4 +26,4 @@ class InjecGuard(DebertaV2ForSequenceClassification):
26
  logits = self.classifier(pooled_output)
27
  return SequenceClassifierOutput(logits=logits)
28
 
29
- InjecGuard.register_for_auto_class("AutoModelForSequenceClassification")
 
3
  from transformers.modeling_outputs import SequenceClassifierOutput
4
  import torch
5
 
6
+ class PIGuardConfig(DebertaV2Config):
7
+ model_type = "piguard"
8
 
9
+ PIGuardConfig.register_for_auto_class()
10
 
11
+ class PIGuard(DebertaV2ForSequenceClassification):
12
+ config_class = PIGuardConfig
13
 
14
  def __init__(self, config):
15
  super().__init__(config)
 
26
  logits = self.classifier(pooled_output)
27
  return SequenceClassifierOutput(logits=logits)
28
 
29
+ PIGuard.register_for_auto_class("AutoModelForSequenceClassification")
save_model.py CHANGED
@@ -1,11 +1,12 @@
1
  import torch
2
- from modeling_injecguard import InjecGuard, InjecGuardConfig
3
-
4
- config = InjecGuardConfig.from_pretrained("microsoft/deberta-v3-base")
5
  config.num_labels = 2
6
 
7
- model = InjecGuard(config)
8
 
9
  state_dict = torch.load("/home/hao/epoch_1_600_model.pth")
10
- model.load_state_dict(state_dict, strict=False)
 
11
  model.save_pretrained("saves")
 
1
  import torch
2
+ # from safetensors.torch import load_file
3
+ from PIGuard.modeling_piguard import PIGuard, PIGuardConfig
4
+ config = PIGuardConfig.from_pretrained("microsoft/deberta-v3-base")
5
  config.num_labels = 2
6
 
7
+ model = PIGuard(config)
8
 
9
  state_dict = torch.load("/home/hao/epoch_1_600_model.pth")
10
+ # state_dict = load_file("model.safetensors")
11
+ #model.load_state_dict(state_dict, strict=False)
12
  model.save_pretrained("saves")