update piguard

Browse files

Files changed (7) hide show

README.md +15 -10
__pycache__/modeling_piguard.cpython-312.pyc +0 -0
config.json +4 -4
inference_examples.py +2 -2
load_model.py +2 -2
modeling_injecguard.py → modeling_piguard.py +6 -6
save_model.py +6 -5

README.md CHANGED Viewed

@@ -11,21 +11,21 @@ library_name: transformers
 ---
 - Website: https://injecguard.github.io/
 - Paper: https://arxiv.org/pdf/2410.22770
-- Code Repo: https://github.com/leolee99/InjecGuard
 ## Abstract
-Prompt injection attacks pose a critical threat to large language models (LLMs), enabling goal hijacking and data leakage. Prompt guard models, though effective in defense, suffer from over-defense—falsely flagging benign inputs as malicious due to trigger word bias. To address this issue, we introduce ***NotInject***, an evaluation dataset that systematically measures over-defense across various prompt guard models. NotInject contains 339 benign samples enriched with trigger words common in prompt injection attacks, enabling fine-grained evaluation. Our results show that state-of-the-art models suffer from over-defense issues, with accuracy dropping close to random guessing levels (60\%). To mitigate this, we propose ***InjecGuard***, a novel prompt guard model that incorporates a new training strategy, *Mitigating Over-defense for Free* (MOF), which significantly reduces the bias on trigger words. InjecGuard demonstrates state-of-the-art performance on diverse benchmarks including NotInject, surpassing the existing best model by 30.8\%, offering a robust and open-source solution for detecting prompt injection attacks.
 ## How to Deploy
-InjecGuard can be easily deployed by excuting:
 ```
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
-tokenizer = AutoTokenizer.from_pretrained("leolee99/InjecGuard")
-model = AutoModelForSequenceClassification.from_pretrained("leolee99/InjecGuard", trust_remote_code=True)
 classifier = pipeline(
 "text-classification",
@@ -59,10 +59,15 @@ We have released an online demo, you can access it [here](InjecGuard.github.io).
 If you find this work useful in your research or applications, we appreciate that if you can kindly cite:
 ```
-@articles{InjecGuard,
-  title={InjecGuard: Benchmarking and Mitigating Over-defense in Prompt Injection Guardrail Models},
-  author={Hao Li and Xiaogeng Liu},
-  journal = {arXiv preprint arXiv:2410.22770},
-  year={2024}
 }
 ```

 ---
 - Website: https://injecguard.github.io/
 - Paper: https://arxiv.org/pdf/2410.22770
+- Code Repo: https://github.com/leolee99/PIGuard
 ## Abstract
+Prompt injection attacks pose a critical threat to large language models (LLMs), enabling goal hijacking and data leakage. Prompt guard models, though effective in defense, suffer from over-defense—falsely flagging benign inputs as malicious due to trigger word bias. To address this issue, we introduce ***NotInject***, an evaluation dataset that systematically measures over-defense across various prompt guard models. NotInject contains 339 benign samples enriched with trigger words common in prompt injection attacks, enabling fine-grained evaluation. Our results show that state-of-the-art models suffer from over-defense issues, with accuracy dropping close to random guessing levels (60\%). To mitigate this, we propose ***PIGuard***, a novel prompt guard model that incorporates a new training strategy, *Mitigating Over-defense for Free* (MOF), which significantly reduces the bias on trigger words. PIGuard demonstrates state-of-the-art performance on diverse benchmarks including NotInject, surpassing the existing best model by 30.8\%, offering a robust and open-source solution for detecting prompt injection attacks.
 ## How to Deploy
+PIGuard can be easily deployed by executing:
 ```
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+tokenizer = AutoTokenizer.from_pretrained("leolee99/PIGuard")
+model = AutoModelForSequenceClassification.from_pretrained("leolee99/PIGuard", trust_remote_code=True)
 classifier = pipeline(
 "text-classification",
 If you find this work useful in your research or applications, we would appreciate it if you could kindly cite:
 ```
+```
+@article{PIGuard,
+  title={PIGuard: Prompt Injection Guardrail via Mitigating Overdefense for Free},
+  author={Hao Li and
+        Xiaogeng Liu and
+        Ning Zhang and
+        Chaowei Xiao},
+  journal = {ACL},
+  year={2025}
 }
+```
 ```

__pycache__/modeling_piguard.cpython-312.pyc ADDED Viewed

Binary file (1.79 kB). View file

config.json CHANGED Viewed

@@ -1,11 +1,11 @@
 {
   "architectures": [
-    "InjecGuard"
   ],
   "attention_probs_dropout_prob": 0.1,
   "auto_map": {
-    "AutoConfig": "modeling_injecguard.InjecGuardConfig",
-    "AutoModelForSequenceClassification": "modeling_injecguard.InjecGuard"
   },
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
@@ -23,7 +23,7 @@
   "layer_norm_eps": 1e-07,
   "max_position_embeddings": 512,
   "max_relative_positions": -1,
-  "model_type": "injecguard",
   "norm_rel_ebd": "layer_norm",
   "num_attention_heads": 12,
   "num_hidden_layers": 12,

 {
   "architectures": [
+    "PIGuard"
   ],
   "attention_probs_dropout_prob": 0.1,
   "auto_map": {
+    "AutoConfig": "modeling_piguard.PIGuardConfig",
+    "AutoModelForSequenceClassification": "modeling_piguard.PIGuard"
   },
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "layer_norm_eps": 1e-07,
   "max_position_embeddings": 512,
   "max_relative_positions": -1,
+  "model_type": "piguard",
   "norm_rel_ebd": "layer_norm",
   "num_attention_heads": 12,
   "num_hidden_layers": 12,

inference_examples.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
-tokenizer = AutoTokenizer.from_pretrained("leolee99/InjecGuard")
-model = AutoModelForSequenceClassification.from_pretrained("leolee99/InjecGuard", trust_remote_code=True)
 classifier = pipeline(
 "text-classification",

 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+tokenizer = AutoTokenizer.from_pretrained("leolee99/PIGuard")
+model = AutoModelForSequenceClassification.from_pretrained("leolee99/PIGuard", trust_remote_code=True)
 classifier = pipeline(
 "text-classification",

load_model.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
-tokenizer = AutoTokenizer.from_pretrained("leolee99/InjecGuard")
-model = AutoModelForSequenceClassification.from_pretrained("leolee99/InjecGuard", trust_remote_code=True)
 classifier = pipeline(
 "text-classification",

 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+tokenizer = AutoTokenizer.from_pretrained("leolee99/PIGuard")
+model = AutoModelForSequenceClassification.from_pretrained("leolee99/PIGuard", trust_remote_code=True)
 classifier = pipeline(
 "text-classification",

modeling_injecguard.py → modeling_piguard.py RENAMED Viewed

@@ -3,13 +3,13 @@ from transformers import DebertaV2ForSequenceClassification, DebertaV2Config
 from transformers.modeling_outputs import SequenceClassifierOutput
 import torch
-class InjecGuardConfig(DebertaV2Config):
-    model_type = "injecguard"
-InjecGuardConfig.register_for_auto_class()
-class InjecGuard(DebertaV2ForSequenceClassification):
-    config_class = InjecGuardConfig
     def __init__(self, config):
         super().__init__(config)
@@ -26,4 +26,4 @@ class InjecGuard(DebertaV2ForSequenceClassification):
         logits = self.classifier(pooled_output)
         return SequenceClassifierOutput(logits=logits)
-InjecGuard.register_for_auto_class("AutoModelForSequenceClassification")

 from transformers.modeling_outputs import SequenceClassifierOutput
 import torch
+class PIGuardConfig(DebertaV2Config):
+    model_type = "piguard"
+PIGuardConfig.register_for_auto_class()
+class PIGuard(DebertaV2ForSequenceClassification):
+    config_class = PIGuardConfig
     def __init__(self, config):
         super().__init__(config)
         logits = self.classifier(pooled_output)
         return SequenceClassifierOutput(logits=logits)
+PIGuard.register_for_auto_class("AutoModelForSequenceClassification")

save_model.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import torch
-from modeling_injecguard import InjecGuard, InjecGuardConfig
-config = InjecGuardConfig.from_pretrained("microsoft/deberta-v3-base")
 config.num_labels = 2
-model = InjecGuard(config)
 state_dict = torch.load("/home/hao/epoch_1_600_model.pth")
-model.load_state_dict(state_dict, strict=False)
 model.save_pretrained("saves")

 import torch
+# from safetensors.torch import load_file
+from PIGuard.modeling_piguard import PIGuard, PIGuardConfig
+config = PIGuardConfig.from_pretrained("microsoft/deberta-v3-base")
 config.num_labels = 2
+model = PIGuard(config)
 state_dict = torch.load("/home/hao/epoch_1_600_model.pth")
+# state_dict = load_file("model.safetensors")
+#model.load_state_dict(state_dict, strict=False)
 model.save_pretrained("saves")