Commit b0c3821
1 Parent(s): 41aa30f

first app - tokenization works
Files changed:

- Dockerfile +12 -25
- app.py +0 -55
- llama3-tokenizer-js +1 -0
- run.py +12 -0
- sandbox.ipynb +32 -2
- templates/index.html +0 -44
- static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/02ee80b6196926a5ad790a004d9efd6ab1ba6542.lock → tests/__init__.py +0 -0
- static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/4ff488a165e900e5129cda7c20ab32d568d2a475.lock → wm_detector/__init__.py +0 -0
- static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8.lock → wm_detector/core/__init__.py +0 -0
- {src → wm_detector/core}/detector.py +28 -1
- static/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/.no_exist/9213176726f574b556790deb65791e0c5aa438b6/added_tokens.json → wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/02ee80b6196926a5ad790a004d9efd6ab1ba6542.lock +0 -0
- templates/styles.css → wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/4ff488a165e900e5129cda7c20ab32d568d2a475.lock +0 -0
- wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8.lock +0 -0
- wm_detector/static/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/.no_exist/9213176726f574b556790deb65791e0c5aa438b6/added_tokens.json +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/02ee80b6196926a5ad790a004d9efd6ab1ba6542 +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/4ff488a165e900e5129cda7c20ab32d568d2a475 +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8 +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/refs/main +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json +0 -0
- wm_detector/static/styles.css +68 -0
- wm_detector/templates/index.html +66 -0
- wm_detector/web/__init__.py +0 -0
- wm_detector/web/app.py +50 -0
- wm_detector/web/utils.py +19 -0
Dockerfile
CHANGED
@@ -1,37 +1,24 @@
 FROM python:3.9-slim
 
-# WORKDIR /app
-
-# COPY requirements.txt .
-# RUN pip install --no-cache-dir -r requirements.txt
-
-# COPY . .
-
-# EXPOSE 8080
-
-# CMD ["python", "app.py"]
-
-
-FROM python:3.9-slim
-
-# Set the working directory inside the container
 WORKDIR /app
 
-#
+# Copy only the requirements first to leverage Docker cache
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Copy the rest of the application
-COPY
+COPY wm_detector/ ./wm_detector/
+COPY run.py .
 
-#
-
-#
-
-#
-
+# Create necessary directories
+RUN mkdir -p wm_detector/static/hf_cache
+
+# Set environment variables
+ENV PYTHONPATH=/app
+ENV FLASK_APP=run.py
+
+# Expose the port the app runs on
+EXPOSE 7860
 
 # Command to run the application
-CMD ["python", "
+CMD ["python", "run.py"]
app.py
DELETED
@@ -1,55 +0,0 @@
-"""
-docker build -t wm-detector .
-docker run -p 7860:7860 -v $(pwd)/data:/app/data wm-detector
-"""
-
-from flask import Flask, render_template, request
-import torch
-import numpy as np
-from src.detector import MarylandDetector, AutoTokenizer
-
-app = Flask(__name__)
-
-# Minimal setup: pick a detector (example: MarylandDetector)
-model_id = "meta-llama/Llama-3.2-1B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="static/hf_cache")
-detector = MarylandDetector(tokenizer=tokenizer)
-
-def tokenize_text(text):
-    return tokenizer.encode(text, add_special_tokens=False)
-
-def compute_scores(tokens):
-    # Convert tokens to the detector's format
-    score_list = []
-    for i in range(len(tokens)):
-        if i < detector.ngram:
-            score_list.append(0)
-            continue
-        ngram_tokens = tokens[i-detector.ngram:i]
-        curr_score = detector.score_tok(ngram_tokens, tokens[i]).sum().item()
-        score_list.append(curr_score)
-    # Compute final p-value (example uses sum of scores)
-    final_pvalue = detector.get_pvalue(sum(score_list), len(score_list), 1e-10)
-    return score_list, final_pvalue
-
-@app.route("/", methods=["GET", "POST"])
-def index():
-    tokens, colors, pvalue = [], [], None
-    if request.method == "POST":
-        user_text = request.form.get("user_text", "")
-        tokens = tokenize_text(user_text)
-        score_list, pvalue = compute_scores(tokens)
-        # Convert token IDs to text
-        displayed_tokens = tokenizer.convert_ids_to_tokens(tokens)
-        # Assign a simple color scale based on score
-        max_score = max(score_list) if score_list else 1
-        colors = [f"rgba(255, 0, 0, {s/max_score})" if max_score != 0 else "white"
-                  for s in score_list]
-        return render_template("index.html",
-                               tokens=displayed_tokens,
-                               colors=colors,
-                               pvalue=pvalue)
-    return render_template("index.html", tokens=tokens, colors=colors, pvalue=pvalue)
-
-if __name__ == "__main__":
-    app.run(host='0.0.0.0', port=7860)
llama3-tokenizer-js
ADDED
@@ -0,0 +1 @@
+Subproject commit 064ddf3177369cd82639b43d160ebc83b4c1d362
run.py
ADDED
@@ -0,0 +1,12 @@
+"""
+Main entry point for the watermark detection application.
+Run with: python run.py
+
+docker build -t wm-detector .
+docker run -p 7860:7860 wm-detector
+"""
+
+from wm_detector.web.app import app
+
+if __name__ == "__main__":
+    app.run(host='0.0.0.0', port=7860)
sandbox.ipynb
CHANGED
@@ -9,14 +9,44 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
     "from transformers import AutoTokenizer, LlamaForCausalLM\n",
     "\n",
     "model_id = \"meta-llama/Llama-3.2-1B-Instruct\"\n",
-    "tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=\"static/hf_cache\")"
+    "tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=\"wm_detector/static/hf_cache\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[4438, 311, 1304, 264, 19692]\n",
+      "['How', 'Ġto', 'Ġmake', 'Ġa', 'Ġcake']\n",
+      "['How', ' to', ' make', ' a', ' cake']\n"
+     ]
+    }
+   ],
+   "source": [
+    "def tokenize_text(text):\n",
+    "    return tokenizer.encode(text, add_special_tokens=False)\n",
+    "\n",
+    "text = \"How to make a cake\"\n",
+    "token_ids = tokenize_text(text)\n",
+    "tokens = tokenizer.convert_ids_to_tokens(token_ids)\n",
+    "token_strs = [tokenizer.convert_tokens_to_string([token]) for token in tokens]\n",
+    "decoded = tokenizer.decode(tokenize_text(text))\n",
+    "\n",
+    "print(token_ids)\n",
+    "print(tokens)\n",
+    "print(token_strs)"
    ]
   },
   {
templates/index.html
DELETED
@@ -1,44 +0,0 @@
-<!DOCTYPE html>
-<html>
-<head>
-    <title>Reward Simulator</title>
-    <meta name="viewport" content="width=device-width, initial-scale=1">
-    <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
-    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/font/bootstrap-icons.css">
-    <link rel="stylesheet" href="templates/style.css">
-</head>
-<body>
-    <!-- Form to input text and display tokens with color highlights and final p-value -->
-    <form method="POST" action="/">
-        <div class="mb-3">
-            <label for="user_text" class="form-label">Enter text to detect watermark:</label>
-            <textarea class="form-control" id="user_text" name="user_text" rows="5"
-                      placeholder="Paste or write your text here..."></textarea>
-        </div>
-        <button type="submit" class="btn btn-primary">Detect</button>
-    </form>
-
-    <!-- Display tokens with color highlighting -->
-    <div class="mt-3">
-        {% if tokens %}
-        <p>
-            {% for i in range(tokens|length) %}
-            <span style="background-color: {{ colors[i] }}">{{ tokens[i] }}</span>
-            {% endfor %}
-        </p>
-        <div class="alert alert-info">Final p-value: {{ pvalue }}</div>
-        {% endif %}
-    </div>
-
-    <!-- Footer Section -->
-    <div class="row mt-5">
-        <div class="col-12">
-            <footer>
-                <p class="text-center text-muted">
-                    © 2025 Interactive Text Watermark Detection. Apache 2.0.
-                </p>
-            </footer>
-        </div>
-    </div>
-</body>
-</html>
static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/02ee80b6196926a5ad790a004d9efd6ab1ba6542.lock → tests/__init__.py
RENAMED
File without changes

static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/4ff488a165e900e5129cda7c20ab32d568d2a475.lock → wm_detector/__init__.py
RENAMED
File without changes

static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8.lock → wm_detector/core/__init__.py
RENAMED
File without changes
{src → wm_detector/core}/detector.py
RENAMED
@@ -4,7 +4,7 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import List, Dict, Callable
+from typing import List, Dict, Callable, Tuple
 
 import numpy as np
 from scipy import special
@@ -13,6 +13,8 @@ from scipy.optimize import fminbound
 import torch
 from transformers import AutoTokenizer, LlamaForCausalLM
 
+import random
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 class WmDetector():
@@ -143,6 +145,31 @@ class WmDetector():
         """ compute the p-value for a couple of score and number of tokens """
         raise NotImplementedError
 
+    def tokenize_text(self, text: str) -> Tuple[List[str], List[str]]:
+        """
+        Tokenize text and return tokens with their display colors.
+        Returns:
+            tuple: (tokens, colors)
+        """
+        # Tokenize the text
+        token_ids = self.tokenizer.encode(text, add_special_tokens=False)
+        # Convert ids back to tokens for display
+        tokens = [self.tokenizer.decode([id]) for id in token_ids]
+        # Generate colors for visualization
+        colors = self.generate_token_colors(tokens)
+
+        return tokens, colors
+
+    def generate_token_colors(self, tokens: List[str]) -> List[str]:
+        """Generate pastel colors for tokens."""
+        def generate_pastel_color():
+            # Generate lighter/pastel colors
+            h = random.random()  # Random hue
+            s = 0.3 + random.random() * 0.2  # Saturation between 0.3-0.5
+            l = 0.8 + random.random() * 0.1  # Lightness between 0.8-0.9
+            return f"hsl({h*360}, {s*100}%, {l*100}%)"
+
+        return [generate_pastel_color() for _ in tokens]
 
 class MarylandDetector(WmDetector):
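Note: a minimal sketch of how the new tokenize_text / generate_token_colors helpers can be exercised, mirroring the detector setup used in wm_detector/web/app.py. The model ID and cache directory come from this commit; access to the gated Llama model and an installed transformers package are assumed.

    # Sketch only: mirrors the detector setup used elsewhere in this commit.
    from transformers import AutoTokenizer
    from wm_detector.core.detector import MarylandDetector

    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct",
        cache_dir="wm_detector/static/hf_cache",
    )
    detector = MarylandDetector(tokenizer=tokenizer)

    # tokenize_text returns one decoded string per token plus a random
    # pastel hsl(...) color for each, ready for the web UI to render.
    tokens, colors = detector.tokenize_text("How to make a cake")
    for token, color in zip(tokens, colors):
        print(f"{token!r:12} {color}")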
static/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/.no_exist/9213176726f574b556790deb65791e0c5aa438b6/added_tokens.json → wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/02ee80b6196926a5ad790a004d9efd6ab1ba6542.lock
RENAMED
File without changes

templates/styles.css → wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/4ff488a165e900e5129cda7c20ab32d568d2a475.lock
RENAMED
File without changes

wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8.lock
ADDED
File without changes

wm_detector/static/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/.no_exist/9213176726f574b556790deb65791e0c5aa438b6/added_tokens.json
ADDED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/02ee80b6196926a5ad790a004d9efd6ab1ba6542
RENAMED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/4ff488a165e900e5129cda7c20ab32d568d2a475
RENAMED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8
RENAMED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/refs/main
RENAMED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json
RENAMED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json
RENAMED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json
RENAMED
File without changes
wm_detector/static/styles.css
ADDED
@@ -0,0 +1,68 @@
+body {
+    background-color: #f0f0f0;
+    color: #333;
+    font-family: helvetica, sans-serif;
+    line-height: 1.5;
+    padding: 20px;
+}
+
+.container {
+    background-color: #fff;
+    border-radius: 5px;
+    padding: 20px;
+    box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12), 0 1px 2px rgba(0, 0, 0, 0.24);
+    max-width: 1200px;
+    margin: 0 auto;
+}
+
+h1 {
+    font-size: 24px;
+    margin-bottom: 20px;
+}
+
+.input-section textarea {
+    width: 100%;
+    padding: 10px;
+    border: 1px solid #ccc;
+    border-radius: 4px;
+    resize: none;
+    font-size: 14px;
+    line-height: 1.5;
+    height: 200px;
+}
+
+.token-display {
+    margin: 20px 0;
+    padding: 10px;
+    border: 1px solid #ccc;
+    border-radius: 4px;
+    background: #f8f9fa;
+    min-height: 100px;
+    font-size: 14px;
+    line-height: 1.5;
+}
+
+.token {
+    display: inline-block;
+    padding: 2px 4px;
+    margin: 2px;
+    border-radius: 3px;
+    font-family: monospace;
+}
+
+.stats-container {
+    margin-top: 20px;
+    display: flex;
+    gap: 30px;
+}
+
+.stat-value {
+    font-size: 32px;
+    font-weight: 500;
+    color: #333;
+}
+
+.stat-label {
+    color: #666;
+    font-size: 20px;
+}
wm_detector/templates/index.html
ADDED
@@ -0,0 +1,66 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Watermark Detector</title>
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
+    <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
+</head>
+<body>
+    <div class="container">
+        <h1>Interactive watermark detector</h1>
+
+        <!-- Input Form -->
+        <div class="input-section">
+            <textarea id="user_text"
+                placeholder="Replace this text in the input field to see how watermark detection works."></textarea>
+        </div>
+
+        <!-- Token Display -->
+        <div class="token-display" id="tokenDisplay"></div>
+
+        <!-- Statistics -->
+        <div class="stats-container">
+            <div>
+                <div class="stat-value" id="tokenCount">0</div>
+                <div class="stat-label">Tokens</div>
+            </div>
+        </div>
+    </div>
+
+    <script>
+        let debounceTimeout;
+        const textarea = document.getElementById('user_text');
+        const tokenDisplay = document.getElementById('tokenDisplay');
+        const tokenCount = document.getElementById('tokenCount');
+
+        async function updateTokenization() {
+            const text = textarea.value;
+            const response = await fetch('/tokenize', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json',
+                },
+                body: JSON.stringify({ text: text })
+            });
+            const data = await response.json();
+
+            // Update token display
+            tokenDisplay.innerHTML = data.tokens.map((token, i) =>
+                `<span class="token" style="background-color: ${data.colors[i]}">${token}</span>`
+            ).join('');
+
+            // Update counts
+            tokenCount.textContent = data.token_count;
+        }
+
+        textarea.addEventListener('input', function() {
+            clearTimeout(debounceTimeout);
+            debounceTimeout = setTimeout(updateTokenization, 300);
+        });
+
+        // Initial tokenization
+        updateTokenization();
+    </script>
+</body>
+</html>
wm_detector/web/__init__.py
ADDED
File without changes
wm_detector/web/app.py
ADDED
@@ -0,0 +1,50 @@
+"""
+Main Flask application for the watermark detection web interface.
+"""
+
+from flask import Flask, render_template, request, jsonify
+from ..core.detector import MarylandDetector, AutoTokenizer
+from .utils import tokenize_text
+
+def create_app():
+    app = Flask(__name__,
+                static_folder='../static',
+                template_folder='../templates')
+
+    # Add zip to Jinja's global context
+    app.jinja_env.globals.update(zip=zip)
+
+    # Minimal setup: pick a detector (example: MarylandDetector)
+    model_id = "meta-llama/Llama-3.2-1B-Instruct"
+    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="wm_detector/static/hf_cache")
+    detector = MarylandDetector(tokenizer=tokenizer)
+
+    @app.route("/", methods=["GET"])
+    def index():
+        return render_template("index.html")
+
+    @app.route("/tokenize", methods=["POST"])
+    def tokenize():
+        data = request.get_json()
+        text = data.get('text', '')
+
+        if text:
+            tokens, colors, token_ids = tokenize_text(text, tokenizer)
+            return jsonify({
+                'tokens': tokens,
+                'colors': colors,
+                'token_count': len(tokens)
+            })
+
+        return jsonify({
+            'tokens': [],
+            'colors': [],
+            'token_count': 0
+        })
+
+    return app
+
+app = create_app()
+
+if __name__ == "__main__":
+    app.run(host='0.0.0.0', port=7860)
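Note: the new /tokenize endpoint can also be exercised outside the browser. A quick sketch with the requests library, assuming the app is running locally via python run.py on port 7860:

    # Sketch: query the /tokenize endpoint added in this commit.
    import requests

    resp = requests.post(
        "http://localhost:7860/tokenize",
        json={"text": "How to make a cake"},
    )
    data = resp.json()
    print(data["token_count"])  # number of tokens, e.g. 5
    print(data["tokens"])       # per-token display strings
    print(data["colors"])       # one pastel hsl(...) color per token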
wm_detector/web/utils.py
ADDED
@@ -0,0 +1,19 @@
+import random
+from ..core.detector import AutoTokenizer
+
+def generate_pastel_color():
+    """Generate a pastel color in HSL format."""
+    h = random.random()  # Random hue
+    s = 0.3 + random.random() * 0.2  # Saturation between 0.3-0.5
+    l = 0.8 + random.random() * 0.1  # Lightness between 0.8-0.9
+    return f"hsl({h*360}, {s*100}%, {l*100}%)"
+
+def tokenize_text(text, tokenizer):
+    """Tokenize text and return tokens with display info."""
+    token_ids = tokenizer.encode(text, add_special_tokens=False)
+    # Convert ids to displayable tokens
+    tokens = tokenizer.convert_ids_to_tokens(token_ids)
+    tokens_str = [tokenizer.convert_tokens_to_string([token]) for token in tokens]
+    # Generate pastel colors for each token
+    colors = [generate_pastel_color() for _ in tokens]
+    return tokens_str, colors, token_ids
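Note: a quick sketch of the round trip tokenize_text performs, matching the notebook output above (convert_ids_to_tokens yields Ġ-prefixed BPE tokens, while convert_tokens_to_string restores readable spacing). The tokenizer setup is the same gated-model assumption as before.

    # Sketch: the id -> token -> display-string round trip used by /tokenize.
    from transformers import AutoTokenizer
    from wm_detector.web.utils import tokenize_text

    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct",
        cache_dir="wm_detector/static/hf_cache",
    )
    tokens_str, colors, token_ids = tokenize_text("How to make a cake", tokenizer)
    print(token_ids)   # [4438, 311, 1304, 264, 19692]
    print(tokens_str)  # ['How', ' to', ' make', ' a', ' cake']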