TheFrenchDemos committed on
Commit b0c3821 · 1 Parent(s): 41aa30f

first app - tokenization works

Files changed (26)
  1. Dockerfile +12 -25
  2. app.py +0 -55
  3. llama3-tokenizer-js +1 -0
  4. run.py +12 -0
  5. sandbox.ipynb +32 -2
  6. templates/index.html +0 -44
  7. static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/02ee80b6196926a5ad790a004d9efd6ab1ba6542.lock β†’ tests/__init__.py +0 -0
  8. static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/4ff488a165e900e5129cda7c20ab32d568d2a475.lock β†’ wm_detector/__init__.py +0 -0
  9. static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8.lock β†’ wm_detector/core/__init__.py +0 -0
  10. {src β†’ wm_detector/core}/detector.py +28 -1
  11. static/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/.no_exist/9213176726f574b556790deb65791e0c5aa438b6/added_tokens.json β†’ wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/02ee80b6196926a5ad790a004d9efd6ab1ba6542.lock +0 -0
  12. templates/styles.css β†’ wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/4ff488a165e900e5129cda7c20ab32d568d2a475.lock +0 -0
  13. wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8.lock +0 -0
  14. wm_detector/static/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/.no_exist/9213176726f574b556790deb65791e0c5aa438b6/added_tokens.json +0 -0
  15. {static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/02ee80b6196926a5ad790a004d9efd6ab1ba6542 +0 -0
  16. {static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/4ff488a165e900e5129cda7c20ab32d568d2a475 +0 -0
  17. {static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8 +0 -0
  18. {static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/refs/main +0 -0
  19. {static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json +0 -0
  20. {static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json +0 -0
  21. {static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json +0 -0
  22. wm_detector/static/styles.css +68 -0
  23. wm_detector/templates/index.html +66 -0
  24. wm_detector/web/__init__.py +0 -0
  25. wm_detector/web/app.py +50 -0
  26. wm_detector/web/utils.py +19 -0
Dockerfile CHANGED
@@ -1,37 +1,24 @@
  FROM python:3.9-slim
 
- # WORKDIR /app
-
- # COPY requirements.txt .
- # RUN pip install --no-cache-dir -r requirements.txt
-
- # COPY . .
-
- # EXPOSE 8080
-
- # CMD ["python", "app.py"]
-
-
- FROM python:3.9-slim
-
- # Set the working directory inside the container
  WORKDIR /app
 
- # Install dependencies
+ # Copy only the requirements first to leverage Docker cache
  COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt
 
- # Copy the rest of the application code
- COPY . .
+ # Copy the rest of the application
+ COPY wm_detector/ ./wm_detector/
+ COPY run.py .
 
- # Expose the port your React app runs on
- EXPOSE 7860
+ # Create necessary directories
+ RUN mkdir -p wm_detector/static/hf_cache
 
- # Add ownership to the user
- RUN chown -R 1001:1001 /app
+ # Set environment variables
+ ENV PYTHONPATH=/app
+ ENV FLASK_APP=run.py
 
- # Change to the created user
- USER 1001
+ # Expose the port the app runs on
+ EXPOSE 7860
 
  # Command to run the application
- CMD ["python", "app.py"]
+ CMD ["python", "run.py"]
app.py DELETED
@@ -1,55 +0,0 @@
- """
- docker build -t wm-detector .
- docker run -p 7860:7860 -v $(pwd)/data:/app/data wm-detector
- """
-
- from flask import Flask, render_template, request
- import torch
- import numpy as np
- from src.detector import MarylandDetector, AutoTokenizer
-
- app = Flask(__name__)
-
- # Minimal setup: pick a detector (example: MarylandDetector)
- model_id = "meta-llama/Llama-3.2-1B-Instruct"
- tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="static/hf_cache")
- detector = MarylandDetector(tokenizer=tokenizer)
-
- def tokenize_text(text):
-     return tokenizer.encode(text, add_special_tokens=False)
-
- def compute_scores(tokens):
-     # Convert tokens to the detector's format
-     score_list = []
-     for i in range(len(tokens)):
-         if i < detector.ngram:
-             score_list.append(0)
-             continue
-         ngram_tokens = tokens[i-detector.ngram:i]
-         curr_score = detector.score_tok(ngram_tokens, tokens[i]).sum().item()
-         score_list.append(curr_score)
-     # Compute final p-value (example uses sum of scores)
-     final_pvalue = detector.get_pvalue(sum(score_list), len(score_list), 1e-10)
-     return score_list, final_pvalue
-
- @app.route("/", methods=["GET", "POST"])
- def index():
-     tokens, colors, pvalue = [], [], None
-     if request.method == "POST":
-         user_text = request.form.get("user_text", "")
-         tokens = tokenize_text(user_text)
-         score_list, pvalue = compute_scores(tokens)
-         # Convert token IDs to text
-         displayed_tokens = tokenizer.convert_ids_to_tokens(tokens)
-         # Assign a simple color scale based on score
-         max_score = max(score_list) if score_list else 1
-         colors = [f"rgba(255, 0, 0, {s/max_score})" if max_score!=0 else "white"
-                   for s in score_list]
-         return render_template("index.html",
-                                tokens=displayed_tokens,
-                                colors=colors,
-                                pvalue=pvalue)
-     return render_template("index.html", tokens=tokens, colors=colors, pvalue=pvalue)
-
- if __name__ == "__main__":
-     app.run(host='0.0.0.0', port=7860)
llama3-tokenizer-js ADDED
@@ -0,0 +1 @@
+ Subproject commit 064ddf3177369cd82639b43d160ebc83b4c1d362
run.py ADDED
@@ -0,0 +1,12 @@
+ """
+ Main entry point for the watermark detection application.
+ Run with: python run.py
+
+ docker build -t wm-detector .
+ docker run -p 7860:7860 wm-detector
+ """
+
+ from wm_detector.web.app import app
+
+ if __name__ == "__main__":
+     app.run(host='0.0.0.0', port=7860)
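Not part of the commit, but a quick way to sanity-check the new entry point without building the container is Flask's test client; this sketch assumes the package root is on PYTHONPATH and the tokenizer cache is already populated:

    # Hedged smoke test for the run.py entry point (illustrative, not in the repo)
    from wm_detector.web.app import app

    client = app.test_client()           # Flask's built-in test client
    response = client.get("/")           # the index route defined in wm_detector/web/app.py
    assert response.status_code == 200   # page renders without starting a real server
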
sandbox.ipynb CHANGED
@@ -9,14 +9,44 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
     "from transformers import AutoTokenizer, LlamaForCausalLM\n",
     "\n",
     "model_id = \"meta-llama/Llama-3.2-1B-Instruct\"\n",
-    "tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=\"static/hf_cache\")"
+    "tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=\"wm_detector/static/hf_cache\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[4438, 311, 1304, 264, 19692]\n",
+      "['How', 'Ġto', 'Ġmake', 'Ġa', 'Ġcake']\n",
+      "['How', ' to', ' make', ' a', ' cake']\n"
+     ]
+    }
+   ],
+   "source": [
+    "def tokenize_text(text):\n",
+    "    return tokenizer.encode(text, add_special_tokens=False)\n",
+    "\n",
+    "text = \"How to make a cake\"\n",
+    "token_ids = tokenize_text(text)\n",
+    "tokens = tokenizer.convert_ids_to_tokens(token_ids)\n",
+    "token_strs = [tokenizer.convert_tokens_to_string([token]) for token in tokens]\n",
+    "decoded = tokenizer.decode(tokenize_text(text))\n",
+    "\n",
+    "print(token_ids)\n",
+    "print(tokens)\n",
+    "print(token_strs)"
    ]
   },
   {
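The second output line shows raw byte-level BPE pieces, where the Ġ prefix marks a leading space; calling convert_tokens_to_string on each piece individually (third line) recovers the display form. A minimal sketch of the same round trip outside the notebook, assuming the gated Llama tokenizer is already cached locally:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct", cache_dir="wm_detector/static/hf_cache"
    )
    ids = tokenizer.encode("How to make a cake", add_special_tokens=False)
    pieces = tokenizer.convert_ids_to_tokens(ids)   # ['How', 'Ġto', 'Ġmake', 'Ġa', 'Ġcake']
    strs = [tokenizer.convert_tokens_to_string([p]) for p in pieces]
    assert "".join(strs) == "How to make a cake"    # per-token strings concatenate back
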
templates/index.html DELETED
@@ -1,44 +0,0 @@
- <!DOCTYPE html>
- <html>
- <head>
-     <title>Reward Simulator</title>
-     <meta name="viewport" content="width=device-width, initial-scale=1">
-     <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
-     <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/font/bootstrap-icons.css">
-     <link rel="stylesheet" href="templates/style.css">
- </head>
- <body>
-     <!-- Form to input text and display tokens with color highlights and final p-value -->
-     <form method="POST" action="/">
-         <div class="mb-3">
-             <label for="user_text" class="form-label">Enter text to detect watermark:</label>
-             <textarea class="form-control" id="user_text" name="user_text" rows="5"
-                       placeholder="Paste or write your text here..."></textarea>
-         </div>
-         <button type="submit" class="btn btn-primary">Detect</button>
-     </form>
-
-     <!-- Display tokens with color highlighting -->
-     <div class="mt-3">
-         {% if tokens %}
-             <p>
-             {% for i in range(tokens|length) %}
-                 <span style="background-color: {{ colors[i] }}">{{ tokens[i] }}</span>
-             {% endfor %}
-             </p>
-             <div class="alert alert-info">Final p-value: {{ pvalue }}</div>
-         {% endif %}
-     </div>
-
-     <!-- Footer Section -->
-     <div class="row mt-5">
-         <div class="col-12">
-             <footer>
-                 <p class="text-center text-muted">
-                     &copy; 2025 Interactive Text Watermark Detection. Apache 2.0.
-                 </p>
-             </footer>
-         </div>
-     </div>
- </body>
- </html>
static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/02ee80b6196926a5ad790a004d9efd6ab1ba6542.lock β†’ tests/__init__.py RENAMED
File without changes
static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/4ff488a165e900e5129cda7c20ab32d568d2a475.lock β†’ wm_detector/__init__.py RENAMED
File without changes
static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8.lock β†’ wm_detector/core/__init__.py RENAMED
File without changes
{src β†’ wm_detector/core}/detector.py RENAMED
@@ -4,7 +4,7 @@
  # This source code is licensed under the license found in the
  # LICENSE file in the root directory of this source tree.
 
- from typing import List, Dict, Callable
+ from typing import List, Dict, Callable, Tuple
 
  import numpy as np
  from scipy import special
@@ -13,6 +13,8 @@ from scipy.optimize import fminbound
  import torch
  from transformers import AutoTokenizer, LlamaForCausalLM
 
+ import random
+
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
  class WmDetector():
@@ -143,6 +145,31 @@ class WmDetector():
          """ compute the p-value for a couple of score and number of tokens """
          raise NotImplementedError
 
+     def tokenize_text(self, text: str) -> Tuple[List[str], List[str]]:
+         """
+         Tokenize text and return tokens with their display colors.
+         Returns:
+             tuple: (tokens, colors)
+         """
+         # Tokenize the text
+         token_ids = self.tokenizer.encode(text, add_special_tokens=False)
+         # Convert ids back to tokens for display
+         tokens = [self.tokenizer.decode([id]) for id in token_ids]
+         # Generate colors for visualization
+         colors = self.generate_token_colors(tokens)
+
+         return tokens, colors
+
+     def generate_token_colors(self, tokens: List[str]) -> List[str]:
+         """Generate pastel colors for tokens."""
+         def generate_pastel_color():
+             # Generate lighter/pastel colors
+             h = random.random()  # Random hue
+             s = 0.3 + random.random() * 0.2  # Saturation between 0.3 and 0.5
+             l = 0.8 + random.random() * 0.1  # Lightness between 0.8 and 0.9
+             return f"hsl({h*360}, {s*100}%, {l*100}%)"
+
+         return [generate_pastel_color() for _ in tokens]
 
  class MarylandDetector(WmDetector):
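The two methods added to WmDetector above can be driven directly; a hedged usage sketch, assuming MarylandDetector needs no constructor arguments beyond the tokenizer (as in the deleted app.py) and a locally cached tokenizer:

    from transformers import AutoTokenizer
    from wm_detector.core.detector import MarylandDetector

    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct", cache_dir="wm_detector/static/hf_cache"
    )
    detector = MarylandDetector(tokenizer=tokenizer)
    tokens, colors = detector.tokenize_text("How to make a cake")
    # tokens -> ['How', ' to', ' make', ' a', ' cake'] (per-id decode keeps spaces)
    # colors -> one random pastel hsl(...) string per token
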
static/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/.no_exist/9213176726f574b556790deb65791e0c5aa438b6/added_tokens.json β†’ wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/02ee80b6196926a5ad790a004d9efd6ab1ba6542.lock RENAMED
File without changes
templates/styles.css β†’ wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/4ff488a165e900e5129cda7c20ab32d568d2a475.lock RENAMED
File without changes
wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8.lock ADDED
File without changes
wm_detector/static/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/.no_exist/9213176726f574b556790deb65791e0c5aa438b6/added_tokens.json ADDED
File without changes
{static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/02ee80b6196926a5ad790a004d9efd6ab1ba6542 RENAMED
File without changes
{static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/4ff488a165e900e5129cda7c20ab32d568d2a475 RENAMED
File without changes
{static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8 RENAMED
File without changes
{static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/refs/main RENAMED
File without changes
{static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json RENAMED
File without changes
{static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json RENAMED
File without changes
{static β†’ wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json RENAMED
File without changes
wm_detector/static/styles.css ADDED
@@ -0,0 +1,68 @@
+ body {
+     background-color: #f0f0f0;
+     color: #333;
+     font-family: helvetica, sans-serif;
+     line-height: 1.5;
+     padding: 20px;
+ }
+
+ .container {
+     background-color: #fff;
+     border-radius: 5px;
+     padding: 20px;
+     box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12), 0 1px 2px rgba(0, 0, 0, 0.24);
+     max-width: 1200px;
+     margin: 0 auto;
+ }
+
+ h1 {
+     font-size: 24px;
+     margin-bottom: 20px;
+ }
+
+ .input-section textarea {
+     width: 100%;
+     padding: 10px;
+     border: 1px solid #ccc;
+     border-radius: 4px;
+     resize: none;
+     font-size: 14px;
+     line-height: 1.5;
+     height: 200px;
+ }
+
+ .token-display {
+     margin: 20px 0;
+     padding: 10px;
+     border: 1px solid #ccc;
+     border-radius: 4px;
+     background: #f8f9fa;
+     min-height: 100px;
+     font-size: 14px;
+     line-height: 1.5;
+ }
+
+ .token {
+     display: inline-block;
+     padding: 2px 4px;
+     margin: 2px;
+     border-radius: 3px;
+     font-family: monospace;
+ }
+
+ .stats-container {
+     margin-top: 20px;
+     display: flex;
+     gap: 30px;
+ }
+
+ .stat-value {
+     font-size: 32px;
+     font-weight: 500;
+     color: #333;
+ }
+
+ .stat-label {
+     color: #666;
+     font-size: 20px;
+ }
wm_detector/templates/index.html ADDED
@@ -0,0 +1,66 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>Watermark Detector</title>
+     <meta name="viewport" content="width=device-width, initial-scale=1">
+     <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
+     <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
+ </head>
+ <body>
+     <div class="container">
+         <h1>Interactive watermark detector</h1>
+
+         <!-- Input Form -->
+         <div class="input-section">
+             <textarea id="user_text"
+                       placeholder="Replace this text in the input field to see how watermark detection works."></textarea>
+         </div>
+
+         <!-- Token Display -->
+         <div class="token-display" id="tokenDisplay"></div>
+
+         <!-- Statistics -->
+         <div class="stats-container">
+             <div>
+                 <div class="stat-value" id="tokenCount">0</div>
+                 <div class="stat-label">Tokens</div>
+             </div>
+         </div>
+     </div>
+
+     <script>
+         let debounceTimeout;
+         const textarea = document.getElementById('user_text');
+         const tokenDisplay = document.getElementById('tokenDisplay');
+         const tokenCount = document.getElementById('tokenCount');
+
+         async function updateTokenization() {
+             const text = textarea.value;
+             const response = await fetch('/tokenize', {
+                 method: 'POST',
+                 headers: {
+                     'Content-Type': 'application/json',
+                 },
+                 body: JSON.stringify({ text: text })
+             });
+             const data = await response.json();
+
+             // Update token display
+             tokenDisplay.innerHTML = data.tokens.map((token, i) =>
+                 `<span class="token" style="background-color: ${data.colors[i]}">${token}</span>`
+             ).join('');
+
+             // Update counts
+             tokenCount.textContent = data.token_count;
+         }
+
+         textarea.addEventListener('input', function() {
+             clearTimeout(debounceTimeout);
+             debounceTimeout = setTimeout(updateTokenization, 300);
+         });
+
+         // Initial tokenization
+         updateTokenization();
+     </script>
+ </body>
+ </html>
wm_detector/web/__init__.py ADDED
File without changes
wm_detector/web/app.py ADDED
@@ -0,0 +1,50 @@
+ """
+ Main Flask application for the watermark detection web interface.
+ """
+
+ from flask import Flask, render_template, request, jsonify
+ from ..core.detector import MarylandDetector, AutoTokenizer
+ from .utils import tokenize_text
+
+ def create_app():
+     app = Flask(__name__,
+                 static_folder='../static',
+                 template_folder='../templates')
+
+     # Add zip to Jinja's global context
+     app.jinja_env.globals.update(zip=zip)
+
+     # Minimal setup: pick a detector (example: MarylandDetector)
+     model_id = "meta-llama/Llama-3.2-1B-Instruct"
+     tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="wm_detector/static/hf_cache")
+     detector = MarylandDetector(tokenizer=tokenizer)
+
+     @app.route("/", methods=["GET"])
+     def index():
+         return render_template("index.html")
+
+     @app.route("/tokenize", methods=["POST"])
+     def tokenize():
+         data = request.get_json()
+         text = data.get('text', '')
+
+         if text:
+             tokens, colors, token_ids = tokenize_text(text, tokenizer)
+             return jsonify({
+                 'tokens': tokens,
+                 'colors': colors,
+                 'token_count': len(tokens)
+             })
+
+         return jsonify({
+             'tokens': [],
+             'colors': [],
+             'token_count': 0
+         })
+
+     return app
+
+ app = create_app()
+
+ if __name__ == "__main__":
+     app.run(host='0.0.0.0', port=7860)
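Again not part of the commit: the /tokenize endpoint can be exercised end-to-end with Flask's test client, which checks the JSON contract consumed by the template's fetch() call above. A sketch assuming the tokenizer cache is populated:

    # Illustrative check of the /tokenize JSON contract
    from wm_detector.web.app import app

    client = app.test_client()
    resp = client.post("/tokenize", json={"text": "How to make a cake"})
    data = resp.get_json()
    print(data["token_count"])   # 5 for this input (see the sandbox output above)
    print(data["tokens"])        # display strings, one per token
    print(data["colors"])        # matching hsl(...) background colors
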
wm_detector/web/utils.py ADDED
@@ -0,0 +1,19 @@
+ import random
+ from ..core.detector import AutoTokenizer
+
+ def generate_pastel_color():
+     """Generate a pastel color in HSL format."""
+     h = random.random()  # Random hue
+     s = 0.3 + random.random() * 0.2  # Saturation between 0.3 and 0.5
+     l = 0.8 + random.random() * 0.1  # Lightness between 0.8 and 0.9
+     return f"hsl({h*360}, {s*100}%, {l*100}%)"
+
+ def tokenize_text(text, tokenizer):
+     """Tokenize text and return tokens with display info."""
+     token_ids = tokenizer.encode(text, add_special_tokens=False)
+     # Convert ids to displayable tokens
+     tokens = tokenizer.convert_ids_to_tokens(token_ids)
+     tokens_str = [tokenizer.convert_tokens_to_string([token]) for token in tokens]
+     # Generate pastel colors for each token
+     colors = [generate_pastel_color() for _ in tokens]
+     return tokens_str, colors, token_ids
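Note that this tokenize_text returns display strings (via convert_tokens_to_string) rather than raw Ġ-prefixed BPE pieces, so the front end can concatenate them back into the original text. A hedged usage sketch, assuming a cached tokenizer:

    from transformers import AutoTokenizer
    from wm_detector.web.utils import tokenize_text

    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct", cache_dir="wm_detector/static/hf_cache"
    )
    tokens, colors, ids = tokenize_text("How to make a cake", tokenizer)
    assert "".join(tokens) == "How to make a cake"   # display strings round-trip
    assert len(tokens) == len(colors) == len(ids)    # one color per token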