Commit b0c3821
1 Parent(s): 41aa30f

first app - tokenization works
Files changed:

- Dockerfile +12 -25
- app.py +0 -55
- llama3-tokenizer-js +1 -0
- run.py +12 -0
- sandbox.ipynb +32 -2
- templates/index.html +0 -44
- static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/02ee80b6196926a5ad790a004d9efd6ab1ba6542.lock → tests/__init__.py +0 -0
- static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/4ff488a165e900e5129cda7c20ab32d568d2a475.lock → wm_detector/__init__.py +0 -0
- static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8.lock → wm_detector/core/__init__.py +0 -0
- {src → wm_detector/core}/detector.py +28 -1
- static/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/.no_exist/9213176726f574b556790deb65791e0c5aa438b6/added_tokens.json → wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/02ee80b6196926a5ad790a004d9efd6ab1ba6542.lock +0 -0
- templates/styles.css → wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/4ff488a165e900e5129cda7c20ab32d568d2a475.lock +0 -0
- wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8.lock +0 -0
- wm_detector/static/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/.no_exist/9213176726f574b556790deb65791e0c5aa438b6/added_tokens.json +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/02ee80b6196926a5ad790a004d9efd6ab1ba6542 +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/4ff488a165e900e5129cda7c20ab32d568d2a475 +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8 +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/refs/main +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json +0 -0
- {static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json +0 -0
- wm_detector/static/styles.css +68 -0
- wm_detector/templates/index.html +66 -0
- wm_detector/web/__init__.py +0 -0
- wm_detector/web/app.py +50 -0
- wm_detector/web/utils.py +19 -0
Dockerfile
CHANGED
@@ -1,37 +1,24 @@
 FROM python:3.9-slim
 
-# WORKDIR /app
-
-# COPY requirements.txt .
-# RUN pip install --no-cache-dir -r requirements.txt
-
-# COPY . .
-
-# EXPOSE 8080
-
-# CMD ["python", "app.py"]
-
-
-FROM python:3.9-slim
-
-# Set the working directory inside the container
 WORKDIR /app
 
-#
+# Copy only the requirements first to leverage Docker cache
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Copy the rest of the application
-COPY
+COPY wm_detector/ ./wm_detector/
+COPY run.py .
 
-#
-
-#
-
-#
-
+# Create necessary directories
+RUN mkdir -p wm_detector/static/hf_cache
+
+# Set environment variables
+ENV PYTHONPATH=/app
+ENV FLASK_APP=run.py
+
+# Expose the port the app runs on
+EXPOSE 7860
 
 # Command to run the application
-CMD ["python", "
+CMD ["python", "run.py"]
app.py
DELETED
@@ -1,55 +0,0 @@
-"""
-docker build -t wm-detector .
-docker run -p 7860:7860 -v $(pwd)/data:/app/data wm-detector
-"""
-
-from flask import Flask, render_template, request
-import torch
-import numpy as np
-from src.detector import MarylandDetector, AutoTokenizer
-
-app = Flask(__name__)
-
-# Minimal setup: pick a detector (example: MarylandDetector)
-model_id = "meta-llama/Llama-3.2-1B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="static/hf_cache")
-detector = MarylandDetector(tokenizer=tokenizer)
-
-def tokenize_text(text):
-    return tokenizer.encode(text, add_special_tokens=False)
-
-def compute_scores(tokens):
-    # Convert tokens to the detector's format
-    score_list = []
-    for i in range(len(tokens)):
-        if i < detector.ngram:
-            score_list.append(0)
-            continue
-        ngram_tokens = tokens[i-detector.ngram:i]
-        curr_score = detector.score_tok(ngram_tokens, tokens[i]).sum().item()
-        score_list.append(curr_score)
-    # Compute final p-value (example uses sum of scores)
-    final_pvalue = detector.get_pvalue(sum(score_list), len(score_list), 1e-10)
-    return score_list, final_pvalue
-
-@app.route("/", methods=["GET", "POST"])
-def index():
-    tokens, colors, pvalue = [], [], None
-    if request.method == "POST":
-        user_text = request.form.get("user_text", "")
-        tokens = tokenize_text(user_text)
-        score_list, pvalue = compute_scores(tokens)
-        # Convert token IDs to text
-        displayed_tokens = tokenizer.convert_ids_to_tokens(tokens)
-        # Assign a simple color scale based on score
-        max_score = max(score_list) if score_list else 1
-        colors = [f"rgba(255, 0, 0, {s/max_score})" if max_score != 0 else "white"
-                  for s in score_list]
-        return render_template("index.html",
-                               tokens=displayed_tokens,
-                               colors=colors,
-                               pvalue=pvalue)
-    return render_template("index.html", tokens=tokens, colors=colors, pvalue=pvalue)
-
-if __name__ == "__main__":
-    app.run(host='0.0.0.0', port=7860)
llama3-tokenizer-js
ADDED
@@ -0,0 +1 @@
+Subproject commit 064ddf3177369cd82639b43d160ebc83b4c1d362
run.py
ADDED
@@ -0,0 +1,12 @@
+"""
+Main entry point for the watermark detection application.
+Run with: python run.py
+
+docker build -t wm-detector .
+docker run -p 7860:7860 wm-detector
+"""
+
+from wm_detector.web.app import app
+
+if __name__ == "__main__":
+    app.run(host='0.0.0.0', port=7860)
sandbox.ipynb
CHANGED
@@ -9,14 +9,44 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
     "from transformers import AutoTokenizer, LlamaForCausalLM\n",
     "\n",
     "model_id = \"meta-llama/Llama-3.2-1B-Instruct\"\n",
-    "tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=\"static/hf_cache\")"
+    "tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=\"wm_detector/static/hf_cache\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[4438, 311, 1304, 264, 19692]\n",
+      "['How', 'Ġto', 'Ġmake', 'Ġa', 'Ġcake']\n",
+      "['How', ' to', ' make', ' a', ' cake']\n"
+     ]
+    }
+   ],
+   "source": [
+    "def tokenize_text(text):\n",
+    "    return tokenizer.encode(text, add_special_tokens=False)\n",
+    "\n",
+    "text = \"How to make a cake\"\n",
+    "token_ids = tokenize_text(text)\n",
+    "tokens = tokenizer.convert_ids_to_tokens(token_ids)\n",
+    "token_strs = [tokenizer.convert_tokens_to_string([token]) for token in tokens]\n",
+    "decoded = tokenizer.decode(tokenize_text(text))\n",
+    "\n",
+    "print(token_ids)\n",
+    "print(tokens)\n",
+    "print(token_strs)"
    ]
   },
   {
templates/index.html
DELETED
@@ -1,44 +0,0 @@
-<!DOCTYPE html>
-<html>
-<head>
-    <title>Reward Simulator</title>
-    <meta name="viewport" content="width=device-width, initial-scale=1">
-    <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
-    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/font/bootstrap-icons.css">
-    <link rel="stylesheet" href="templates/style.css">
-</head>
-<body>
-    <!-- Form to input text and display tokens with color highlights and final p-value -->
-    <form method="POST" action="/">
-        <div class="mb-3">
-            <label for="user_text" class="form-label">Enter text to detect watermark:</label>
-            <textarea class="form-control" id="user_text" name="user_text" rows="5"
-                      placeholder="Paste or write your text here..."></textarea>
-        </div>
-        <button type="submit" class="btn btn-primary">Detect</button>
-    </form>
-
-    <!-- Display tokens with color highlighting -->
-    <div class="mt-3">
-        {% if tokens %}
-        <p>
-            {% for i in range(tokens|length) %}
-            <span style="background-color: {{ colors[i] }}">{{ tokens[i] }}</span>
-            {% endfor %}
-        </p>
-        <div class="alert alert-info">Final p-value: {{ pvalue }}</div>
-        {% endif %}
-    </div>
-
-    <!-- Footer Section -->
-    <div class="row mt-5">
-        <div class="col-12">
-            <footer>
-                <p class="text-center text-muted">
-                    © 2025 Interactive Text Watermark Detection. Apache 2.0.
-                </p>
-            </footer>
-        </div>
-    </div>
-</body>
-</html>
static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/02ee80b6196926a5ad790a004d9efd6ab1ba6542.lock → tests/__init__.py
RENAMED
File without changes

static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/4ff488a165e900e5129cda7c20ab32d568d2a475.lock → wm_detector/__init__.py
RENAMED
File without changes

static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8.lock → wm_detector/core/__init__.py
RENAMED
File without changes
{src → wm_detector/core}/detector.py
RENAMED
@@ -4,7 +4,7 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import List, Dict, Callable
+from typing import List, Dict, Callable, Tuple
 
 import numpy as np
 from scipy import special
@@ -13,6 +13,8 @@ from scipy.optimize import fminbound
 import torch
 from transformers import AutoTokenizer, LlamaForCausalLM
 
+import random
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 class WmDetector():
@@ -143,6 +145,31 @@ class WmDetector():
         """ compute the p-value for a couple of score and number of tokens """
         raise NotImplementedError
 
+    def tokenize_text(self, text: str) -> Tuple[List[str], List[str]]:
+        """
+        Tokenize text and return tokens with their display colors.
+        Returns:
+            tuple: (tokens, colors)
+        """
+        # Tokenize the text
+        token_ids = self.tokenizer.encode(text, add_special_tokens=False)
+        # Convert ids back to tokens for display
+        tokens = [self.tokenizer.decode([id]) for id in token_ids]
+        # Generate colors for visualization
+        colors = self.generate_token_colors(tokens)
+
+        return tokens, colors
+
+    def generate_token_colors(self, tokens: List[str]) -> List[str]:
+        """Generate pastel colors for tokens."""
+        def generate_pastel_color():
+            # Generate lighter/pastel colors
+            h = random.random()  # Random hue
+            s = 0.3 + random.random() * 0.2  # Saturation between 0.3-0.5
+            l = 0.8 + random.random() * 0.1  # Lightness between 0.8-0.9
+            return f"hsl({h*360}, {s*100}%, {l*100}%)"
+
+        return [generate_pastel_color() for _ in tokens]
 
 class MarylandDetector(WmDetector):
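Note: a minimal sketch of how the new tokenize_text / generate_token_colors helpers can be exercised, mirroring the detector setup used in wm_detector/web/app.py. The model ID and cache directory come from this commit; access to the gated Llama model and an installed transformers package are assumed.

    # Sketch only: mirrors the detector setup used elsewhere in this commit.
    from transformers import AutoTokenizer
    from wm_detector.core.detector import MarylandDetector

    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct",
        cache_dir="wm_detector/static/hf_cache",
    )
    detector = MarylandDetector(tokenizer=tokenizer)

    # tokenize_text returns one decoded string per token plus a random
    # pastel hsl(...) color for each, ready for the web UI to render.
    tokens, colors = detector.tokenize_text("How to make a cake")
    for token, color in zip(tokens, colors):
        print(f"{token!r:12} {color}")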
static/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/.no_exist/9213176726f574b556790deb65791e0c5aa438b6/added_tokens.json → wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/02ee80b6196926a5ad790a004d9efd6ab1ba6542.lock
RENAMED
File without changes

templates/styles.css → wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/4ff488a165e900e5129cda7c20ab32d568d2a475.lock
RENAMED
File without changes

wm_detector/static/hf_cache/.locks/models--meta-llama--Llama-3.2-1B-Instruct/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8.lock
ADDED
File without changes

wm_detector/static/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/.no_exist/9213176726f574b556790deb65791e0c5aa438b6/added_tokens.json
ADDED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/02ee80b6196926a5ad790a004d9efd6ab1ba6542
RENAMED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/4ff488a165e900e5129cda7c20ab32d568d2a475
RENAMED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/blobs/5cc5f00a5b203e90a27a3bd60d1ec393b07971e8
RENAMED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/refs/main
RENAMED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json
RENAMED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json
RENAMED
File without changes

{static → wm_detector/static}/hf_cache/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json
RENAMED
File without changes
wm_detector/static/styles.css
ADDED
@@ -0,0 +1,68 @@
+body {
+    background-color: #f0f0f0;
+    color: #333;
+    font-family: helvetica, sans-serif;
+    line-height: 1.5;
+    padding: 20px;
+}
+
+.container {
+    background-color: #fff;
+    border-radius: 5px;
+    padding: 20px;
+    box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12), 0 1px 2px rgba(0, 0, 0, 0.24);
+    max-width: 1200px;
+    margin: 0 auto;
+}
+
+h1 {
+    font-size: 24px;
+    margin-bottom: 20px;
+}
+
+.input-section textarea {
+    width: 100%;
+    padding: 10px;
+    border: 1px solid #ccc;
+    border-radius: 4px;
+    resize: none;
+    font-size: 14px;
+    line-height: 1.5;
+    height: 200px;
+}
+
+.token-display {
+    margin: 20px 0;
+    padding: 10px;
+    border: 1px solid #ccc;
+    border-radius: 4px;
+    background: #f8f9fa;
+    min-height: 100px;
+    font-size: 14px;
+    line-height: 1.5;
+}
+
+.token {
+    display: inline-block;
+    padding: 2px 4px;
+    margin: 2px;
+    border-radius: 3px;
+    font-family: monospace;
+}
+
+.stats-container {
+    margin-top: 20px;
+    display: flex;
+    gap: 30px;
+}
+
+.stat-value {
+    font-size: 32px;
+    font-weight: 500;
+    color: #333;
+}
+
+.stat-label {
+    color: #666;
+    font-size: 20px;
+}
wm_detector/templates/index.html
ADDED
@@ -0,0 +1,66 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Watermark Detector</title>
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
+    <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
+</head>
+<body>
+    <div class="container">
+        <h1>Interactive watermark detector</h1>
+
+        <!-- Input Form -->
+        <div class="input-section">
+            <textarea id="user_text"
+                placeholder="Replace this text in the input field to see how watermark detection works."></textarea>
+        </div>
+
+        <!-- Token Display -->
+        <div class="token-display" id="tokenDisplay"></div>
+
+        <!-- Statistics -->
+        <div class="stats-container">
+            <div>
+                <div class="stat-value" id="tokenCount">0</div>
+                <div class="stat-label">Tokens</div>
+            </div>
+        </div>
+    </div>
+
+    <script>
+        let debounceTimeout;
+        const textarea = document.getElementById('user_text');
+        const tokenDisplay = document.getElementById('tokenDisplay');
+        const tokenCount = document.getElementById('tokenCount');
+
+        async function updateTokenization() {
+            const text = textarea.value;
+            const response = await fetch('/tokenize', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json',
+                },
+                body: JSON.stringify({ text: text })
+            });
+            const data = await response.json();
+
+            // Update token display
+            tokenDisplay.innerHTML = data.tokens.map((token, i) =>
+                `<span class="token" style="background-color: ${data.colors[i]}">${token}</span>`
+            ).join('');
+
+            // Update counts
+            tokenCount.textContent = data.token_count;
+        }
+
+        textarea.addEventListener('input', function() {
+            clearTimeout(debounceTimeout);
+            debounceTimeout = setTimeout(updateTokenization, 300);
+        });
+
+        // Initial tokenization
+        updateTokenization();
+    </script>
+</body>
+</html>
wm_detector/web/__init__.py
ADDED
File without changes
wm_detector/web/app.py
ADDED
@@ -0,0 +1,50 @@
+"""
+Main Flask application for the watermark detection web interface.
+"""
+
+from flask import Flask, render_template, request, jsonify
+from ..core.detector import MarylandDetector, AutoTokenizer
+from .utils import tokenize_text
+
+def create_app():
+    app = Flask(__name__,
+                static_folder='../static',
+                template_folder='../templates')
+
+    # Add zip to Jinja's global context
+    app.jinja_env.globals.update(zip=zip)
+
+    # Minimal setup: pick a detector (example: MarylandDetector)
+    model_id = "meta-llama/Llama-3.2-1B-Instruct"
+    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="wm_detector/static/hf_cache")
+    detector = MarylandDetector(tokenizer=tokenizer)
+
+    @app.route("/", methods=["GET"])
+    def index():
+        return render_template("index.html")
+
+    @app.route("/tokenize", methods=["POST"])
+    def tokenize():
+        data = request.get_json()
+        text = data.get('text', '')
+
+        if text:
+            tokens, colors, token_ids = tokenize_text(text, tokenizer)
+            return jsonify({
+                'tokens': tokens,
+                'colors': colors,
+                'token_count': len(tokens)
+            })
+
+        return jsonify({
+            'tokens': [],
+            'colors': [],
+            'token_count': 0
+        })
+
+    return app
+
+app = create_app()
+
+if __name__ == "__main__":
+    app.run(host='0.0.0.0', port=7860)
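Note: the new /tokenize endpoint can also be exercised outside the browser. A quick sketch with the requests library, assuming the app is running locally via python run.py on port 7860:

    # Sketch: query the /tokenize endpoint added in this commit.
    import requests

    resp = requests.post(
        "http://localhost:7860/tokenize",
        json={"text": "How to make a cake"},
    )
    data = resp.json()
    print(data["token_count"])  # number of tokens, e.g. 5
    print(data["tokens"])       # per-token display strings
    print(data["colors"])       # one pastel hsl(...) color per token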
wm_detector/web/utils.py
ADDED
@@ -0,0 +1,19 @@
+import random
+from ..core.detector import AutoTokenizer
+
+def generate_pastel_color():
+    """Generate a pastel color in HSL format."""
+    h = random.random()  # Random hue
+    s = 0.3 + random.random() * 0.2  # Saturation between 0.3-0.5
+    l = 0.8 + random.random() * 0.1  # Lightness between 0.8-0.9
+    return f"hsl({h*360}, {s*100}%, {l*100}%)"
+
+def tokenize_text(text, tokenizer):
+    """Tokenize text and return tokens with display info."""
+    token_ids = tokenizer.encode(text, add_special_tokens=False)
+    # Convert ids to displayable tokens
+    tokens = tokenizer.convert_ids_to_tokens(token_ids)
+    tokens_str = [tokenizer.convert_tokens_to_string([token]) for token in tokens]
+    # Generate pastel colors for each token
+    colors = [generate_pastel_color() for _ in tokens]
+    return tokens_str, colors, token_ids
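Note: a quick sketch of the round trip tokenize_text performs, matching the notebook output above (convert_ids_to_tokens yields Ġ-prefixed BPE tokens, while convert_tokens_to_string restores readable spacing). The tokenizer setup is the same gated-model assumption as before.

    # Sketch: the id -> token -> display-string round trip used by /tokenize.
    from transformers import AutoTokenizer
    from wm_detector.web.utils import tokenize_text

    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct",
        cache_dir="wm_detector/static/hf_cache",
    )
    tokens_str, colors, token_ids = tokenize_text("How to make a cake", tokenizer)
    print(token_ids)   # [4438, 311, 1304, 264, 19692]
    print(tokens_str)  # ['How', ' to', ' make', ' a', ' cake']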