AngelBottomless committed on
Commit a7ab59e · verified · 1 Parent(s): f7165bd

Upload 9 files

Files changed (8)
  1. export.py +30 -0
  2. infer-refined.py +89 -35
  3. infer.py +139 -97
  4. model_code.py +956 -0
  5. model_config.json +9 -0
  6. model_info_initial_only.json +9 -0
  7. model_no_flash.py +195 -0
  8. thresholds.json +170 -0
export.py ADDED
@@ -0,0 +1,30 @@
+ import torch
+ import torchvision.models as models
+ from model_code import InitialOnlyImageTagger  # Assume model_code.py classes are accessible
+ from safetensors.torch import load_file
+
+ # Load the trained weights (Initial-only model). Adjust path to your weights file.
+ #weights_path = "model_initial_only.pt"
+ safetensors_path = 'model_initial.safetensors'
+ state_dict = load_file(safetensors_path, device='cpu')
+ #state_dict = torch.load(weights_path, map_location="cpu")
+ # Instantiate the model with the same parameters as training
+ model = InitialOnlyImageTagger(total_tags=70527, dataset=None, pretrained=True)  # dataset not needed for forward
+ model.load_state_dict(state_dict)
+ model.eval()  # set to evaluation mode
+
+ # Define example input – a dummy image tensor of the expected input shape (1, 3, 512, 512)
+ dummy_input = torch.randn(1, 3, 512, 512, dtype=torch.float32)
+
+ # Export to ONNX
+ onnx_path = "camie_tagger_initial_v15.onnx"
+ torch.onnx.export(
+     model, dummy_input, onnx_path,
+     export_params=True,        # store the trained parameter weights in the model file
+     opset_version=13,          # ONNX opset version (13 is widely supported)
+     do_constant_folding=True,  # optimize constant expressions
+     input_names=["input"],
+     output_names=["initial_logits", "refined_logits"],  # model.forward returns two outputs (identical for InitialOnly)
+     dynamic_axes={"input": {0: "batch_size"}}           # allow variable batch size
+ )
+ print(f"ONNX model saved to: {onnx_path}")
infer-refined.py CHANGED
@@ -42,73 +42,120 @@ def preprocess_image(img_path, target_size=512, keep_aspect=True):
  arr = np.expand_dims(arr, axis=0)
  return arr

- def onnx_inference(img_paths,
-                    onnx_path="camie_refined_no_flash.onnx",
-                    threshold=0.325,
-                    metadata_file="metadata.json"):
+ # Example input
+ def load_thresholds(threshold_json_path, mode="balanced"):
+     """
+     Loads thresholds from the given JSON file, using a particular mode
+     (e.g. 'balanced', 'high_precision', 'high_recall') for each category.
+
+     Returns:
+         thresholds_by_category (dict): e.g. { "general": 0.328..., "character": 0.304..., ... }
+         fallback_threshold (float): The overall threshold if category not found
+     """
+     with open(threshold_json_path, "r", encoding="utf-8") as f:
+         data = json.load(f)
+
+     # The fallback threshold from the "overall" section for the chosen mode
+     fallback_threshold = data["overall"][mode]["threshold"]
+
+     # Build a dict of thresholds keyed by category
+     thresholds_by_category = {}
+     if "categories" in data:
+         for cat_name, cat_modes in data["categories"].items():
+             # If the chosen mode is present for that category, use it;
+             # otherwise fall back to the "overall" threshold.
+             if mode in cat_modes and "threshold" in cat_modes[mode]:
+                 thresholds_by_category[cat_name] = cat_modes[mode]["threshold"]
+             else:
+                 thresholds_by_category[cat_name] = fallback_threshold
+
+     return thresholds_by_category, fallback_threshold
+ def onnx_inference(
+     img_paths,
+     onnx_path="camie_refined_no_flash.onnx",
+     metadata_file="metadata.json",
+     threshold_json_path="thresholds.json",
+     mode="balanced",
+     target_size=512,
+     keep_aspect=True
+ ):
      """
      Loads the ONNX model, runs inference on a list of image paths,
-     and applies an optional threshold to produce final predictions.
-
+     and applies category-wise thresholds from thresholds.json (per the chosen mode).
+
      Args:
-         img_paths: List of paths to images.
-         onnx_path: Path to the exported ONNX model file.
-         threshold: Probability threshold for deciding if a tag is predicted.
-         metadata_file: Path to metadata.json that contains idx_to_tag etc.
-
+         img_paths           : List of paths to images.
+         onnx_path           : Path to the exported ONNX model file.
+         metadata_file       : Path to metadata.json that contains idx_to_tag, tag_to_category, etc.
+         threshold_json_path : Path to thresholds.json containing category-wise threshold info.
+         mode                : "balanced", "high_precision", or "high_recall".
+         target_size         : Final size of preprocessed images (512 by default).
+         keep_aspect         : If True, preserve aspect ratio when resizing, pad with black.
+
      Returns:
-         A list of dicts, each containing:
+         A list of dicts, one per input image, each containing:
          {
            "initial_logits": np.ndarray of shape (N_tags,),
            "refined_logits": np.ndarray of shape (N_tags,),
-           "predicted_tags": list of tag indices that exceeded threshold,
+           "predicted_indices": list of tag indices that exceeded threshold,
+           "predicted_tags": list of predicted tag strings,
            ...
          }
-         one dict per input image.
      """
      # 1) Initialize ONNX runtime session
      session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
-     # Optional: for GPU usage, see if "CUDAExecutionProvider" is available
+     # For GPU usage, you could do e.g.:
      # session = ort.InferenceSession(onnx_path, providers=["CUDAExecutionProvider"])

      # 2) Pre-load metadata
      with open(metadata_file, "r", encoding="utf-8") as f:
          metadata = json.load(f)
-     idx_to_tag = metadata["idx_to_tag"]  # e.g. { "0": "brown_hair", "1": "blue_eyes", ... }
+     idx_to_tag = metadata["idx_to_tag"]  # e.g. { "0": "brown_hair", "1": "blue_eyes", ... }
+     tag_to_category = metadata.get("tag_to_category", {})
+
+     # Load thresholds from thresholds.json using the specified mode
+     thresholds_by_category, fallback_threshold = load_thresholds(threshold_json_path, mode)

      # 3) Preprocess each image into a batch
      batch_tensors = []
      for img_path in img_paths:
-         x = preprocess_image(img_path, target_size=512, keep_aspect=True)
+         x = preprocess_image(img_path, target_size=target_size, keep_aspect=keep_aspect)
          batch_tensors.append(x)
-     # Concatenate along the batch dimension => shape (batch_size, 3, 512, 512)
+     # Concatenate along the batch dimension => shape (batch_size, 3, H, W)
      batch_input = np.concatenate(batch_tensors, axis=0)

      # 4) Run inference
-     input_name = session.get_inputs()[0].name  # typically "image"
+     input_name = session.get_inputs()[0].name  # typically "image" or "input"
      outputs = session.run(None, {input_name: batch_input})
      # Typically we get [initial_tags, refined_tags] as output
-     initial_preds, refined_preds = outputs  # shapes => (batch_size, 70527)
+     initial_preds, refined_preds = outputs  # shapes => (batch_size, N_tags)

-     # 5) For each image in batch, convert logits to predictions if desired
+     # 5) Convert logits -> probabilities -> apply category-specific thresholds
      batch_results = []
      for i in range(initial_preds.shape[0]):
-         # Extract one sample's logits
          init_logit = initial_preds[i, :]  # shape (N_tags,)
          ref_logit = refined_preds[i, :]   # shape (N_tags,)
-
-         # Convert to probabilities with sigmoid
-         ref_prob = 1.0 / (1.0 + np.exp(-ref_logit))
-
-         # Threshold
-         pred_indices = np.where(ref_prob >= threshold)[0]
+         ref_prob = 1.0 / (1.0 + np.exp(-ref_logit))  # shape (N_tags,)
+
+         predicted_indices = []
+         predicted_tags = []

+         # Check each tag against the category threshold
+         for idx in range(ref_logit.shape[0]):
+             tag_name = idx_to_tag[str(idx)]                      # Convert index->string->tag name
+             category = tag_to_category.get(tag_name, "general")  # fallback to "general" if missing
+             cat_threshold = thresholds_by_category.get(category, fallback_threshold)

+             if ref_prob[idx] >= cat_threshold:
+                 predicted_indices.append(idx)
+                 predicted_tags.append(tag_name)

          # Build result for this image
          result_dict = {
              "initial_logits": init_logit,
              "refined_logits": ref_logit,
-             "predicted_indices": pred_indices,
-             "predicted_tags": [idx_to_tag[str(idx)] for idx in pred_indices]  # map index->tag name
+             "predicted_indices": predicted_indices,
+             "predicted_tags": predicted_tags,
          }
          batch_results.append(result_dict)

@@ -116,14 +163,21 @@ def onnx_inference(img_paths,

  if __name__ == "__main__":
      # Example usage
-     images = ["image1.jpg", "image2.jpg", "image3.jpg"]
-     results = onnx_inference(images,
-                              onnx_path="camie_refined_no_flash.onnx",
-                              threshold=0.325,
-                              metadata_file="metadata.json")
+     images = ["images.png"]
+     results = onnx_inference(
+         img_paths=images,
+         onnx_path="camie_refined_no_flash_v15.onnx",
+         metadata_file="metadata.json",
+         threshold_json_path="thresholds.json",
+         mode="balanced",  # or "high_precision", "high_recall"
+         target_size=512,
+         keep_aspect=True
+     )

      for i, res in enumerate(results):
          print(f"Image: {images[i]}")
          print(f"  # of predicted tags above threshold: {len(res['predicted_indices'])}")
-         print(f"  Some predicted tags: {res['predicted_tags'][:10]} (Show up to 10)")
-         print()
+         # Show first 10 predicted tags (if available)
+         sample_tags = res['predicted_tags'][:10]
+         print("  Sample predicted tags:", sample_tags)
+         print()
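load_thresholds expects thresholds.json to carry an "overall" section plus optional per-category sections, each keyed by mode. A sketch of that assumed layout as a Python literal (values here are illustrative, not the shipped ones; 0.328/0.304 echo the docstring example):

example_thresholds = {
    "overall": {
        "balanced":       {"threshold": 0.325},
        "high_precision": {"threshold": 0.5},
        "high_recall":    {"threshold": 0.2},
    },
    "categories": {
        "general":   {"balanced": {"threshold": 0.328}},
        "character": {"balanced": {"threshold": 0.304}},
    },
}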
infer.py CHANGED
@@ -1,98 +1,140 @@
- import onnxruntime as ort
- import numpy as np
- import json
- from PIL import Image
-
- # 1) Load ONNX model
- session = ort.InferenceSession("camie_tagger_initial.onnx", providers=["CPUExecutionProvider"])
-
- # 2) Preprocess your image (512x512, etc.)
- def preprocess_image(img_path):
-     """
-     Loads and resizes an image to 512x512, converts it to float32 [0..1],
-     and returns a (1,3,512,512) NumPy array (NCHW format).
-     """
-     img = Image.open(img_path).convert("RGB").resize((512, 512))
-     x = np.array(img).astype(np.float32) / 255.0
-     x = np.transpose(x, (2, 0, 1))  # HWC -> CHW
-     x = np.expand_dims(x, 0)        # add batch dimension -> (1,3,512,512)
-     return x
-
- # Example input
-
- def inference(input_path, output_format="verbose"):
-     """
-     Returns either:
-       - A verbose category breakdown, or
-       - A comma-separated string of predicted tags (underscores replaced with spaces).
-     """
-     # 1) Preprocess
-     input_tensor = preprocess_image(input_path)
-
-     # 2) Run inference
-     input_name = session.get_inputs()[0].name
-     outputs = session.run(None, {input_name: input_tensor})
-     initial_logits, refined_logits = outputs  # shape: (1, 70527) each
-
-     # 3) Convert logits to probabilities
-     refined_probs = 1 / (1 + np.exp(-refined_logits))  # shape: (1, 70527)
-
-     # 4) Load metadata & retrieve threshold info
-     with open("metadata.json", "r", encoding="utf-8") as f:
-         metadata = json.load(f)
-
-     idx_to_tag = metadata["idx_to_tag"]  # e.g. { "0": "brown_hair", "1": "blue_eyes", ... }
-     tag_to_category = metadata.get("tag_to_category", {})
-     category_thresholds = metadata.get(
-         "category_thresholds",
-         {"artist": 0.1, "character": 0.2, "meta": 0.3, "style": 0.1}
-     )
-     default_threshold = 0.325
-
-     # 5) Collect predictions by category
-     results_by_category = {}
-     num_tags = refined_probs.shape[1]
-
-     for i in range(num_tags):
-         prob = float(refined_probs[0, i])
-         tag_name = idx_to_tag[str(i)]  # str(i) because metadata uses string keys
-         category = tag_to_category.get(tag_name, "unknown")
-         cat_threshold = category_thresholds.get(category, default_threshold)
-
-         if prob >= cat_threshold:
-             if category not in results_by_category:
-                 results_by_category[category] = []
-             results_by_category[category].append((tag_name, prob))
-
-     # 6) Depending on output_format, produce different return strings
-     if output_format == "as_prompt":
-         # Flatten all predicted tags across categories
-         all_predicted_tags = []
-         for cat, tags_list in results_by_category.items():
-             # We only need the tag name in as_prompt format
-             for tname, tprob in tags_list:
-                 # convert underscores to spaces
-                 tag_name_spaces = tname.replace("_", " ")
-                 all_predicted_tags.append(tag_name_spaces)
-
-         # Create a comma-separated string
-         prompt_string = ", ".join(all_predicted_tags)
-         return prompt_string
-
-     else:  # "verbose"
-         # We'll build a multiline string describing the predictions
-         lines = []
-         lines.append("Predicted Tags by Category:\n")
-         for cat, tags_list in results_by_category.items():
-             lines.append(f"Category: {cat} | Predicted {len(tags_list)} tags")
-             # Sort descending by probability
-             for tname, tprob in sorted(tags_list, key=lambda x: x[1], reverse=True):
-                 lines.append(f"  Tag: {tname:30s} Prob: {tprob:.4f}")
-             lines.append("")  # blank line after each category
-         # Join lines with newlines
-         verbose_output = "\n".join(lines)
-         return verbose_output
-
- if __name__ == "__main__":
-     result = inference("path/to/image", output_format="as_prompt")
+ import onnxruntime as ort
+ import numpy as np
+ import json
+ from PIL import Image
+
+ # 1) Load ONNX model
+ session = ort.InferenceSession("camie_tagger_initial_v15.onnx", providers=["CPUExecutionProvider"])
+
+ # 2) Preprocess your image (512x512, etc.)
+ def preprocess_image(img_path):
+     """
+     Loads and resizes an image to 512x512, converts it to float32 [0..1],
+     and returns a (1,3,512,512) NumPy array (NCHW format).
+     """
+     img = Image.open(img_path).convert("RGB").resize((512, 512))
+     x = np.array(img).astype(np.float32) / 255.0
+     x = np.transpose(x, (2, 0, 1))  # HWC -> CHW
+     x = np.expand_dims(x, 0)        # add batch dimension -> (1,3,512,512)
+     return x
+
+ # Example input
+ def load_thresholds(threshold_json_path, mode="balanced"):
+     """
+     Loads thresholds from the given JSON file, using a particular mode
+     (e.g. 'balanced', 'high_precision', 'high_recall') for each category.
+
+     Returns:
+         thresholds_by_category (dict): e.g. { "general": 0.328..., "character": 0.304..., ... }
+         fallback_threshold (float): The overall threshold if category not found
+     """
+     with open(threshold_json_path, "r", encoding="utf-8") as f:
+         data = json.load(f)
+
+     # The fallback threshold from the "overall" section for the chosen mode
+     fallback_threshold = data["overall"][mode]["threshold"]
+
+     # Build a dict of thresholds keyed by category
+     thresholds_by_category = {}
+     if "categories" in data:
+         for cat_name, cat_modes in data["categories"].items():
+             # If the chosen mode is present for that category, use it;
+             # otherwise fall back to the "overall" threshold.
+             if mode in cat_modes and "threshold" in cat_modes[mode]:
+                 thresholds_by_category[cat_name] = cat_modes[mode]["threshold"]
+             else:
+                 thresholds_by_category[cat_name] = fallback_threshold
+
+     return thresholds_by_category, fallback_threshold
+
+ def inference(
+     input_path,
+     output_format="verbose",
+     mode="balanced",
+     threshold_json_path="thresholds.json",
+     metadata_path="metadata.json"
+ ):
+     """
+     Run inference on an image using the loaded ONNX model, then apply
+     category-wise thresholds from `thresholds.json` for the chosen mode.
+
+     Arguments:
+         input_path (str)          : Path to the image file for inference.
+         output_format (str)       : Either "verbose" or "as_prompt".
+         mode (str)                : "balanced", "high_precision", or "high_recall"
+         threshold_json_path (str) : Path to the JSON file with category thresholds.
+         metadata_path (str)       : Path to the metadata JSON file with category info.
+
+     Returns:
+         str: The predicted tags in either verbose or comma-separated format.
+     """
+     # 1) Preprocess
+     input_tensor = preprocess_image(input_path)
+
+     # 2) Run inference
+     input_name = session.get_inputs()[0].name
+     outputs = session.run(None, {input_name: input_tensor})
+     initial_logits, refined_logits = outputs  # shape: (1, 70527) each
+
+     # 3) Convert logits to probabilities
+     refined_probs = 1 / (1 + np.exp(-refined_logits))  # shape: (1, 70527)
+
+     # 4) Load metadata & retrieve threshold info
+     with open(metadata_path, "r", encoding="utf-8") as f:
+         metadata = json.load(f)
+
+     idx_to_tag = metadata["idx_to_tag"]  # e.g. { "0": "brown_hair", "1": "blue_eyes", ... }
+     tag_to_category = metadata.get("tag_to_category", {})
+     # Load thresholds from thresholds.json using the specified mode
+     thresholds_by_category, fallback_threshold = load_thresholds(threshold_json_path, mode)
+
+     # 5) Collect predictions by category
+     results_by_category = {}
+     num_tags = refined_probs.shape[1]
+
+     for i in range(num_tags):
+         prob = float(refined_probs[0, i])
+         tag_name = idx_to_tag[str(i)]  # str(i) because metadata uses string keys
+         category = tag_to_category.get(tag_name, "general")
+
+         # Determine the threshold to use for this category
+         cat_threshold = thresholds_by_category.get(category, fallback_threshold)
+
+         if prob >= cat_threshold:
+             if category not in results_by_category:
+                 results_by_category[category] = []
+             results_by_category[category].append((tag_name, prob))
+
+     # 6) Depending on output_format, produce different return strings
+     if output_format == "as_prompt":
+         # Flatten all predicted tags across categories
+         all_predicted_tags = []
+         for cat, tags_list in results_by_category.items():
+             # We only need the tag name in as_prompt format
+             for tname, tprob in tags_list:
+                 # convert underscores to spaces
+                 tag_name_spaces = tname.replace("_", " ")
+                 all_predicted_tags.append(tag_name_spaces)
+
+         # Create a comma-separated string
+         prompt_string = ", ".join(all_predicted_tags)
+         return prompt_string
+
+     else:  # "verbose"
+         # We'll build a multiline string describing the predictions
+         lines = []
+         lines.append("Predicted Tags by Category:\n")
+         for cat, tags_list in results_by_category.items():
+             lines.append(f"Category: {cat} | Predicted {len(tags_list)} tags")
+             # Sort descending by probability
+             for tname, tprob in sorted(tags_list, key=lambda x: x[1], reverse=True):
+                 lines.append(f"  Tag: {tname:30s} Prob: {tprob:.4f}")
+             lines.append("")  # blank line after each category
+         # Join lines with newlines
+         verbose_output = "\n".join(lines)
+         return verbose_output
+
+ if __name__ == "__main__":
+     result = inference("", output_format="as_prompt")
      print(result)
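inference() handles one image per call; tagging a directory is just a loop over it. A minimal sketch, assuming the functions above are in scope (the images/ folder name and the tag_folder helper are hypothetical):

import glob
import os

def tag_folder(folder, mode="balanced"):
    # Collect an as_prompt string for every .jpg/.png in the folder (hypothetical helper)
    results = {}
    paths = sorted(glob.glob(os.path.join(folder, "*.jpg")) + glob.glob(os.path.join(folder, "*.png")))
    for path in paths:
        results[path] = inference(path, output_format="as_prompt", mode=mode)
    return results

for path, prompt in tag_folder("images").items():
    print(path, "->", prompt)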
model_code.py ADDED
@@ -0,0 +1,956 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torchvision.models import efficientnet_v2_l, EfficientNet_V2_L_Weights
+ from flash_attn import flash_attn_func  # used by FlashAttention below; import added here since the file calls it (requires the flash-attn package)
+ from PIL import Image
+ from typing import Optional
+ import torchvision.transforms as transforms
+ import os
+ import json
+
+ class InitialOnlyImageTagger(nn.Module):
+     """
+     A lightweight version of ImageTagger that only includes the backbone and initial classifier.
+     This model uses significantly less VRAM than the full model.
+     """
+     def __init__(self, total_tags, dataset, model_name='efficientnet_v2_l',
+                  dropout=0.1, pretrained=True):
+         super().__init__()
+         # Debug and stats flags
+         self._flags = {
+             'debug': False,
+             'model_stats': False
+         }
+
+         # Core model config
+         self.dataset = dataset
+         self.embedding_dim = 1280  # Fixed to EfficientNetV2-L output dimension
+
+         # Initialize backbone
+         if model_name == 'efficientnet_v2_l':
+             weights = EfficientNet_V2_L_Weights.DEFAULT if pretrained else None
+             self.backbone = efficientnet_v2_l(weights=weights)
+             self.backbone.classifier = nn.Identity()
+
+         # Spatial pooling only - no projection
+         self.spatial_pool = nn.AdaptiveAvgPool2d((1, 1))
+
+         # Initial tag prediction with bottleneck
+         self.initial_classifier = nn.Sequential(
+             nn.Linear(self.embedding_dim, self.embedding_dim * 2),
+             nn.LayerNorm(self.embedding_dim * 2),
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(self.embedding_dim * 2, self.embedding_dim),
+             nn.LayerNorm(self.embedding_dim),
+             nn.GELU(),
+             nn.Linear(self.embedding_dim, total_tags)
+         )
+
+         # Temperature scaling
+         self.temperature = nn.Parameter(torch.ones(1) * 1.5)
+
+     @property
+     def debug(self):
+         return self._flags['debug']
+
+     @debug.setter
+     def debug(self, value):
+         self._flags['debug'] = value
+
+     @property
+     def model_stats(self):
+         return self._flags['model_stats']
+
+     @model_stats.setter
+     def model_stats(self, value):
+         self._flags['model_stats'] = value
+
+     def preprocess_image(self, image_path, image_size=512):
+         """Process an image for inference using same preprocessing as training"""
+         if not os.path.exists(image_path):
+             raise ValueError(f"Image not found at path: {image_path}")
+
+         # Initialize the same transform used during training
+         transform = transforms.Compose([
+             transforms.ToTensor(),
+         ])
+
+         try:
+             with Image.open(image_path) as img:
+                 # Convert RGBA or Palette images to RGB
+                 if img.mode in ('RGBA', 'P'):
+                     img = img.convert('RGB')
+
+                 # Get original dimensions
+                 width, height = img.size
+                 aspect_ratio = width / height
+
+                 # Calculate new dimensions to maintain aspect ratio
+                 if aspect_ratio > 1:
+                     new_width = image_size
+                     new_height = int(new_width / aspect_ratio)
+                 else:
+                     new_height = image_size
+                     new_width = int(new_height * aspect_ratio)
+
+                 # Resize with LANCZOS filter
+                 img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+                 # Create new image with padding
+                 new_image = Image.new('RGB', (image_size, image_size), (0, 0, 0))
+                 paste_x = (image_size - new_width) // 2
+                 paste_y = (image_size - new_height) // 2
+                 new_image.paste(img, (paste_x, paste_y))
+
+                 # Apply transforms (without normalization)
+                 img_tensor = transform(new_image)
+                 return img_tensor
+         except Exception as e:
+             raise Exception(f"Error processing {image_path}: {str(e)}")
+
+     def forward(self, x):
+         """Forward pass with only the initial predictions"""
+         # Image Feature Extraction
+         features = self.backbone.features(x)
+         features = self.spatial_pool(features).squeeze(-1).squeeze(-1)
+
+         # Initial Tag Predictions
+         initial_logits = self.initial_classifier(features)
+         initial_preds = torch.clamp(initial_logits / self.temperature, min=-15.0, max=15.0)
+
+         # For API compatibility with the full model, return the same predictions twice
+         return initial_preds, initial_preds
+
+     def predict(self, image_path, threshold=0.325, category_thresholds=None):
+         """
+         Run inference on an image with support for category-specific thresholds.
+         """
+         # Preprocess the image
+         img_tensor = self.preprocess_image(image_path).unsqueeze(0)
+
+         # Move to the same device as model and convert to half precision
+         device = next(self.parameters()).device
+         dtype = next(self.parameters()).dtype  # Match model's precision
+         img_tensor = img_tensor.to(device, dtype=dtype)
+
+         # Run inference
+         with torch.no_grad():
+             initial_preds, _ = self.forward(img_tensor)
+
+         # Apply sigmoid to get probabilities
+         initial_probs = torch.sigmoid(initial_preds)
+
+         # Apply thresholds
+         if category_thresholds:
+             # Create binary prediction tensors
+             initial_binary = torch.zeros_like(initial_probs)
+
+             # Apply thresholds by category
+             for category, cat_threshold in category_thresholds.items():
+                 # Create a mask for tags in this category
+                 category_mask = torch.zeros_like(initial_probs, dtype=torch.bool)
+
+                 # Find indices for this category
+                 for tag_idx in range(initial_probs.size(-1)):
+                     try:
+                         _, tag_category = self.dataset.get_tag_info(tag_idx)
+                         if tag_category == category:
+                             category_mask[:, tag_idx] = True
+                     except:
+                         continue
+
+                 # Apply threshold only to tags in this category
+                 cat_threshold_tensor = torch.tensor(cat_threshold, device=device, dtype=dtype)
+                 initial_binary[category_mask] = (initial_probs[category_mask] >= cat_threshold_tensor).to(dtype)
+
+             predictions = initial_binary
+         else:
+             # Use the same threshold for all tags
+             threshold_tensor = torch.tensor(threshold, device=device, dtype=dtype)
+             predictions = (initial_probs >= threshold_tensor).to(dtype)
+
+         # Return the same probabilities for both initial and refined for API compatibility
+         return {
+             'initial_probabilities': initial_probs,
+             'refined_probabilities': initial_probs,  # Same as initial for compatibility
+             'predictions': predictions
+         }
+
+     def get_tags_from_predictions(self, predictions, include_probabilities=True):
+         """
+         Convert model predictions to human-readable tags grouped by category.
+         """
+         # Get non-zero predictions
+         if predictions.dim() > 1:
+             predictions = predictions[0]  # Remove batch dimension
+
+         # Get indices of positive predictions
+         indices = torch.where(predictions > 0)[0].cpu().tolist()
+
+         # Group by category
+         result = {}
+         for idx in indices:
+             tag_name, category = self.dataset.get_tag_info(idx)
+
+             if category not in result:
+                 result[category] = []
+
+             if include_probabilities:
+                 prob = predictions[idx].item()
+                 result[category].append((tag_name, prob))
+             else:
+                 result[category].append(tag_name)
+
+         # Sort tags by probability within each category
+         if include_probabilities:
+             for category in result:
+                 result[category] = sorted(result[category], key=lambda x: x[1], reverse=True)
+
+         return result
+
+ class FlashAttention(nn.Module):
+     def __init__(self, dim, num_heads=8, dropout=0.1, batch_first=True):
+         super().__init__()
+         self.dim = dim
+         self.num_heads = num_heads
+         self.dropout = dropout
+         self.batch_first = batch_first
+         self.head_dim = dim // num_heads
+         assert self.head_dim * num_heads == dim, "dim must be divisible by num_heads"
+
+         self.q_proj = nn.Linear(dim, dim, bias=False)
+         self.k_proj = nn.Linear(dim, dim, bias=False)
+         self.v_proj = nn.Linear(dim, dim, bias=False)
+         self.out_proj = nn.Linear(dim, dim, bias=False)
+
+         for proj in [self.q_proj, self.k_proj, self.v_proj, self.out_proj]:
+             nn.init.xavier_uniform_(proj.weight, gain=0.1)
+
+         self.scale = self.head_dim ** -0.5
+         self.debug = False
+
+     def _debug_print(self, name, tensor):
+         """Debug helper"""
+         if self.debug:
+             print(f"\n{name}:")
+             print(f"Shape: {tensor.shape}")
+             print(f"Device: {tensor.device}")
+             print(f"Dtype: {tensor.dtype}")
+             if tensor.is_floating_point():
+                 with torch.no_grad():
+                     print(f"Range: [{tensor.min().item():.3f}, {tensor.max().item():.3f}]")
+                     print(f"Mean: {tensor.mean().item():.3f}")
+                     print(f"Std: {tensor.std().item():.3f}")
+
+     def _reshape_for_flash(self, x: torch.Tensor) -> torch.Tensor:
+         """Reshape input tensor for flash attention format"""
+         batch_size, seq_len, _ = x.size()
+         x = x.view(batch_size, seq_len, self.num_heads, self.head_dim)
+         x = x.transpose(1, 2)  # [B, H, S, D]
+         return x.contiguous()
+
+     def forward(self, query: torch.Tensor, key: Optional[torch.Tensor] = None,
+                 value: Optional[torch.Tensor] = None,
+                 mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+         """Forward pass with flash attention"""
+         if self.debug:
+             print("\nFlashAttention Forward Pass")
+
+         batch_size = query.size(0)
+
+         # Use query as key/value if not provided
+         key = query if key is None else key
+         value = query if value is None else value
+
+         # Project inputs
+         q = self.q_proj(query)
+         k = self.k_proj(key)
+         v = self.v_proj(value)
+
+         if self.debug:
+             self._debug_print("Query before reshape", q)
+
+         # Reshape for attention [B, H, S, D]
+         q = self._reshape_for_flash(q)
+         k = self._reshape_for_flash(k)
+         v = self._reshape_for_flash(v)
+
+         if self.debug:
+             self._debug_print("Query after reshape", q)
+
+         # Handle masking
+         if mask is not None:
+             # First convert mask to proper shape based on input dimensionality
+             if mask.dim() == 2:  # [B, S]
+                 mask = mask.view(batch_size, 1, -1, 1)
+             elif mask.dim() == 3:  # [B, S, S]
+                 mask = mask.view(batch_size, 1, mask.size(1), mask.size(2))
+             elif mask.dim() == 5:  # [B, 1, S, S, S]
+                 mask = mask.squeeze(1).view(batch_size, 1, mask.size(2), mask.size(3))
+
+             # Ensure mask is float16 if we're using float16
+             mask = mask.to(q.dtype)
+
+             if self.debug:
+                 self._debug_print("Prepared mask", mask)
+                 print(f"q shape: {q.shape}, mask shape: {mask.shape}")
+
+             # Create attention mask that covers the full sequence length
+             seq_len = q.size(2)
+             if mask.size(-1) != seq_len:
+                 # Pad or trim mask to match sequence length
+                 new_mask = torch.zeros(batch_size, 1, seq_len, seq_len,
+                                        device=mask.device, dtype=mask.dtype)
+                 min_len = min(seq_len, mask.size(-1))
+                 new_mask[..., :min_len, :min_len] = mask[..., :min_len, :min_len]
+                 mask = new_mask
+
+             # Create key padding mask
+             key_padding_mask = mask.squeeze(1).sum(-1) > 0
+             key_padding_mask = key_padding_mask.view(batch_size, 1, -1, 1)
+
+             # Apply the key padding mask
+             k = k * key_padding_mask
+             v = v * key_padding_mask
+
+         if self.debug:
+             self._debug_print("Query before attention", q)
+             self._debug_print("Key before attention", k)
+             self._debug_print("Value before attention", v)
+
+         # Run flash attention
+         dropout_p = self.dropout if self.training else 0.0
+         output = flash_attn_func(
+             q, k, v,
+             dropout_p=dropout_p,
+             softmax_scale=self.scale,
+             causal=False
+         )
+
+         if self.debug:
+             self._debug_print("Output after attention", output)
+
+         # Reshape output [B, H, S, D] -> [B, S, H, D] -> [B, S, D]
+         output = output.transpose(1, 2).contiguous()
+         output = output.view(batch_size, -1, self.dim)
+
+         # Final projection
+         output = self.out_proj(output)
+
+         if self.debug:
+             self._debug_print("Final output", output)
+
+         return output
+
+ class OptimizedTagEmbedding(nn.Module):
+     def __init__(self, num_tags, embedding_dim, num_heads=8, dropout=0.1):
+         super().__init__()
+         # Single shared embedding for all tags
+         self.embedding = nn.Embedding(num_tags, embedding_dim)
+         self.attention = FlashAttention(embedding_dim, num_heads, dropout)
+         self.norm1 = nn.LayerNorm(embedding_dim)
+         self.norm2 = nn.LayerNorm(embedding_dim)
+
+         # Single importance weighting for all tags
+         self.tag_importance = nn.Parameter(torch.ones(num_tags) * 0.1)
+
+         # Projection layers for unified tag context
+         self.context_proj = nn.Sequential(
+             nn.Linear(embedding_dim, embedding_dim * 2),
+             nn.LayerNorm(embedding_dim * 2),
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(embedding_dim * 2, embedding_dim),
+             nn.LayerNorm(embedding_dim)
+         )
+
+         self.importance_scale = nn.Parameter(torch.tensor(0.1))
+         self.context_scale = nn.Parameter(torch.tensor(1.0))
+         self.debug = False
+
+     def _debug_print(self, name, tensor, extra_info=None):
+         """Memory efficient debug printing with type handling"""
+         if self.debug:
+             print(f"\n{name}:")
+             print(f"- Shape: {tensor.shape}")
+             if isinstance(tensor, torch.Tensor):
+                 with torch.no_grad():
+                     print(f"- Device: {tensor.device}")
+                     print(f"- Dtype: {tensor.dtype}")
+
+                     # Convert to float32 for statistics if needed
+                     if tensor.dtype not in [torch.float16, torch.float32, torch.float64]:
+                         calc_tensor = tensor.float()
+                     else:
+                         calc_tensor = tensor
+
+                     try:
+                         min_val = calc_tensor.min().item()
+                         max_val = calc_tensor.max().item()
+                         mean_val = calc_tensor.mean().item()
+                         std_val = calc_tensor.std().item()
+                         norm_val = torch.norm(calc_tensor).item()
+
+                         print(f"- Value range: [{min_val:.3f}, {max_val:.3f}]")
+                         print(f"- Mean: {mean_val:.3f}")
+                         print(f"- Std: {std_val:.3f}")
+                         print(f"- L2 Norm: {norm_val:.3f}")
+
+                         if extra_info:
+                             print(f"- Additional info: {extra_info}")
+                     except Exception as e:
+                         print(f"- Could not compute statistics: {str(e)}")
+
+     def _debug_tensor(self, name, tensor):
+         """Debug helper with dtype-specific analysis"""
+         if self.debug and isinstance(tensor, torch.Tensor):
+             print(f"\n{name}:")
+             print(f"- Shape: {tensor.shape}")
+             print(f"- Device: {tensor.device}")
+             print(f"- Dtype: {tensor.dtype}")
+             with torch.no_grad():
+                 has_nan = torch.isnan(tensor).any().item() if tensor.is_floating_point() else False
+                 has_inf = torch.isinf(tensor).any().item() if tensor.is_floating_point() else False
+                 print(f"- Contains NaN: {has_nan}")
+                 print(f"- Contains Inf: {has_inf}")
+
+                 # Different stats for different dtypes
+                 if tensor.is_floating_point():
+                     print(f"- Range: [{tensor.min().item():.3f}, {tensor.max().item():.3f}]")
+                     print(f"- Mean: {tensor.mean().item():.3f}")
+                     print(f"- Std: {tensor.std().item():.3f}")
+                 else:
+                     # For integer tensors
+                     print(f"- Range: [{tensor.min().item()}, {tensor.max().item()}]")
+                     print(f"- Unique values: {tensor.unique().numel()}")
+
+     def _process_category(self, indices, masks):
+         """Process a single category of tags"""
+         # Get embeddings for this category
+         embeddings = self.embedding(indices)
+
+         if self.debug:
+             self._debug_tensor("Category embeddings", embeddings)
+
+         # Apply importance weights
+         importance = torch.sigmoid(self.tag_importance) * self.importance_scale
+         importance = torch.clamp(importance, min=0.01, max=10.0)
+         importance_weights = importance[indices].unsqueeze(-1)
+
+         # Apply and normalize
+         embeddings = embeddings * importance_weights
+         embeddings = self.norm1(embeddings)
+
+         # Apply attention if we have more than one tag
+         if embeddings.size(1) > 1:
+             if masks is not None:
+                 attention_mask = torch.einsum('bi,bj->bij', masks, masks)
+                 attended = self.attention(embeddings, mask=attention_mask)
+             else:
+                 attended = self.attention(embeddings)
+             embeddings = self.norm2(attended)
+
+         # Pool embeddings with masking
+         if masks is not None:
+             masked_embeddings = embeddings * masks.unsqueeze(-1)
+             pooled = masked_embeddings.sum(dim=1) / masks.sum(dim=1, keepdim=True).clamp(min=1.0)
+         else:
+             pooled = embeddings.mean(dim=1)
+
+         return pooled, embeddings
+
+     def forward(self, tag_indices_dict, tag_masks_dict=None):
+         """
+         Process all tags in a unified embedding space
+         Args:
+             tag_indices_dict: dict of {category: tensor of indices}
+             tag_masks_dict: dict of {category: tensor of masks}
+         """
+         if self.debug:
+             print("\nOptimizedTagEmbedding Forward Pass")
+
+         # Concatenate all indices and masks
+         all_indices = []
+         all_masks = []
+         batch_size = None
+
+         for category, indices in tag_indices_dict.items():
+             if batch_size is None:
+                 batch_size = indices.size(0)
+             all_indices.append(indices)
+             if tag_masks_dict:
+                 all_masks.append(tag_masks_dict[category])
+
+         # Stack along sequence dimension
+         combined_indices = torch.cat(all_indices, dim=1)  # [B, total_seq_len]
+         if tag_masks_dict:
+             combined_masks = torch.cat(all_masks, dim=1)  # [B, total_seq_len]
+
+         if self.debug:
+             self._debug_tensor("Combined indices", combined_indices)
+             if tag_masks_dict:
+                 self._debug_tensor("Combined masks", combined_masks)
+
+         # Get embeddings for all tags using shared embedding
+         embeddings = self.embedding(combined_indices)  # [B, total_seq_len, D]
+
+         if self.debug:
+             self._debug_tensor("Base embeddings", embeddings)
+
+         # Apply unified importance weighting
+         importance = torch.sigmoid(self.tag_importance) * self.importance_scale
+         importance = torch.clamp(importance, min=0.01, max=10.0)
+         importance_weights = importance[combined_indices].unsqueeze(-1)
+
+         # Apply and normalize importance weights
+         embeddings = embeddings * importance_weights
+         embeddings = self.norm1(embeddings)
+
+         if self.debug:
+             self._debug_tensor("Weighted embeddings", embeddings)
+
+         # Apply attention across all tags together
+         if tag_masks_dict:
+             attention_mask = torch.einsum('bi,bj->bij', combined_masks, combined_masks)
+             attended = self.attention(embeddings, mask=attention_mask)
+         else:
+             attended = self.attention(embeddings)
+
+         attended = self.norm2(attended)
+
+         if self.debug:
+             self._debug_tensor("Attended embeddings", attended)
+
+         # Global pooling with masking
+         if tag_masks_dict:
+             masked_embeddings = attended * combined_masks.unsqueeze(-1)
+             tag_context = masked_embeddings.sum(dim=1) / combined_masks.sum(dim=1, keepdim=True).clamp(min=1.0)
+         else:
+             tag_context = attended.mean(dim=1)
+
+         # Project and scale context
+         tag_context = self.context_proj(tag_context)
+         context_scale = torch.clamp(self.context_scale, min=0.1, max=10.0)
+         tag_context = tag_context * context_scale
+
+         if self.debug:
+             self._debug_tensor("Final tag context", tag_context)
+
+         return tag_context, attended
+
+ class TagDataset:
+     """Lightweight dataset wrapper for inference only"""
+     def __init__(self, total_tags, idx_to_tag, tag_to_category):
+         self.total_tags = total_tags
+         # Normalize mapping keys to int: metadata.json stores indices as strings,
+         # but get_tag_info() is called with integer indices.
+         self.idx_to_tag = {int(k): v for k, v in idx_to_tag.items()}
+         self.tag_to_category = tag_to_category
+
+     def get_tag_info(self, idx):
+         """Get tag name and category for a given index"""
+         tag_name = self.idx_to_tag.get(idx, f"unknown-{idx}")
+         category = self.tag_to_category.get(tag_name, "general")
+         return tag_name, category
+
+ class ImageTagger(nn.Module):
+     def __init__(self, total_tags, dataset, model_name='efficientnet_v2_l',
+                  num_heads=16, dropout=0.1, pretrained=True,
+                  tag_context_size=256):
+         super().__init__()
+         # Debug and stats flags
+         self._flags = {
+             'debug': False,
+             'model_stats': False
+         }
+
+         # Core model config
+         self.dataset = dataset
+         self.tag_context_size = tag_context_size
+         self.embedding_dim = 1280  # Fixed to EfficientNetV2-L output dimension
+
+         # Initialize backbone
+         if model_name == 'efficientnet_v2_l':
+             weights = EfficientNet_V2_L_Weights.DEFAULT if pretrained else None
+             self.backbone = efficientnet_v2_l(weights=weights)
+             self.backbone.classifier = nn.Identity()
+
+         # Spatial pooling only - no projection
+         self.spatial_pool = nn.AdaptiveAvgPool2d((1, 1))
+
+         # Initial tag prediction with bottleneck
+         self.initial_classifier = nn.Sequential(
+             nn.Linear(self.embedding_dim, self.embedding_dim * 2),
+             nn.LayerNorm(self.embedding_dim * 2),
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(self.embedding_dim * 2, self.embedding_dim),
+             nn.LayerNorm(self.embedding_dim),
+             nn.GELU(),
+             nn.Linear(self.embedding_dim, total_tags)
+         )
+
+         # Tag embeddings at full dimension
+         self.tag_embedding = nn.Embedding(total_tags, self.embedding_dim)
+         self.tag_attention = FlashAttention(self.embedding_dim, num_heads, dropout)
+         self.tag_norm = nn.LayerNorm(self.embedding_dim)
+
+         # Improved cross attention projection
+         self.cross_proj = nn.Sequential(
+             nn.Linear(self.embedding_dim, self.embedding_dim * 2),
+             nn.LayerNorm(self.embedding_dim * 2),
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(self.embedding_dim * 2, self.embedding_dim)
+         )
+
+         # Cross attention at full dimension
+         self.cross_attention = FlashAttention(self.embedding_dim, num_heads, dropout)
+         self.cross_norm = nn.LayerNorm(self.embedding_dim)
+
+         # Refined classifier with improved bottleneck
+         self.refined_classifier = nn.Sequential(
+             nn.Linear(self.embedding_dim * 2, self.embedding_dim * 2),  # Doubled input size for residual
+             nn.LayerNorm(self.embedding_dim * 2),
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(self.embedding_dim * 2, self.embedding_dim),
+             nn.LayerNorm(self.embedding_dim),
+             nn.GELU(),
+             nn.Linear(self.embedding_dim, total_tags)
+         )
+
+         # Temperature scaling
+         self.temperature = nn.Parameter(torch.ones(1) * 1.5)
+
+     def _get_selected_tags(self, logits):
+         """Select top-K tags based on prediction confidence"""
+         # Apply sigmoid to get probabilities
+         probs = torch.sigmoid(logits)
+
+         # Get top-K predictions for each image in batch
+         batch_size = logits.size(0)
+         topk_values, topk_indices = torch.topk(
+             probs, k=self.tag_context_size, dim=1, largest=True, sorted=True
+         )
+
+         return topk_indices, topk_values
+
+     @property
+     def debug(self):
+         return self._flags['debug']
+
+     @debug.setter
+     def debug(self, value):
+         self._flags['debug'] = value
+
+     @property
+     def model_stats(self):
+         return self._flags['model_stats']
+
+     @model_stats.setter
+     def model_stats(self, value):
+         self._flags['model_stats'] = value
+
+     def preprocess_image(self, image_path, image_size=512):
+         """Process an image for inference using same preprocessing as training"""
+         if not os.path.exists(image_path):
+             raise ValueError(f"Image not found at path: {image_path}")
+
+         # Initialize the same transform used during training
+         transform = transforms.Compose([
+             transforms.ToTensor(),
+         ])
+
+         try:
+             with Image.open(image_path) as img:
+                 # Convert RGBA or Palette images to RGB
+                 if img.mode in ('RGBA', 'P'):
+                     img = img.convert('RGB')
+
+                 # Get original dimensions
+                 width, height = img.size
+                 aspect_ratio = width / height
+
+                 # Calculate new dimensions to maintain aspect ratio
+                 if aspect_ratio > 1:
+                     new_width = image_size
+                     new_height = int(new_width / aspect_ratio)
+                 else:
+                     new_height = image_size
+                     new_width = int(new_height * aspect_ratio)
+
+                 # Resize with LANCZOS filter
+                 img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+                 # Create new image with padding
+                 new_image = Image.new('RGB', (image_size, image_size), (0, 0, 0))
+                 paste_x = (image_size - new_width) // 2
+                 paste_y = (image_size - new_height) // 2
+                 new_image.paste(img, (paste_x, paste_y))
+
+                 # Apply transforms (without normalization)
+                 img_tensor = transform(new_image)
+                 return img_tensor
+         except Exception as e:
+             raise Exception(f"Error processing {image_path}: {str(e)}")
+
+     def forward(self, x):
+         """Forward pass with simplified feature handling"""
+         # Initialize tracking dicts
+         model_stats = {} if self.model_stats else {}
+         debug_tensors = {} if self.debug else None
+
+         # 1. Image Feature Extraction
+         features = self.backbone.features(x)
+         features = self.spatial_pool(features).squeeze(-1).squeeze(-1)
+
+         # 2. Initial Tag Predictions
+         initial_logits = self.initial_classifier(features)
+         initial_preds = torch.clamp(initial_logits / self.temperature, min=-15.0, max=15.0)
+
+         # 3. Tag Selection & Embedding (simplified)
+         pred_tag_indices, _ = self._get_selected_tags(initial_preds)
+         tag_embeddings = self.tag_embedding(pred_tag_indices)
+
+         # 4. Self-Attention on Tags
+         attended_tags = self.tag_attention(tag_embeddings)
+         attended_tags = self.tag_norm(attended_tags)
+
+         # 5. Cross-Attention between Features and Tags
+         features_proj = self.cross_proj(features)
+         features_expanded = features_proj.unsqueeze(1).expand(-1, self.tag_context_size, -1)
+
+         cross_attended = self.cross_attention(features_expanded, attended_tags)
+         cross_attended = self.cross_norm(cross_attended)
+
+         # 6. Feature Fusion with Residual Connection
+         fused_features = cross_attended.mean(dim=1)  # Average across tag dimension
+         # Concatenate original and attended features
+         combined_features = torch.cat([features, fused_features], dim=-1)
+
+         # 7. Refined Predictions
+         refined_logits = self.refined_classifier(combined_features)
+         refined_preds = torch.clamp(refined_logits / self.temperature, min=-15.0, max=15.0)
+
+         # Return both prediction sets
+         return initial_preds, refined_preds
+
+     def predict(self, image_path, threshold=0.325, category_thresholds=None):
+         """
+         Run inference on an image with support for category-specific thresholds.
+         """
+         # Preprocess the image
+         img_tensor = self.preprocess_image(image_path).unsqueeze(0)
+
+         # Move to the same device as model and convert to half precision
+         device = next(self.parameters()).device
+         dtype = next(self.parameters()).dtype  # Match model's precision
+         img_tensor = img_tensor.to(device, dtype=dtype)
+
+         # Run inference
+         with torch.no_grad():
+             initial_preds, refined_preds = self.forward(img_tensor)
+
+         # Apply sigmoid to get probabilities
+         initial_probs = torch.sigmoid(initial_preds)
+         refined_probs = torch.sigmoid(refined_preds)
+
+         # Apply thresholds
+         if category_thresholds:
+             # Create binary prediction tensors
+             refined_binary = torch.zeros_like(refined_probs)
+
+             # Apply thresholds by category
+             for category, cat_threshold in category_thresholds.items():
+                 # Create a mask for tags in this category
+                 category_mask = torch.zeros_like(refined_probs, dtype=torch.bool)
+
+                 # Find indices for this category
+                 for tag_idx in range(refined_probs.size(-1)):
+                     try:
+                         _, tag_category = self.dataset.get_tag_info(tag_idx)
+                         if tag_category == category:
+                             category_mask[:, tag_idx] = True
+                     except:
+                         continue
+
+                 # Apply threshold only to tags in this category - ensure dtype consistency
+                 cat_threshold_tensor = torch.tensor(cat_threshold, device=device, dtype=dtype)
+                 refined_binary[category_mask] = (refined_probs[category_mask] >= cat_threshold_tensor).to(dtype)
+
+             predictions = refined_binary
+         else:
+             # Use the same threshold for all tags
+             threshold_tensor = torch.tensor(threshold, device=device, dtype=dtype)
+             predictions = (refined_probs >= threshold_tensor).to(dtype)
+
+         # Return both probabilities and thresholded predictions
+         return {
+             'initial_probabilities': initial_probs,
+             'refined_probabilities': refined_probs,
+             'predictions': predictions
+         }
+
+     def get_tags_from_predictions(self, predictions, include_probabilities=True):
+         """
+         Convert model predictions to human-readable tags grouped by category.
+         """
+         # Get non-zero predictions
+         if predictions.dim() > 1:
+             predictions = predictions[0]  # Remove batch dimension
+
+         # Get indices of positive predictions
+         indices = torch.where(predictions > 0)[0].cpu().tolist()
+
+         # Group by category
+         result = {}
+         for idx in indices:
+             tag_name, category = self.dataset.get_tag_info(idx)
+
+             if category not in result:
+                 result[category] = []
+
+             if include_probabilities:
+                 prob = predictions[idx].item()
+                 result[category].append((tag_name, prob))
+             else:
+                 result[category].append(tag_name)
+
+         # Sort tags by probability within each category
+         if include_probabilities:
+             for category in result:
+                 result[category] = sorted(result[category], key=lambda x: x[1], reverse=True)
+
+         return result
+
+ def load_model(model_dir, device='cuda'):
+     """Load model with better error handling and warnings"""
+     print(f"Loading model from {model_dir}")
+
+     try:
+         # Load metadata
+         metadata_path = os.path.join(model_dir, "metadata.json")
+         if not os.path.exists(metadata_path):
+             raise FileNotFoundError(f"Metadata file not found at {metadata_path}")
+
+         with open(metadata_path, 'r') as f:
+             metadata = json.load(f)
+
+         # Load model info
+         model_info_path = os.path.join(model_dir, "model_info_initial_only.json")
+         if os.path.exists(model_info_path):
+             with open(model_info_path, 'r') as f:
+                 model_info = json.load(f)
+         else:
+             print("WARNING: Model info file not found, using default settings")
+             model_info = {
+                 "tag_context_size": 256,
+                 "num_heads": 16,
+                 "precision": "float16"
+             }
+
+         # Create dataset wrapper
+         dataset = TagDataset(
+             total_tags=metadata['total_tags'],
+             idx_to_tag=metadata['idx_to_tag'],
+             tag_to_category=metadata['tag_to_category']
+         )
+
+         # Initialize model with exact settings from model_info
+         model = ImageTagger(
+             total_tags=metadata['total_tags'],
+             dataset=dataset,
+             num_heads=model_info.get('num_heads', 16),
+             tag_context_size=model_info.get('tag_context_size', 256),
+             pretrained=False
+         )
+
+         # Load weights
+         state_dict_path = os.path.join(model_dir, "model.pt")
+         if not os.path.exists(state_dict_path):
+             raise FileNotFoundError(f"Model state dict not found at {state_dict_path}")
+
+         state_dict = torch.load(state_dict_path, map_location=device)
+
+         # First try strict loading
+         try:
+             model.load_state_dict(state_dict, strict=True)
+             print("✓ Model state dict loaded with strict=True successfully")
+         except Exception as e:
+             print(f"! Strict loading failed: {str(e)}")
+             print("Attempting non-strict loading...")
+
+             # Try non-strict loading
+             missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
+
+             print(f"Non-strict loading completed with:")
+             print(f"- {len(missing_keys)} missing keys")
+             print(f"- {len(unexpected_keys)} unexpected keys")
+
+             if len(missing_keys) > 0:
+                 print(f"Sample missing keys: {missing_keys[:5]}")
+             if len(unexpected_keys) > 0:
+                 print(f"Sample unexpected keys: {unexpected_keys[:5]}")
+
+         # Move model to device
+         model = model.to(device)
+
+         # Set to half precision if needed
+         if model_info.get('precision') == 'float16':
+             model = model.half()
+             print("✓ Model converted to half precision")
+
+         # Set to eval mode
+         model.eval()
+         print("✓ Model set to evaluation mode")
+
+         # Verify parameter dtype
+         param_dtype = next(model.parameters()).dtype
+         print(f"✓ Model loaded with precision: {param_dtype}")
+
+         return model, dataset
+
+     except Exception as e:
+         print(f"ERROR loading model: {str(e)}")
+         import traceback
+         traceback.print_exc()
+         raise
+
920
+ if __name__ == "__main__":
921
+ import sys
922
+
923
+ # Get model directory from command line or use default
924
+ model_dir = sys.argv[1] if len(sys.argv) > 1 else "./exported_model"
925
+
926
+ # Load model
927
+ model, dataset, thresholds = load_model(model_dir)
928
+
929
+ # Display info
930
+ print(f"\nModel information:")
931
+ print(f" Total tags: {dataset.total_tags}")
932
+ print(f" Device: {next(model.parameters()).device}")
933
+ print(f" Precision: {next(model.parameters()).dtype}")
934
+
935
+ # Test on an image if provided
936
+ if len(sys.argv) > 2:
937
+ image_path = sys.argv[2]
938
+ print(f"\nRunning inference on {image_path}")
939
+
940
+ # Use category thresholds if available
941
+ if thresholds and 'categories' in thresholds:
942
+ category_thresholds = {cat: opt['balanced']['threshold']
943
+ for cat, opt in thresholds['categories'].items()}
944
+ results = model.predict(image_path, category_thresholds=category_thresholds)
945
+ else:
946
+ results = model.predict(image_path)
947
+
948
+ # Get tags
949
+ tags = model.get_tags_from_predictions(results['predictions'])
950
+
951
+ # Print tags by category
952
+ print("\nPredicted tags:")
953
+ for category, category_tags in tags.items():
954
+ print(f"\n{category.capitalize()}:")
955
+ for tag, prob in category_tags:
956
+ print(f" {tag}: {prob:.3f}")
model_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "class_name": "ImageTagger",
+     "args": {
+         "total_tags": 70527,
+         "num_heads": 16,
+         "dropout": 0.1,
+         "tag_context_size": 256
+     }
+ }
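A sketch of consuming this config (assumes the ImageTagger class from model_code.py and a `dataset` wrapper built from metadata.json, as in infer.py; confirm against model_code.py that ImageTagger accepts every key in "args", including "dropout"):

    import json
    from model_code import ImageTagger

    with open("model_config.json", "r") as f:
        cfg = json.load(f)
    assert cfg["class_name"] == "ImageTagger"
    model = ImageTagger(dataset=dataset, pretrained=False, **cfg["args"])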
model_info_initial_only.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "precision": "float16",
+     "tag_context_size": 256,
+     "num_heads": 16,
+     "architecture": "ImageTagger",
+     "embedding_dim": 1280,
+     "backbone": "efficientnet_v2_l",
+     "model_type": "initial_only"
+ }
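This file records how the checkpoint was exported; a short sketch of honoring it at load time (mirrors what load_model() in infer.py does; assumes `model` was already constructed and its weights loaded):

    import json

    with open("model_info_initial_only.json") as f:
        info = json.load(f)
    if info.get("precision") == "float16":
        model = model.half()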
model_no_flash.py ADDED
@@ -0,0 +1,195 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torchvision.models import efficientnet_v2_l, EfficientNet_V2_L_Weights
+
+ class MultiheadAttentionNoFlash(nn.Module):
+     """Custom multi-head attention module (replaces FlashAttention) using ONNX-friendly ops."""
+     def __init__(self, dim, num_heads=8, dropout=0.0):
+         super().__init__()
+         assert dim % num_heads == 0, "Embedding dim must be divisible by num_heads"
+         self.dim = dim
+         self.num_heads = num_heads
+         self.head_dim = dim // num_heads
+         self.scale = self.head_dim ** -0.5  # scaling factor for dot-product attention
+
+         # Separate projections for query, key, value, and output (no biases, to match FlashAttention)
+         self.q_proj = nn.Linear(dim, dim, bias=False)
+         self.k_proj = nn.Linear(dim, dim, bias=False)
+         self.v_proj = nn.Linear(dim, dim, bias=False)
+         self.out_proj = nn.Linear(dim, dim, bias=False)
+         # (Note: dropout is omitted in the attention computation for ONNX simplicity;
+         # the model should be in eval mode for export anyway.)
+
+     def forward(self, query, key=None, value=None):
+         # Allow usage as self-attention if key/value are not provided
+         if key is None:
+             key = query
+         if value is None:
+             value = key
+
+         # Linear projections
+         Q = self.q_proj(query)  # [B, S_q, dim]
+         K = self.k_proj(key)    # [B, S_k, dim]
+         V = self.v_proj(value)  # [B, S_v, dim]
+
+         # Reshape into (B, num_heads, S, head_dim) to compute attention per head
+         B, S_q, _ = Q.shape
+         _, S_k, _ = K.shape
+         Q = Q.view(B, S_q, self.num_heads, self.head_dim).transpose(1, 2)  # [B, heads, S_q, head_dim]
+         K = K.view(B, S_k, self.num_heads, self.head_dim).transpose(1, 2)  # [B, heads, S_k, head_dim]
+         V = V.view(B, S_k, self.num_heads, self.head_dim).transpose(1, 2)  # [B, heads, S_k, head_dim]
+
+         # Scaled dot-product attention: compute attention weights
+         attn_weights = torch.matmul(Q, K.transpose(2, 3))  # [B, heads, S_q, S_k]
+         attn_weights = attn_weights * self.scale
+         attn_probs = F.softmax(attn_weights, dim=-1)  # softmax over S_k (key length)
+
+         # Apply attention weights to values
+         attn_output = torch.matmul(attn_probs, V)  # [B, heads, S_q, head_dim]
+
+         # Reshape back to [B, S_q, dim]
+         attn_output = attn_output.transpose(1, 2).contiguous().view(B, S_q, self.dim)
+         # Output projection
+         output = self.out_proj(attn_output)  # [B, S_q, dim]
+         return output
+
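# Optional sanity check (a sketch, not in the uploaded file): verifies that the
# attention port above matches torch.nn.functional.scaled_dot_product_attention
# (PyTorch >= 2.0; SDPA's default scale is head_dim**-0.5, same as self.scale).
# Call _check_attention_equivalence() manually if you want to verify the port.
def _check_attention_equivalence(dim=64, num_heads=8, B=2, S=5):
    torch.manual_seed(0)
    attn = MultiheadAttentionNoFlash(dim, num_heads=num_heads).eval()
    x = torch.randn(B, S, dim)
    with torch.no_grad():
        out = attn(x)
        # Reference path: identical projections, fused SDPA kernel
        head_dim = dim // num_heads
        Q = attn.q_proj(x).view(B, S, num_heads, head_dim).transpose(1, 2)
        K = attn.k_proj(x).view(B, S, num_heads, head_dim).transpose(1, 2)
        V = attn.v_proj(x).view(B, S, num_heads, head_dim).transpose(1, 2)
        ref = F.scaled_dot_product_attention(Q, K, V)
        ref = attn.out_proj(ref.transpose(1, 2).reshape(B, S, dim))
    print("max |out - ref|:", (out - ref).abs().max().item())  # expect ~1e-6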
+ class ImageTaggerRefinedONNX(nn.Module):
+     """
+     Refined CAMIE Image Tagger model without FlashAttention.
+     - EfficientNetV2 backbone
+     - Initial classifier for preliminary tag logits
+     - Multi-head self-attention on top predicted tag embeddings
+     - Multi-head cross-attention between image feature and tag embeddings
+     - Refined classifier for final tag logits
+     """
+     def __init__(self, total_tags, tag_context_size=256, num_heads=16, dropout=0.1):
+         super().__init__()
+         self.tag_context_size = tag_context_size
+         self.embedding_dim = 1280  # EfficientNetV2-L feature dimension
+
+         # Backbone feature extractor (EfficientNetV2-L)
+         backbone = efficientnet_v2_l(weights=EfficientNet_V2_L_Weights.DEFAULT)
+         backbone.classifier = nn.Identity()  # remove final classification head
+         self.backbone = backbone
+
+         # Spatial pooling to get a single feature vector per image (1x1 avg pool)
+         self.spatial_pool = nn.AdaptiveAvgPool2d((1, 1))
+
+         # Initial classifier (two-layer MLP) to predict tags from the image feature
+         self.initial_classifier = nn.Sequential(
+             nn.Linear(self.embedding_dim, self.embedding_dim * 2),
+             nn.LayerNorm(self.embedding_dim * 2),
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(self.embedding_dim * 2, self.embedding_dim),
+             nn.LayerNorm(self.embedding_dim),
+             nn.GELU(),
+             nn.Linear(self.embedding_dim, total_tags)  # outputs raw logits for all tags
+         )
+
+         # Embedding for tags (each tag gets an embedding vector, used for attention)
+         self.tag_embedding = nn.Embedding(total_tags, self.embedding_dim)
+
+         # Self-attention over the selected tag embeddings (replaces FlashAttention)
+         self.tag_attention = MultiheadAttentionNoFlash(self.embedding_dim, num_heads=num_heads, dropout=dropout)
+         self.tag_norm = nn.LayerNorm(self.embedding_dim)
+
+         # Projection from image feature to query vector for cross-attention
+         self.cross_proj = nn.Sequential(
+             nn.Linear(self.embedding_dim, self.embedding_dim * 2),
+             nn.LayerNorm(self.embedding_dim * 2),
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(self.embedding_dim * 2, self.embedding_dim)
+         )
+         # Cross-attention between image feature (as query) and tag features (as key/value)
+         self.cross_attention = MultiheadAttentionNoFlash(self.embedding_dim, num_heads=num_heads, dropout=dropout)
+         self.cross_norm = nn.LayerNorm(self.embedding_dim)
+
+         # Refined classifier (takes concatenated original & attended features)
+         self.refined_classifier = nn.Sequential(
+             nn.Linear(self.embedding_dim * 2, self.embedding_dim * 2),
+             nn.LayerNorm(self.embedding_dim * 2),
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(self.embedding_dim * 2, self.embedding_dim),
+             nn.LayerNorm(self.embedding_dim),
+             nn.GELU(),
+             nn.Linear(self.embedding_dim, total_tags)
+         )
+
+         # Temperature parameter for scaling logits (to calibrate confidence)
+         self.temperature = nn.Parameter(torch.ones(1) * 1.5)
+
+     def forward(self, images):
+         # 1. Feature extraction
+         feats = self.backbone.features(images)                    # [B, 1280, H/32, W/32] features
+         feats = self.spatial_pool(feats).squeeze(-1).squeeze(-1)  # [B, 1280] global feature vector per image
+
+         # 2. Initial tag prediction
+         initial_logits = self.initial_classifier(feats)  # [B, total_tags]
+         # Scale by temperature and clamp (to stabilize extreme values, as in the original)
+         initial_preds = torch.clamp(initial_logits / self.temperature, min=-15.0, max=15.0)
+
+         # 3. Select the top-k predicted tags for context (tag_context_size)
+         probs = torch.sigmoid(initial_preds)  # convert logits to probabilities
+         # Get indices of the top `tag_context_size` tags for each sample
+         _, topk_indices = torch.topk(probs, k=self.tag_context_size, dim=1)
+
+         # 4. Embed the selected tags
+         tag_embeds = self.tag_embedding(topk_indices)  # [B, tag_context_size, embedding_dim]
+
+         # 5. Self-attention on tag embeddings (to refine the tag representation)
+         attn_tags = self.tag_attention(tag_embeds)  # [B, tag_context_size, embedding_dim]
+         attn_tags = self.tag_norm(attn_tags)        # layer norm
+
+         # 6. Cross-attention between image feature and attended tags
+         feat_q = self.cross_proj(feats)  # [B, embedding_dim]
+         # Repeat each image feature vector tag_context_size times to form a sequence
+         feat_q = feat_q.unsqueeze(1).expand(-1, self.tag_context_size, -1)  # [B, tag_context_size, embedding_dim]
+         # Use image features as queries, tag embeddings as keys and values
+         cross_attn = self.cross_attention(feat_q, attn_tags, attn_tags)  # [B, tag_context_size, embedding_dim]
+         cross_attn = self.cross_norm(cross_attn)
+
+         # 7. Fuse features: average the cross-attended tag outputs and combine with the original features
+         fused_feature = cross_attn.mean(dim=1)               # [B, embedding_dim]
+         combined = torch.cat([feats, fused_feature], dim=1)  # [B, embedding_dim*2]
+
+         # 8. Refined tag prediction
+         refined_logits = self.refined_classifier(combined)  # [B, total_tags]
+         refined_preds = torch.clamp(refined_logits / self.temperature, min=-15.0, max=15.0)
+
+         return initial_preds, refined_preds
+
+ # --- Load the pretrained refined model weights ---
+ total_tags = 70527  # total number of tags in the dataset (Danbooru 2024)
+ from safetensors.torch import load_file
+ safetensors_path = 'model_refined.safetensors'
+ state_dict = load_file(safetensors_path, device='cpu')  # load the saved weights (should be an OrderedDict)
+ #state_dict = torch.load("model_refined.pt", map_location="cpu")
+
+ # Initialize our model and load weights
+ model = ImageTaggerRefinedONNX(total_tags=total_tags)
+ model.load_state_dict(state_dict)
+ model.eval()  # set to evaluation mode (disables dropout)
+
+ # (Optional) Cast to float32 if the weights were in half precision
+ # model = model.float()
+
+ # --- Export to ONNX ---
+ dummy_input = torch.randn(1, 3, 512, 512, requires_grad=False)  # dummy batch of 1 image (3x512x512)
+ output_onnx_file = "camie_refined_no_flash_v15.onnx"
+ torch.onnx.export(
+     model, dummy_input, output_onnx_file,
+     export_params=True,        # store trained parameter weights inside the model file
+     opset_version=17,          # ONNX opset version (ensure support for needed ops)
+     do_constant_folding=True,  # optimize constant expressions
+     input_names=["image"],
+     output_names=["initial_tags", "refined_tags"],
+     dynamic_axes={             # make the batch dimension dynamic
+         "image": {0: "batch"},
+         "initial_tags": {0: "batch"},
+         "refined_tags": {0: "batch"}
+     }
+ )
+ print(f"ONNX model exported to {output_onnx_file}")
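A quick parity check after export (a sketch; assumes onnxruntime is installed): compare the PyTorch and ONNX outputs on the same input, using the "image" input name declared in the export call above, as infer-refined.py does at inference time:

    import numpy as np
    import onnxruntime as ort

    session = ort.InferenceSession(output_onnx_file, providers=["CPUExecutionProvider"])
    with torch.no_grad():
        pt_initial, pt_refined = model(dummy_input)
    ox_initial, ox_refined = session.run(None, {"image": dummy_input.numpy()})
    print("max |initial| diff:", np.abs(pt_initial.numpy() - ox_initial).max())
    print("max |refined| diff:", np.abs(pt_refined.numpy() - ox_refined).max())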
thresholds.json ADDED
@@ -0,0 +1,170 @@
+ {
+     "overall": {
+         "balanced": {
+             "threshold": 0.3285714089870453,
+             "f1": 0.6128875755303665,
+             "precision": 0.6348684210526315,
+             "recall": 0.5923778668258164
+         },
+         "high_precision": {
+             "threshold": 0.48367345333099365,
+             "f1": 0.5073781135639239,
+             "precision": 0.8244772683675426,
+             "recall": 0.3664421519311109
+         },
+         "high_recall": {
+             "threshold": 0.20612245798110962,
+             "f1": 0.5140483341286104,
+             "precision": 0.38317013976064945,
+             "recall": 0.7807144684116293
+         }
+     },
+     "weighted": {
+         "f1": {
+             "threshold": 0.31224489212036133,
+             "value": 0.666115043816508
+         }
+     },
+     "categories": {
+         "copyright": {
+             "balanced": {
+                 "threshold": 0.3857142925262451,
+                 "f1": 0.7885196374622356,
+                 "precision": 0.903114186851211,
+                 "recall": 0.6997319034852547
+             },
+             "high_precision": {
+                 "threshold": 0.5,
+                 "f1": 0.7524429967426711,
+                 "precision": 0.9585062240663901,
+                 "recall": 0.6193029490616622
+             },
+             "high_recall": {
+                 "threshold": 0.13265305757522583,
+                 "f1": 0.5149136577708007,
+                 "precision": 0.36403995560488345,
+                 "recall": 0.8793565683646113
+             }
+         },
+         "character": {
+             "balanced": {
+                 "threshold": 0.30408161878585815,
+                 "f1": 0.769028871391076,
+                 "precision": 0.8878787878787879,
+                 "recall": 0.6782407407407407
+             },
+             "high_precision": {
+                 "threshold": 0.47551020979881287,
+                 "f1": 0.7128129602356407,
+                 "precision": 0.979757085020243,
+                 "recall": 0.5601851851851852
+             },
+             "high_recall": {
+                 "threshold": 0.13265305757522583,
+                 "f1": 0.5132616487455197,
+                 "precision": 0.37175493250259606,
+                 "recall": 0.8287037037037037
+             }
+         },
+         "general": {
+             "balanced": {
+                 "threshold": 0.3285714089870453,
+                 "f1": 0.6070014256296532,
+                 "precision": 0.6206003023105161,
+                 "recall": 0.5939857393820399
+             },
+             "high_precision": {
+                 "threshold": 0.47551020979881287,
+                 "f1": 0.5074963046385584,
+                 "precision": 0.7958057395143487,
+                 "recall": 0.3725328097550894
+             },
+             "high_recall": {
+                 "threshold": 0.20612245798110962,
+                 "f1": 0.5094889521485699,
+                 "precision": 0.3790529978316777,
+                 "recall": 0.7767903275808619
+             }
+         },
+         "meta": {
+             "balanced": {
+                 "threshold": 0.31224489212036133,
+                 "f1": 0.5943152454780362,
+                 "precision": 0.5948275862068966,
+                 "recall": 0.5938037865748709
+             },
+             "high_precision": {
+                 "threshold": 0.41020408272743225,
+                 "f1": 0.5087924970691676,
+                 "precision": 0.7977941176470589,
+                 "recall": 0.37349397590361444
+             },
+             "high_recall": {
+                 "threshold": 0.22244898974895477,
+                 "f1": 0.5037433155080214,
+                 "precision": 0.365399534522886,
+                 "recall": 0.810671256454389
+             }
+         },
+         "rating": {
+             "balanced": {
+                 "threshold": 0.34489795565605164,
+                 "f1": 0.7964912280701754,
+                 "precision": 0.7229299363057324,
+                 "recall": 0.88671875
+             },
+             "high_precision": {
+                 "threshold": 0.5,
+                 "f1": 0.6966824644549763,
+                 "precision": 0.8855421686746988,
+                 "recall": 0.57421875
+             },
+             "high_recall": {
+                 "threshold": 0.10000000149011612,
+                 "f1": 0.6538952745849297,
+                 "precision": 0.4857685009487666,
+                 "recall": 1.0
+             }
+         },
+         "artist": {
+             "balanced": {
+                 "threshold": 0.22244898974895477,
+                 "f1": 0.5017921146953405,
+                 "precision": 0.56,
+                 "recall": 0.45454545454545453
+             },
+             "high_precision": {
+                 "threshold": 0.22244898974895477,
+                 "f1": 0.5017921146953405,
+                 "precision": 0.56,
+                 "recall": 0.45454545454545453
+             },
+             "high_recall": {
+                 "threshold": 0.22244898974895477,
+                 "f1": 0.5017921146953405,
+                 "precision": 0.56,
+                 "recall": 0.45454545454545453
+             }
+         },
+         "year": {
+             "balanced": {
+                 "threshold": 0.2877551317214966,
+                 "f1": 0.32867132867132864,
+                 "precision": 0.2974683544303797,
+                 "recall": 0.3671875
+             },
+             "high_precision": {
+                 "threshold": 0,
+                 "f1": 0,
+                 "precision": 0,
+                 "recall": 0
+             },
+             "high_recall": {
+                 "threshold": 0,
+                 "f1": 0,
+                 "precision": 0,
+                 "recall": 0
+             }
+         }
+     }
+ }
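A minimal sketch of applying these thresholds to raw model outputs (assumes a 1-D `refined_logits` array from the ONNX model and the idx_to_tag / tag_to_category mappings from metadata.json; categories missing from this file fall back to the overall threshold):

    import json
    import numpy as np

    with open("thresholds.json") as f:
        th = json.load(f)
    with open("metadata.json") as f:
        meta = json.load(f)

    probs = 1.0 / (1.0 + np.exp(-refined_logits))  # sigmoid over tag logits
    predicted = []
    for idx, p in enumerate(probs):
        tag = meta["idx_to_tag"][str(idx)]
        cat = meta["tag_to_category"].get(tag, "general")
        cutoff = th["categories"].get(cat, th["overall"])["balanced"]["threshold"]
        if p >= cutoff:
            predicted.append((tag, float(p)))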