core-dino src added

Browse files

Files changed (5) hide show

src/backbone.py +79 -0
src/data.py +158 -0
src/loss.py +144 -0
src/train.py +112 -0
src/utils.py +77 -0

src/backbone.py ADDED Viewed

	@@ -0,0 +1,79 @@

+"""
+🦴 core-dino | YOLO Backbone Wrapper for Feature Extraction 🔍
+Wraps a YOLO model to extract intermediate feature maps for DINO-style
+self-supervised training. Optionally applies an MLP projection head.
+Author: Gajesh Ladhar
+🔗 LinkedIn: https://www.linkedin.com/in/gajeshladhar/
+🤗 Hugging Face: https://huggingface.co/gajeshladhar
+"""
+import torch
+import torch.nn as nn
+from ultralytics import YOLO
class YOLOBackBone(nn.Module):
    """
    🧩 Truncated YOLO network used as a spatial feature extractor.

    The first ``stop_at`` layers of the loaded YOLO model are kept and run in
    order, following the routing ("from" indices) declared in the model YAML.
    The output of the last kept layer is returned, passed through
    ``mlp_head`` when ``use_mlp`` is set.

    Args:
        model_path (str): Path to YOLO weights (.pt)
        stop_at (int): Number of leading layers to keep
        use_mlp (bool): Whether ``mlp_head`` is applied to the final feature map
        mlp_dim (int): Width forwarded to ``init_mlp`` (ignored while the head
            is an identity)
    """

    def __init__(self, model_path='yolo11x.pt', stop_at=23, use_mlp=True, mlp_dim=512):
        super().__init__()
        detector = YOLO(model_path).model.train()
        yaml_cfg = detector.yaml
        self.layers = nn.ModuleList(detector.model[:stop_at])
        # One entry per layer; element 0 of each entry is the "from" routing spec.
        self.layer_defs = yaml_cfg["backbone"] + yaml_cfg["head"]
        self.use_mlp = use_mlp
        if self.use_mlp:
            head_in = self._get_out_channels(self.layers[-1])
            self.init_mlp(head_in, mlp_dim)
        # Everything (backbone + head) is trainable.
        self.requires_grad_(True)

    def _get_out_channels(self, layer):
        """Return the channel count of the final feature map (fixed at 768; ``layer`` is not inspected)."""
        return 768

    def init_mlp(self, in_channels, out_channels):
        """Create ``mlp_head``. It is currently a pass-through; both arguments are unused."""
        # A 1x1-conv projection (in -> 2048 -> out -> in, GELU in between) used to
        # live here and is disabled for now.
        self.mlp_head = nn.Identity()

    def apply_mlp(self, x):
        """Run ``x`` through ``mlp_head`` when the head is enabled, else return it unchanged."""
        if not self.use_mlp:
            return x
        return self.mlp_head(x)

    def forward(self, x):
        """
        🚀 Forward pass through the kept YOLO layers and the optional head.

        Args:
            x (Tensor): Input image tensor (B, C, H, W)
        Returns:
            Tensor: Output of the last kept layer (after ``apply_mlp``)
        """
        history = []  # output of every layer so far, indexed by layer position
        for idx, block in enumerate(self.layers):
            sources = self.layer_defs[idx][0]
            if isinstance(sources, int):
                sources = [sources]
            # -1 means "the running tensor"; any other value indexes a stored output.
            gathered = [x if src == -1 else history[src] for src in sources]
            if len(gathered) > 1:
                x = block(gathered)
            else:
                x = block(gathered[0])
            history.append(x)
        return self.apply_mlp(x)

    def count_params(self):
        """Return ``(total, trainable)`` parameter counts."""
        total = 0
        trainable = 0
        for param in self.parameters():
            count = param.numel()
            total += count
            if param.requires_grad:
                trainable += count
        return total, trainable

src/data.py ADDED Viewed

	@@ -0,0 +1,158 @@

+"""📦 core-dino | Data Loader for Self-Supervised DINO Training on Core-Five 🚀
+This module defines the `DinoDataset` which streams multi-resolution
+satellite patches from the Core-Five dataset, preparing teacher-student
+views for resolution-agnostic self-supervised learning.
+"""
+import os
+import io
+import time
+import torch
+import random
+import requests
+import numpy as np
+import geopandas as gpd
+import h5py
+import xarray as xr
+from torch import nn
+from torch.utils.data import Dataset
+import albumentations as A
+import fsspec
+from utils import (
+    shared_store, process_pool, write_last_updated,
+    AddPoissonNoise, AddSaltPepperNoise
+)
class DinoDataset(Dataset):
    """
    🧠 DinoDataset — resolution-agnostic loader for Core-Five 🌍

    Streams random crops of HR satellite images from Hugging Face and
    creates clean (teacher) and augmented, down-scaled (student) views using
    Albumentations & torch. Batches are produced by background workers and
    pushed into a shared store rather than served through ``__getitem__``.
     ---
    👤 Author: Gajesh Ladhar
    🔗 LinkedIn: https://www.linkedin.com/in/gajeshladhar/
    🤗 Hugging Face: https://huggingface.co/gajeshladhar
    """

    # Root of the Core-Five dataset repository (always '/'-terminated).
    BASE_URL = "https://huggingface.co/datasets/gajeshladhar/core-five/resolve/main/"
    # Smallest student resolution and the grid on which student sizes are drawn.
    MIN_STUDENT_SIZE = 32 * 10
    SIZE_STEP = 32
    # Seconds before an HTTP download is abandoned instead of blocking a worker forever.
    REQUEST_TIMEOUT = 60

    def __init__(self, imgsz, batch_size=1, queue_size=50):
        """
        📐 Init the dataset with remote Core-Five metadata and start
        async patch fetching.

        Args:
            imgsz (int): Patch size (min 320)
            batch_size (int): Number of patches per batch
            queue_size (int): Max queue length for shared store
        Raises:
            ValueError: If ``imgsz`` is below 320.
        """
        if imgsz < 320:
            raise ValueError("❗️imgsz must be ≥ 320 for stable patch extraction — got {}".format(imgsz))
        self.imgsz = imgsz
        metadata_url = self.BASE_URL + "metadata.parquet"
        # Context manager so the remote file handle is closed once the table is read.
        with fsspec.open(metadata_url) as handle:
            self.df_metadata = gpd.read_parquet(handle)
        self.batch_size = batch_size
        self.queue_size = queue_size
        self.store = shared_store
        for _ in range(6):
            process_pool.submit(self.fetch_and_store)

    @staticmethod
    def _choose_student_size(full_size, min_size=None, step=None):
        """Pick a random student side length in ``[min_size, full_size)`` on a ``step`` grid.

        Falls back to ``full_size`` when no smaller candidate exists (e.g. a
        320 px crop), instead of failing on an empty choice.
        """
        min_size = DinoDataset.MIN_STUDENT_SIZE if min_size is None else min_size
        step = DinoDataset.SIZE_STEP if step is None else step
        candidates = np.arange(min_size, full_size, step)
        if candidates.size == 0:
            return int(full_size)
        return int(np.random.choice(candidates))

    @staticmethod
    def _random_center(length, half):
        """Return a random crop centre such that ``[c - half, c + half)`` fits in ``length``.

        Raises:
            ValueError: If the axis is shorter than the crop.
        """
        if length < 2 * half:
            raise ValueError(
                "image side {} is smaller than the requested crop {}".format(length, 2 * half)
            )
        # randint's upper bound is exclusive; +1 keeps the last valid centre reachable
        # and makes an exactly crop-sized image valid.
        return int(np.random.randint(half, length - half + 1))

    @staticmethod
    def transform(batch):
        """
        🎛️ Apply augmentation pipeline to simulate degraded inputs
        for student; teacher gets clean view. All student views of one batch
        share the same randomly chosen resolution.

        Args:
            batch (list): Arrays laid out as (C, H, W); the last axis is used
                as the full crop size.
        Returns:
            Dict with 'student' and 'teacher' uint8 tensors
        """
        augment_satellite = A.Compose([
            A.GaussNoise(std_range=(0.01, 0.1), p=0.3),
            AddPoissonNoise(p=0.3),
            AddSaltPepperNoise(amount=0.02, p=0.3),
            A.MultiplicativeNoise(multiplier=(0.9, 1.1), elementwise=True, p=0.3),
            A.MotionBlur(blur_limit=(3, 11), p=0.3),
            A.GaussianBlur(blur_limit=(3, 11), p=0.3),
            A.ISONoise(color_shift=(0.01, 0.05), intensity=(0.1, 0.3), p=0.1),
            A.RandomBrightnessContrast(brightness_limit=0.5, contrast_limit=0.5, p=0.3),
            A.RGBShift(r_shift_limit=30, g_shift_limit=30, b_shift_limit=30, p=0.3),
            A.HueSaturationValue(hue_shift_limit=30, sat_shift_limit=30, val_shift_limit=30, p=0.3),
            A.CLAHE(clip_limit=2.0, tile_grid_size=(8, 8), p=0.2),
            A.CoarseDropout(num_holes_range=(1, 4), hole_height_range=(0.05, 0.2),
                            hole_width_range=(0.05, 0.2), fill='random_uniform', p=0.1)
        ])
        full_size = batch[0].shape[-1]
        size = DinoDataset._choose_student_size(full_size)
        resize = nn.Upsample(size=size, mode='bilinear')
        student, teacher = [], []
        for img in batch:
            # Interpolate in float: bilinear resizing of uint8 tensors is not
            # available on every PyTorch build. Round back to uint8 afterwards.
            resized = resize(torch.tensor(img[np.newaxis, :]).float())[0]
            student_data = resized.round().clamp(0, 255).to(torch.uint8).numpy()
            student_data = augment_satellite(image=student_data.transpose(1, 2, 0))['image'].transpose(2, 0, 1)
            student.append(torch.tensor(student_data))
            teacher.append(torch.tensor(img))
        return {
            "student": torch.stack(student).to(torch.uint8),
            "teacher": torch.stack(teacher).to(torch.uint8)
        }

    def _fetch_patch(self):
        """Download one random Core-Five file and return a random ``imgsz``-sized uint8 crop."""
        rel_path = str(self.df_metadata.sample(n=1).path.iloc[0]).lstrip("/")
        # Plain concatenation: os.path.join is for filesystem paths, not URLs.
        url = self.BASE_URL + rel_path
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"},
                                timeout=self.REQUEST_TIMEOUT)
        # Fail on HTTP errors here rather than handing an error page to h5py.
        response.raise_for_status()
        with h5py.File(io.BytesIO(response.content), "r") as f:
            x = f["hr/x"][:]
            y = f["hr/y"][:]
            data = f["/hr/data"][:]
        bands = list(range(data.shape[0]))
        ds = xr.DataArray(data, dims=['band', 'y', 'x'], coords=[bands, y, x]).astype("uint8")
        imgsz_half = self.imgsz // 2
        yid = self._random_center(len(ds.y), imgsz_half)
        xid = self._random_center(len(ds.x), imgsz_half)
        ds = ds.isel(y=range(yid - imgsz_half, yid + imgsz_half),
                     x=range(xid - imgsz_half, xid + imgsz_half)).compute()
        return ds.data

    def _publish(self, result):
        """Append ``result`` to the store, or overwrite a random slot once the store is full."""
        current = len(self.store)
        if current and current >= self.queue_size:
            # Every existing slot is a candidate (upper bound is exclusive).
            index = int(np.random.randint(0, current))
            self.store[index] = result
        else:
            self.store.append(result)

    def fetch_and_store(self):
        """
        🔄 Continuously samples random crops from Core-Five, augments
        them via `transform`, and updates the shared queue for training.

        Runs until interrupted. Any other error is printed and the batch
        is retried, so one bad download does not stop the worker.
        """
        # Fresh seed per worker so forked processes do not sample identical crops.
        np.random.seed(int.from_bytes(os.urandom(4), 'little'))
        while True:
            try:
                batch = [self._fetch_patch() for _ in range(self.batch_size)]
                self._publish(DinoDataset.transform(batch))
                # enable for getting recent updates
                if np.random.random() < 0.20:
                    write_last_updated()
            except KeyboardInterrupt:
                break
            except Exception as e:
                print("ERROR:", e)
                continue
if __name__ == "__main__":
    # Manual smoke test: start the background fetchers and report how many
    # batches the shared store holds, once every few seconds.
    poll_seconds = 5
    dataset = DinoDataset(imgsz=1696, batch_size=3, queue_size=1000)
    while True:
        filled = len(dataset.store)
        print(filled)
        time.sleep(poll_seconds)

src/loss.py ADDED Viewed

	@@ -0,0 +1,144 @@

+"""
+🎯 core-dino | DINO-style Loss Functions 💥
+Defines the cross-view contrastive loss used in DINO setups,
+including temperature scaling, centering, and teacher-student divergence.
+Includes:
+- DinoSpatialLoss: Temp-scaled CE loss with center momentum 🌀
+- DinoSinkhornSpatialLoss: Sinkhorn-based balanced assignment loss ⚖️
+Author: Gajesh Ladhar
+🔗 LinkedIn: https://www.linkedin.com/in/gajeshladhar/
+🤗 Hugging Face: https://huggingface.co/gajeshladhar
+"""
+import torch
+from torch import nn
+import torch.nn.functional as F
class DinoSpatialLoss(nn.Module):
    """
    🌀 DINO loss using temperature-scaled cross-entropy over spatial tokens.

    - Aligns teacher & student spatial features (B, C, H, W)
    - Keeps an EMA "center" of the raw teacher outputs, subtracted from the
      teacher logits before the softmax

    Args:
        teacher_temp (float): Temperature for teacher softmax
        student_temp (float): Temperature for student softmax
        center_momentum (float): EMA factor for center update
    """

    def __init__(self, teacher_temp=0.04, student_temp=0.1, center_momentum=0.9):
        super().__init__()
        self.teacher_temp = teacher_temp
        self.student_temp = student_temp
        self.center_momentum = center_momentum
        # Placeholder; resized to (1, C) on the first forward pass.
        self.register_buffer("center", torch.zeros(1, 1))

    def forward(self, student_feat, teacher_feat):
        """
        Compute loss over (B, C, H, W) features and update the center.

        Args:
            student_feat (Tensor): Student output, shape (B, C, Hs, Ws)
            teacher_feat (Tensor): Teacher output, shape (B, C, Ht, Wt)
        Returns:
            Tensor: Scalar DINO loss
        Raises:
            ValueError: If student and teacher channel counts differ.
        """
        channels = teacher_feat.shape[1]
        if student_feat.shape[1] != channels:
            raise ValueError(
                "student/teacher channel mismatch: {} vs {}".format(student_feat.shape[1], channels)
            )
        # Lazy init keyed on the actual width, so C == 1 is not re-initialised every step.
        if self.center.shape[1] != channels:
            self.center = self.center.new_zeros(1, channels, device=teacher_feat.device)

        # Resize student to teacher resolution
        student_resized = F.interpolate(
            student_feat, size=teacher_feat.shape[2:], mode='bilinear', align_corners=False
        )

        # Flatten spatial dims: (B, C, H, W) → (B*H*W, C). The teacher is a fixed target.
        student_flat = student_resized.permute(0, 2, 3, 1).reshape(-1, channels)
        teacher_flat = teacher_feat.detach().permute(0, 2, 3, 1).reshape(-1, channels)

        student_log_probs = F.log_softmax(student_flat / self.student_temp, dim=-1)
        teacher_probs = F.softmax((teacher_flat - self.center) / self.teacher_temp, dim=-1)

        # Cross-entropy between teacher targets and student predictions
        loss = -(teacher_probs * student_log_probs).sum(dim=-1).mean()

        # The center is subtracted from raw teacher outputs, so its EMA must track
        # those outputs as well, not the post-softmax probabilities.
        with torch.no_grad():
            batch_center = teacher_flat.mean(dim=0, keepdim=True)
            self.center = (
                self.center * self.center_momentum
                + batch_center * (1 - self.center_momentum)
            )
        return loss
class SinkhornKnopp(nn.Module):
    """
    ⚖️ Sinkhorn-Knopp normalization for balanced assignments.

    Takes an (N, C) score matrix (one row per token) and alternately
    normalises columns and rows. Each iteration ends on the row step, so every
    row of the result sums to 1 (up to ``eps``).

    Args:
        num_iters (int): Number of normalization iterations
        eps (float): Stabilizer to avoid div-by-zero
    """

    def __init__(self, num_iters: int = 3, eps: float = 1e-6):
        super().__init__()
        self.num_iters = num_iters
        self.eps = eps

    def forward(self, logits: torch.Tensor) -> torch.Tensor:
        """Return the balanced assignment matrix for ``logits`` (shape (N, C)); the input is not modified."""
        # Per-row max subtraction keeps exp() finite and every row non-empty.
        shifted = logits - logits.max(dim=1, keepdim=True)[0]
        Q = torch.exp(shifted)
        Q = Q / Q.sum()
        for _ in range(self.num_iters):
            # Column step first: spread mass evenly over the C channels ...
            Q = Q / (Q.sum(dim=0, keepdim=True) + self.eps)
            # ... row step last: each token's assignment must sum to 1.
            Q = Q / (Q.sum(dim=1, keepdim=True) + self.eps)
        return Q
class DinoSinkhornSpatialLoss(nn.Module):
    """
    🌀 DINO-style loss whose teacher targets come from Sinkhorn
    normalisation instead of a centered, temperature-scaled softmax.

    Args:
        student_temp (float): Temperature for student softmax
        sinkhorn_iters (int): Iterations for Sinkhorn normalization
    """

    def __init__(self, student_temp=0.1, sinkhorn_iters=3):
        super().__init__()
        self.student_temp = student_temp
        self.sinkhorn = SinkhornKnopp(sinkhorn_iters)

    def forward(self, student_feat, teacher_feat):
        """
        Cross-entropy between Sinkhorn teacher targets and student predictions.

        Args:
            student_feat (Tensor): (B, C, Hs, Ws)
            teacher_feat (Tensor): (B, C, Ht, Wt)
        Returns:
            Tensor: Scalar loss
        """
        # Bring the student onto the teacher's spatial grid.
        target_hw = teacher_feat.shape[2:]
        student_up = F.interpolate(student_feat, size=target_hw, mode='bilinear', align_corners=False)
        _, channels, _, _ = student_up.shape

        # (B, C, H, W) → (BHW, C): one row per spatial token.
        student_tokens = student_up.permute(0, 2, 3, 1).reshape(-1, channels)
        teacher_tokens = teacher_feat.permute(0, 2, 3, 1).reshape(-1, channels)

        # Teacher side: Sinkhorn only (no temperature, no center), no gradient.
        targets = self.sinkhorn(teacher_tokens).detach()
        # Student side: temperature-scaled log-softmax.
        log_preds = F.log_softmax(student_tokens / self.student_temp, dim=-1)

        per_token = (targets * log_preds).sum(dim=-1)
        return -per_token.mean()

src/train.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""
+🛰️ core-dino | Training Script for Resolution-Agnostic SSL on Satellite Imagery
+Trains DINO with a YOLO backbone using multi-resolution Core-Five patches.
+👨‍💻 Author: Gajesh Ladhar
+🔗 LinkedIn: https://www.linkedin.com/in/gajeshladhar/
+🤗 Hugging Face: https://huggingface.co/gajeshladhar
+"""
+# 📦 Imports
+import torch
+from torch.utils.data import DataLoader
+from loss import DinoSpatialLoss
+from backbone import YOLOBackBone
+from data import DinoDataset
+from utils import *
+# ⚙️ Config
CFG = dict(
    # Data
    imgsz=1696,
    batch_size=4,
    # Optimisation
    epochs=100,
    device="cuda" if torch.cuda.is_available() else "cpu",
    lr=1e-4,
    queue_size=1000,
    # Checkpoints
    ckpt_path="yolo11x.pt",
    save_path="dino-yolo.pt",
    # core-DINO logic parameters
    teacher_temperature=0.04,
    student_temperature=0.1,
    teacher_ema=0.998,
)
+# 🔄 Sync Student → Teacher Weights
@torch.no_grad()
def initialize_teacher(student, teacher):
    """Make ``teacher`` an exact copy of ``student`` (values copied in place).

    Copies parameters and buffers. Buffers such as BatchNorm running
    statistics are not parameters, so a parameter-only copy would leave the
    teacher with different normalisation state.

    Args:
        student (nn.Module): Source network
        teacher (nn.Module): Destination network with the same architecture
    """
    for ps, pt in zip(student.parameters(), teacher.parameters()):
        pt.data.copy_(ps.data)
    for bs, bt in zip(student.buffers(), teacher.buffers()):
        bt.data.copy_(bs.data)
@torch.no_grad()
def update_teacher(student, teacher, m=0.996):
    """EMA step on parameters, in place: teacher ← m · teacher + (1 − m) · student.

    Args:
        student (nn.Module): Network providing the new weights
        teacher (nn.Module): Network updated in place
        m (float): Fraction of the old teacher weights that is kept
    """
    pairs = zip(student.parameters(), teacher.parameters())
    for source, target in pairs:
        target.data.mul_(m)
        target.data.add_(source.data, alpha=1 - m)
+# 🧠 Model + Loss + Optimizer
def setup_model_and_loss():
    """Build student, frozen teacher, loss and optimizer from ``CFG``.

    Returns:
        tuple: ``(student, teacher, loss_fn, optimizer)``; the optimizer
        covers the student's parameters only.
    """
    device = CFG["device"]
    weights = CFG["ckpt_path"]

    student = YOLOBackBone(model_path=weights).to(device)
    teacher = YOLOBackBone(model_path=weights).to(device)
    # The teacher is never optimised directly, so gradients are switched off.
    teacher.requires_grad_(False)

    loss_fn = DinoSpatialLoss(
        teacher_temp=CFG["teacher_temperature"],
        student_temp=CFG["student_temperature"],
    ).to(device)
    optimizer = torch.optim.AdamW(student.parameters(), lr=CFG["lr"], weight_decay=0.05)
    return student, teacher, loss_fn, optimizer
+# 🔁 Training Loop
def train():
    """Run DINO training: the student learns from the EMA teacher on batches from the shared store.

    Each epoch iterates over whatever the data workers have put in the store
    so far. The student weights are saved to ``CFG["save_path"]`` after every
    epoch that processed at least one batch.
    """
    import time  # stdlib; only needed for the warm-up wait below

    student, teacher, criterion, optimizer = setup_model_and_loss()
    dataset = DinoDataset(imgsz=CFG["imgsz"], batch_size=CFG["batch_size"], queue_size=CFG["queue_size"])
    num_epochs = CFG["epochs"]
    device = CFG["device"]

    # The store is filled asynchronously and starts empty. Without this wait the
    # first epoch would see no batches and the averages below would divide by zero.
    if len(dataset.store) == 0:
        print("⏳ Waiting for the first batch from the data workers...")
        while len(dataset.store) == 0:
            time.sleep(1)

    for epoch in range(num_epochs):
        running_loss = 0.0
        running_entropy = 0.0
        total_count = 0
        loop = tqdm(dataset.store, desc=f"📅 Epoch {epoch+1}/{num_epochs}")
        for batch in loop:
            images_s = torch.nan_to_num(batch['student'].float() / 255.0, nan=0.0).to(device)
            images_t = torch.nan_to_num(batch['teacher'].float() / 255.0, nan=0.0).to(device)
            with torch.no_grad():
                teacher_out = teacher(images_t).detach()
            with autocast(device_type='cuda', enabled=False):
                student_out = student(images_s)
                loss = criterion(student_out, teacher_out)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            update_teacher(student, teacher, m=CFG["teacher_ema"])
            running_loss += loss.item()
            total_count += 1
            # 📊 Entropy of the teacher's channel distribution (monitoring only)
            probs = F.softmax(teacher_out / CFG["teacher_temperature"], dim=1)
            eps = 1e-6
            entropy = -(probs * (probs + eps).log()).sum(dim=1).mean()
            running_entropy += entropy.item()
            # 🔄 Live Bar Update
            loop.set_postfix({
                "💥 Loss": f"{loss.item():.4f}",
                "📈 Entropy": f"{entropy.item():.4f}"
            })
        if total_count == 0:
            # Nothing was processed; there is nothing to average or save.
            continue
        avg_loss = running_loss / total_count
        avg_entropy = running_entropy / total_count
        # Save first so the "Saved" message below is true when it is printed.
        torch.save(student.state_dict(), CFG["save_path"])
        print(f"✅ Epoch {epoch+1:03} | 🧠 Avg Loss: {avg_loss:.4f} | 🔐 Teacher Entropy: {avg_entropy:.4f} | 💾 Saved → {CFG['save_path']}")
# Script entry point: start training only when this file is executed directly.
if __name__ == "__main__":
    train()

src/utils.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import io
+import os
+import torch
+from torch import nn
+from torch.amp import autocast
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader
+import copy
+import queue
+import numpy as np
+import pandas as pd
+import geopandas as gpd
+import fsspec
+import xarray as xr
+from tqdm.notebook import tqdm
+from ultralytics import YOLO
+from IPython.display import clear_output
+from multiprocessing import Manager
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
+import huggingface_hub as hf
+import albumentations as A
+import h5py
+import requests
+from io import BytesIO
+import datetime
+from pathlib import Path
+import tempfile
+import shutil
+# parallel processing of static datasets
# Created at import time and shared by every module that imports this file.
manager = Manager()  # owns the proxy objects below
shared_store = manager.list()  # list proxy the data workers push batches into
process_pool = ProcessPoolExecutor(max_workers=6)  # pool that runs the fetch workers
def write_last_updated(path="store_last_updated.txt"):
    """Atomically write the current local time (ISO-8601) to ``path``.

    The timestamp goes to a temporary file in the same directory as ``path``
    and is then swapped in with ``os.replace``, so readers never see a
    half-written file. The temporary file is removed if anything fails.

    Args:
        path (str): Destination file; overwritten if it exists.
    """
    # Same directory as the target so the final rename stays on one filesystem.
    target_dir = os.path.dirname(os.path.abspath(path))
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", delete=False, dir=target_dir) as tmp:
            tmp_path = tmp.name
            tmp.write(f"{datetime.datetime.now().isoformat()}")
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave stray temp files behind on failure.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
class AddPoissonNoise(A.ImageOnlyTransform):
    """Albumentations transform that replaces each pixel by a Poisson sample.

    uint8 images use their own pixel values as the Poisson rate. Any other
    dtype is scaled by 255 first — presumably floats in [0, 1]; verify
    against callers. The result is always uint8.

    Args:
        p (float): Probability of applying the transform
    """

    def __init__(self, p=0.5):
        # Keyword, not positional: the first positional slot of the base
        # constructor is not `p` in every Albumentations release.
        super().__init__(p=p)

    def apply(self, image, **params):
        """Return a noisy uint8 copy of ``image``."""
        image = image.astype(np.float32) / 255.0 if image.dtype == np.uint8 else image.copy()
        noisy = np.random.poisson(image * 255.0)
        return np.clip(noisy, 0, 255).astype('uint8')
class AddSaltPepperNoise(A.ImageOnlyTransform):
    """Albumentations transform that sets random pixels to the extreme values.

    Args:
        amount (float): Fraction of array elements to corrupt
        salt_vs_pepper (float): Share of corrupted elements that become salt
            (brightest value); the rest become pepper (0)
        p (float): Probability of applying the transform
    """

    def __init__(self, amount=0.02, salt_vs_pepper=0.5, p=0.5):
        # Keyword, not positional: the first positional slot of the base
        # constructor is not `p` in every Albumentations release.
        super().__init__(p=p)
        self.amount = amount
        self.salt_vs_pepper = salt_vs_pepper

    def apply(self, image, **params):
        """Return a copy of ``image`` with salt and pepper elements; dtype and shape are kept."""
        noisy = image.copy()
        # Salt is the brightest representable value: 255 for uint8, 1.0 for
        # float images (assumed to live in [0, 1] — confirm against callers).
        if np.issubdtype(image.dtype, np.integer):
            salt_value = np.iinfo(image.dtype).max
        else:
            salt_value = 1.0
        num_salt = int(np.ceil(self.amount * image.size * self.salt_vs_pepper))
        num_pepper = int(np.ceil(self.amount * image.size * (1.0 - self.salt_vs_pepper)))
        # randint's upper bound is exclusive, so `dim` (not `dim - 1`) lets the last
        # row/column/channel be hit and keeps size-1 axes valid.
        salt_coords = tuple(np.random.randint(0, dim, num_salt) for dim in image.shape)
        noisy[salt_coords] = salt_value
        pepper_coords = tuple(np.random.randint(0, dim, num_pepper) for dim in image.shape)
        noisy[pepper_coords] = 0
        return noisy