Commit 79b5829
Parent(s): 657d518

Initial CosAE release

Changed files:
- cosae/__init__.py  +0 -0
- cosae/config.py    +57 -0
- cosae/cosae.py     +53 -0
- cosae/modules.py   +267 -0
cosae/__init__.py
ADDED
(empty file)
cosae/config.py
ADDED
@@ -0,0 +1,57 @@
from transformers import PretrainedConfig


class CosAEConfig(PretrainedConfig):
    model_type = "cosae"

    def __init__(
        self,
        image_size: tuple[int, int] = (256, 256),
        # Encoder parameters
        in_channels: int = 3,
        hidden_dims: list[int] = (64, 128, 256, 512),
        num_res_blocks: int = 2,
        downsample_strides: list[int] = (2, 2, 2, 2),
        use_encoder_attention: bool = True,
        encoder_attention_heads: int = 8,
        encoder_attention_layers: int = 1,
        bottleneck_channels: int = 256,
        basis_size: int = 32,
        norm_type: str = "gn",  # "gn" (GroupNorm) or "ln" (LayerNorm)
        activation: str = "gelu",  # "gelu" or "silu"

        # Decoder parameters
        decoder_hidden_dim: int = 256,
        decoder_upsample_strides: list[int] = (2,),  # e.g. (2,) for one 2× upsample
        use_decoder_attention: bool = False,
        decoder_attention_heads: int = 8,
        decoder_attention_layers: int = 0,

        **kwargs,
    ):
        """
        Configuration for CosAEModel, including encoder, HCM, and decoder settings.
        """
        super().__init__(**kwargs)

        # Encoder settings
        self.in_channels = in_channels
        self.hidden_dims = list(hidden_dims)
        self.num_res_blocks = num_res_blocks
        self.downsample_strides = list(downsample_strides)
        self.use_encoder_attention = use_encoder_attention
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_attention_layers = encoder_attention_layers
        self.bottleneck_channels = bottleneck_channels
        self.basis_size = basis_size
        self.norm_type = norm_type
        self.activation = activation
        self.image_size = image_size

        # Decoder settings
        self.decoder_hidden_dim = decoder_hidden_dim
        self.decoder_upsample_strides = list(decoder_upsample_strides)
        self.use_decoder_attention = use_decoder_attention
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_attention_layers = decoder_attention_layers
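
For reference, a minimal usage sketch (assuming the package is importable as `cosae`; the checkpoint directory is illustrative): CosAEConfig inherits from PretrainedConfig, so it takes keyword overrides and round-trips with save_pretrained / from_pretrained.

    from cosae.config import CosAEConfig

    # Default configuration: 256x256 RGB input, GroupNorm + GELU
    config = CosAEConfig()

    # Keyword overrides, e.g. a larger cosine basis and SiLU activations
    config = CosAEConfig(basis_size=64, activation="silu")

    # Standard PretrainedConfig serialization
    config.save_pretrained("./cosae-checkpoint")
    config = CosAEConfig.from_pretrained("./cosae-checkpoint")
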
cosae/cosae.py
ADDED
@@ -0,0 +1,53 @@
import math

import torch

from transformers import PreTrainedModel

from .modules import CosAEEncoder, CosAEDecoder, HarmonicConstructionModule
from .config import CosAEConfig


class CosAEModel(PreTrainedModel):
    config_class = CosAEConfig
    base_model_prefix = "cosae"

    def __init__(self, config: CosAEConfig):
        super().__init__(config)
        # 1) Encoder
        self.encoder = CosAEEncoder(config)

        # 2) Harmonic Construction Module
        # Derive P, the total downsampling factor: the stem contributes 4x
        # (stride-2 conv + stride-2 max-pool) and the stages contribute the
        # product of the configured downsample strides.
        stem_ds = 2 * 2
        P = stem_ds * math.prod(config.downsample_strides)
        # Basis size T = P // 2; config.basis_size is expected to match this,
        # so that the tiled harmonics reach half the input resolution before
        # the decoder's final upsampling.
        T = P // 2
        self.T = T
        self.hcm = HarmonicConstructionModule(
            bottleneck_channels=config.bottleneck_channels,
            basis_size=config.basis_size,
        )

        # 3) Decoder
        self.decoder = CosAEDecoder(config)

        # initialize weights, etc.
        self.post_init()

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Args:
            pixel_values: [B, C_in, H, W] (C_in = 3, or 9 if using FFT features)
        Returns:
            recon: [B, 3, H, W] reconstructed image
        """
        # Encode to amplitudes & phases
        bottleneck = self.encoder(pixel_values)      # [B, 2c, H', W']
        amp, ph = torch.chunk(bottleneck, 2, dim=1)  # each [B, c, H', W']

        # Build harmonics
        harmonics = self.hcm(amp, ph)                # [B, c, H'*T, W'*T]

        # Decode to reconstruct
        recon = self.decoder(harmonics)              # [B, 3, H, W]
        return recon
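
A sketch of an end-to-end forward pass with the default configuration (import paths assume the package layout above; the exact output resolution depends on how the encoder strides, basis_size, and decoder upsampling factors compose):

    import torch

    from cosae.config import CosAEConfig
    from cosae.cosae import CosAEModel

    config = CosAEConfig()                 # 256x256 input, 3 channels
    model = CosAEModel(config).eval()

    pixel_values = torch.randn(1, config.in_channels, *config.image_size)

    with torch.no_grad():
        recon = model(pixel_values)        # [1, 3, H_out, W_out]

    print(recon.shape)
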
cosae/modules.py
ADDED
@@ -0,0 +1,267 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import PreTrainedModel

from .config import CosAEConfig

"""This code has partially been generated by ChatGPT"""


class ResBlock(nn.Module):
    def __init__(self, in_ch, out_ch, norm_type="gn", activation="gelu"):
        super().__init__()
        act = nn.GELU if activation == "gelu" else nn.SiLU
        # For norm_type="ln", GroupNorm with a single group is used as a
        # channel-wise LayerNorm: nn.LayerNorm normalizes trailing dimensions,
        # which does not fit [B, C, H, W] feature maps of arbitrary spatial size.

        self.conv1 = nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1, bias=False)
        self.norm1 = nn.GroupNorm(8, out_ch) if norm_type == "gn" else nn.GroupNorm(1, out_ch)
        self.act1 = act()

        self.conv2 = nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1, bias=False)
        self.norm2 = nn.GroupNorm(8, out_ch) if norm_type == "gn" else nn.GroupNorm(1, out_ch)
        self.act2 = act()

        if in_ch != out_ch:
            self.skip = nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False)
        else:
            self.skip = nn.Identity()

    def forward(self, x):
        identity = self.skip(x)
        out = self.conv1(x)
        out = self.norm1(out)
        out = self.act1(out)
        out = self.conv2(out)
        out = self.norm2(out)
        out = out + identity
        return self.act2(out)


class CosAEEncoder(PreTrainedModel):
    config_class = CosAEConfig
    base_model_prefix = "encoder"

    def __init__(self, config: CosAEConfig):
        super().__init__(config)
        c = config
        # Stem: stride-2 conv followed by a stride-2 max-pool (4x downsampling).
        # As in ResBlock, GroupNorm(1, ...) stands in for a channel-wise
        # LayerNorm when norm_type == "ln".
        self.stem = nn.Sequential(
            nn.Conv2d(c.in_channels, c.hidden_dims[0], kernel_size=7, stride=2, padding=3, bias=False),
            nn.GroupNorm(8, c.hidden_dims[0]) if c.norm_type == "gn" else nn.GroupNorm(1, c.hidden_dims[0]),
            nn.GELU() if c.activation == "gelu" else nn.SiLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )
        # Downsampling stages: one stage per hidden dim after the first,
        # each ending in a strided conv using downsample_strides[i]
        dims = c.hidden_dims
        self.stages = nn.ModuleList()
        in_ch = dims[0]
        for i, out_ch in enumerate(dims[1:]):
            blocks = []
            for _ in range(c.num_res_blocks):
                blocks.append(ResBlock(in_ch, out_ch, norm_type=c.norm_type, activation=c.activation))
                in_ch = out_ch
            # downsample conv
            blocks.append(
                nn.Sequential(
                    nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=c.downsample_strides[i], padding=1, bias=False),
                    nn.GroupNorm(8, out_ch) if c.norm_type == "gn" else nn.GroupNorm(1, out_ch),
                    nn.GELU() if c.activation == "gelu" else nn.SiLU(),
                )
            )
            self.stages.append(nn.Sequential(*blocks))

        # Optional global attention
        if c.use_encoder_attention:
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=dims[-1],
                nhead=c.encoder_attention_heads,
                dim_feedforward=dims[-1] * 4,
                # pass a callable: the string form only accepts "relu"/"gelu"
                activation=F.gelu if c.activation == "gelu" else F.silu,
                batch_first=True,
            )
            self.attn = nn.TransformerEncoder(encoder_layer, num_layers=c.encoder_attention_layers)
        else:
            self.attn = None

        # Head: project to 2 * bottleneck_channels (amplitudes and phases)
        self.head = nn.Conv2d(dims[-1], 2 * c.bottleneck_channels, kernel_size=1)

        # Initialize weights
        self.post_init()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [B, C_in, H, W]
        Returns:
            bottleneck: [B, 2c, H/P, W/P]
        """
        # Stem
        x = self.stem(x)
        # Stages
        for stage in self.stages:
            x = stage(x)
        # x: [B, dims[-1], H/P, W/P]
        # Optional attention over the flattened spatial positions
        if self.attn is not None:
            B, C, H, W = x.shape
            seq = x.flatten(2).transpose(1, 2)  # [B, H*W, C]
            seq = self.attn(seq)
            x = seq.transpose(1, 2).view(B, C, H, W)
        # Head
        bottleneck = self.head(x)
        return bottleneck


class HarmonicConstructionModule(nn.Module):
    """
    Given:
      - amplitudes: Tensor of shape [B, c, H', W']
      - phases:     Tensor of shape [B, c, H', W']
    and learnable frequencies (u, v) of shape [c, 2],
    this module builds a [B, c, H'*T, W'*T] tensor of harmonics:
        H[b,k,i*T + x, j*T + y]
          = A[b,k,i,j] * cos( 2π/T * (u[k]*x + v[k]*y) - Φ[b,k,i,j] )
    """
    def __init__(self, bottleneck_channels: int, basis_size: int):
        """
        Args:
            bottleneck_channels: c, number of frequency components
            basis_size: T, size of each cosine basis (e.g. 32 or 64)
        """
        super().__init__()
        self.c = bottleneck_channels
        self.T = basis_size

        # Learnable frequencies in [0, T/2)
        self.freqs = nn.Parameter(
            torch.rand(self.c, 2) * (self.T / 2)
        )  # shape [c, 2] for (u, v)

        # Precompute the x, y grid of size [T, T]
        x = torch.arange(self.T, dtype=torch.float32)
        y = torch.arange(self.T, dtype=torch.float32)
        xs, ys = torch.meshgrid(x, y, indexing="ij")  # both shape [T, T]

        # Register as buffers so they move with .to(device)
        self.register_buffer("xs", xs)  # [T, T]
        self.register_buffer("ys", ys)  # [T, T]

    def forward(self, amplitude: torch.Tensor, phase: torch.Tensor) -> torch.Tensor:
        """
        Args:
            amplitude: [B, c, H', W']
            phase:     [B, c, H', W']
        Returns:
            harmonics: [B, c, H'*T, W'*T]
        """
        B, c, Hp, Wp = amplitude.shape
        assert c == self.c, "Channel mismatch"

        # 1) spatial phase for each frequency: 2π/T * (u[k]*xs + v[k]*ys), shape [c, T, T]
        u = self.freqs[:, 0].view(c, 1, 1)  # [c, 1, 1]
        v = self.freqs[:, 1].view(c, 1, 1)  # [c, 1, 1]
        spatial_phase = (2 * math.pi / self.T) * (u * self.xs + v * self.ys)
        # reshape for broadcasting to [1, c, 1, 1, T, T]
        spatial_phase = spatial_phase.view(1, c, 1, 1, self.T, self.T)

        # 2) prepare amplitude & phase maps:
        #    [B, c, Hp, Wp] -> [B, c, Hp, Wp, 1, 1]
        A = amplitude.view(B, c, Hp, Wp, 1, 1)
        Φ = phase.view(B, c, Hp, Wp, 1, 1)

        # 3) compute argument and harmonic:
        #    arg = spatial_phase - Φ
        #    H   = A * cos(arg)
        arg = spatial_phase - Φ   # [B, c, Hp, Wp, T, T]
        H = A * torch.cos(arg)    # same shape

        # 4) tile out to full spatial size [B, c, Hp*T, Wp*T]:
        #    first permute to [B, c, Hp, T, Wp, T], then reshape
        H = H.permute(0, 1, 2, 4, 3, 5)  # [B, c, Hp, T, Wp, T]
        H = H.reshape(B, c, Hp * self.T, Wp * self.T)

        return H


class CosAEDecoder(PreTrainedModel):
    config_class = CosAEConfig
    base_model_prefix = "decoder"

    def __init__(self, config: CosAEConfig):
        super().__init__(config)
        c = config

        # 1x1 projection from HCM channels -> decoder hidden dim
        self.proj = nn.Conv2d(
            c.bottleneck_channels,
            c.decoder_hidden_dim,
            kernel_size=1,
            bias=False,
        )
        # normalization + activation after proj; GroupNorm(1, ...) again serves
        # as a channel-wise LayerNorm when norm_type == "ln"
        self.norm0 = nn.GroupNorm(8, c.decoder_hidden_dim) if c.norm_type == "gn" else nn.GroupNorm(1, c.decoder_hidden_dim)
        self.act0 = nn.GELU() if c.activation == "gelu" else nn.SiLU()

        # upsampling blocks
        self.upsamples = nn.ModuleList()
        for scale in c.decoder_upsample_strides:
            block = nn.Sequential(
                nn.Upsample(scale_factor=scale, mode="bilinear", align_corners=False),
                nn.Conv2d(c.decoder_hidden_dim, c.decoder_hidden_dim, kernel_size=3, padding=1, bias=False),
                nn.GroupNorm(8, c.decoder_hidden_dim) if c.norm_type == "gn" else nn.GroupNorm(1, c.decoder_hidden_dim),
                nn.GELU() if c.activation == "gelu" else nn.SiLU(),
            )
            self.upsamples.append(block)

        # optional global attention in decoder
        if c.use_decoder_attention:
            enc_layer = nn.TransformerEncoderLayer(
                d_model=c.decoder_hidden_dim,
                nhead=c.decoder_attention_heads,
                dim_feedforward=c.decoder_hidden_dim * 4,
                # pass a callable: the string form only accepts "relu"/"gelu"
                activation=F.gelu if c.activation == "gelu" else F.silu,
                batch_first=True,
            )
            self.attn = nn.TransformerEncoder(enc_layer, num_layers=c.decoder_attention_layers)
        else:
            self.attn = None

        # final conv to RGB
        self.final_conv = nn.Conv2d(
            c.decoder_hidden_dim,
            3,
            kernel_size=3,
            padding=1,
        )

        # initialize weights
        self.post_init()

    def forward(self, harmonics: torch.Tensor) -> torch.Tensor:
        """
        Args:
            harmonics: Tensor from HCM, shape [B, c, H*, W*]
        Returns:
            recon: Reconstructed image, shape [B, 3, H, W]
        """
        x = self.proj(harmonics)  # [B, hidden_dim, H*, W*]
        x = self.norm0(x)
        x = self.act0(x)

        # upsample to the target resolution
        for up in self.upsamples:
            x = up(x)  # scales H*, W* by the configured factor

        # optional global attention
        if self.attn is not None:
            B, C, H, W = x.shape
            seq = x.flatten(2).transpose(1, 2)  # [B, H*W, C]
            seq = self.attn(seq)
            x = seq.transpose(1, 2).view(B, C, H, W)

        # final RGB projection
        recon = self.final_conv(x)
        return recon
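
A small, self-contained shape check for the Harmonic Construction Module (toy sizes chosen purely for illustration): each bottleneck position (i, j) of channel k is expanded into a T x T cosine tile scaled by its amplitude and shifted by its phase, so a [B, c, H', W'] input becomes [B, c, H'*T, W'*T].

    import math
    import torch

    from cosae.modules import HarmonicConstructionModule

    hcm = HarmonicConstructionModule(bottleneck_channels=4, basis_size=8)

    amplitude = torch.randn(2, 4, 3, 5)            # [B, c, H', W']
    phase = torch.rand(2, 4, 3, 5) * 2 * math.pi   # phases in [0, 2*pi)

    harmonics = hcm(amplitude, phase)
    print(harmonics.shape)                         # torch.Size([2, 4, 24, 40])
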