Spaces:

Trd-Bobo242
/

chat_lot

Runtime error

App Files Files Community

Trd-Bobo242 committed on Sep 24

Commit

788b21a

verified ·

1 Parent(s): 53303c3

Create model.py

Browse files

Files changed (1) hide show

model.py +292 -0

model.py ADDED Viewed

	@@ -0,0 +1,292 @@

+from typing import Optional, Dict, Any
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoConfig, PreTrainedModel, PreTrainedTokenizer, PretrainedConfig
+class SuperConfig(AutoConfig):
+    """
+    Configuration class for the Super model
+    """
+    model_type = "super"
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        n_inner=None,
+        activation_function="gelu_new",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        scale_attn_weights=True,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        apply_residual_connection_post_layernorm=False,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        **kwargs
+    ):
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs
+        )
+        self.vocab_size = vocab_size
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+class SuperAttention(nn.Module):
+    """
+    Multi-head attention module for Super model
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.n_embd = config.n_embd
+        self.n_head = config.n_head
+        self.head_size = self.n_embd // self.n_head
+        self.scale = self.head_size ** -0.5
+        self.c_attn = nn.Linear(self.n_embd, 3 * self.n_embd)
+        self.c_proj = nn.Linear(self.n_embd, self.n_embd)
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+    def forward(self, x, attention_mask=None):
+        B, T, C = x.size()
+        # Query, Key, Value projections
+        qkv = self.c_attn(x)
+        q, k, v = qkv.split(self.n_embd, dim=2)
+        # Reshape for multi-head attention
+        q = q.view(B, T, self.n_head, self.head_size).transpose(1, 2)
+        k = k.view(B, T, self.n_head, self.head_size).transpose(1, 2)
+        v = v.view(B, T, self.n_head, self.head_size).transpose(1, 2)
+        # Attention scores
+        att = (q @ k.transpose(-2, -1)) * self.scale
+        # Apply attention mask if provided
+        if attention_mask is not None:
+            att = att.masked_fill(attention_mask == 0, float('-inf'))
+        att = F.softmax(att, dim=-1)
+        att = self.attn_dropout(att)
+        # Weighted sum of values
+        y = att @ v
+        y = y.transpose(1, 2).contiguous().view(B, T, C)
+        # Output projection
+        y = self.resid_dropout(self.c_proj(y))
+        return y
+class SuperMLP(nn.Module):
+    """
+    Feed-forward network for Super model
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
+        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
+        self.act = nn.GELU()
+        self.dropout = nn.Dropout(config.resid_pdrop)
+    def forward(self, x):
+        x = self.c_fc(x)
+        x = self.act(x)
+        x = self.c_proj(x)
+        x = self.dropout(x)
+        return x
+class SuperBlock(nn.Module):
+    """
+    Transformer block for Super model
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.attn = SuperAttention(config)
+        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.mlp = SuperMLP(config)
+    def forward(self, x, attention_mask=None):
+        x = x + self.attn(self.ln_1(x), attention_mask)
+        x = x + self.mlp(self.ln_2(x))
+        return x
+class SuperModel(PreTrainedModel):
+    """
+    The Super model implementation
+    """
+    config_class = SuperConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
+        self.wpe = nn.Embedding(1024, config.n_embd)  # positional embeddings
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList([SuperBlock(config) for _ in range(config.n_layer)])
+        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        # Initialize weights
+        self.apply(self._init_weights)
+    def _init_weights(self, module):
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+    def forward(self, input_ids, attention_mask=None, position_ids=None):
+        device = input_ids.device
+        b, t = input_ids.size()
+        if position_ids is None:
+            position_ids = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)
+        # Token and position embeddings
+        tok_emb = self.wte(input_ids)
+        pos_emb = self.wpe(position_ids)
+        x = self.drop(tok_emb + pos_emb)
+        # Prepare attention mask
+        if attention_mask is not None:
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+            attention_mask = attention_mask.to(dtype=torch.float32)
+            attention_mask = (1.0 - attention_mask) * torch.finfo(torch.float32).min
+        # Transformer blocks
+        for block in self.h:
+            x = block(x, attention_mask)
+        x = self.ln_f(x)
+        return x
+class SuperForCausalLM(PreTrainedModel):
+    """
+    Super model for causal language modeling
+    """
+    config_class = SuperConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = SuperModel(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        # Tie weights
+        self.lm_head.weight = self.transformer.wte.weight
+        self.apply(self._init_weights)
+    def _init_weights(self, module):
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+    def forward(self, input_ids, attention_mask=None, labels=None):
+        hidden_states = self.transformer(input_ids, attention_mask)
+        lm_logits = self.lm_head(hidden_states)
+        loss = None
+        if labels is not None:
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        return {
+            'loss': loss,
+            'logits': lm_logits,
+            'hidden_states': hidden_states
+        }
+    def generate(self, input_ids, max_length=100, temperature=1.0, top_k=50, top_p=0.95):
+        """
+        Simple generation method
+        """
+        for _ in range(max_length - input_ids.size(1)):
+            with torch.no_grad():
+                outputs = self.forward(input_ids)
+                next_token_logits = outputs['logits'][:, -1, :] / temperature
+                # Apply top-k filtering
+                if top_k > 0:
+                    indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
+                    next_token_logits[indices_to_remove] = -float('Inf')
+                # Apply top-p filtering
+                if top_p < 1.0:
+                    sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
+                    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+                    sorted_indices_to_remove = cumulative_probs > top_p
+                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                    sorted_indices_to_remove[..., 0] = 0
+                    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+                    next_token_logits[indices_to_remove] = -float('Inf')
+                # Sample next token
+                probs = F.softmax(next_token_logits, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+                input_ids = torch.cat([input_ids, next_token], dim=1)
+        return input_ids
+# Example usage and model initialization
+def create_super_model(vocab_size=50257, n_embd=768, n_layer=12, n_head=12):
+    """
+    Helper function to create a Super model instance
+    """
+    config = SuperConfig(
+        vocab_size=vocab_size,
+        n_embd=n_embd,
+        n_layer=n_layer,
+        n_head=n_head
+    )
+    return SuperForCausalLM(config)
+if __name__ == "__main__":
+    # Example usage
+    model = create_super_model()
+    print(f"Super model created with {sum(p.numel() for p in model.parameters()):,} parameters")
+    # Test forward pass
+    input_ids = torch.tensor([[1, 2, 3, 4, 5]])
+    outputs = model(input_ids)
+    print(f"Output logits shape: {outputs['logits'].shape}")