Vibi007 committed on
Commit fceb8da · 1 Parent(s): 035761e

Updated inference

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -30,7 +30,7 @@ env.bak/
 venv.bak/
 
 # Training artifacts
-checkpoints/
+# checkpoints/
 runs/
 logs/
 *.ckpt
@@ -38,7 +38,7 @@ logs/
 *.pth
 wandb/
 lightning_logs/
-final_model/
+# final_model/
 
 # IDE
 .idea/
README.md CHANGED
@@ -15,6 +15,8 @@ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
 ```
 use config from https://huggingface.co/HuggingFaceTB/SmolLM2-135M/blob/main/config_smollm2_135M.yaml
 
+https://github.com/huggingface/smollm/blob/main/pre-training/smollm2/config_smollm2_135M.yaml
+
 create model from above parameters
 
 Use it for training using pytorch lightning
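Note: as a companion to the README steps above, a minimal sketch (not part of this commit) of loading the referenced YAML config and building the model before handing it to PyTorch Lightning. The local filename `config_smollm2_135M.yaml` and the `model.model_config` key layout are assumptions taken from model.py in this repo.

```python
import yaml
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig

# Assumed local copy of the SmolLM2-135M YAML linked above.
with open("config_smollm2_135M.yaml", "r") as f:
    model_cfg = yaml.safe_load(f)["model"]["model_config"]

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
tokenizer.pad_token = tokenizer.eos_token

# Build a randomly initialised model from the config values,
# mirroring what model.py does before training with PyTorch Lightning.
config = LlamaConfig(
    vocab_size=49152,
    hidden_size=model_cfg["hidden_size"],
    intermediate_size=model_cfg["intermediate_size"],
    num_hidden_layers=model_cfg["num_hidden_layers"],
    num_attention_heads=model_cfg["num_attention_heads"],
    num_key_value_heads=model_cfg["num_key_value_heads"],
    max_position_embeddings=model_cfg["max_position_embeddings"],
)
model = AutoModelForCausalLM.from_config(config)
print(sum(p.numel() for p in model.parameters()))  # roughly 135M parameters
```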
final_model/config.json ADDED
@@ -0,0 +1,29 @@
+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 576,
+  "initializer_range": 0.041666666666666664,
+  "intermediate_size": 1536,
+  "max_position_embeddings": 2048,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 9,
+  "num_hidden_layers": 30,
+  "num_key_value_heads": 3,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 49152
+}
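The directory added above is a standard Hugging Face model folder, so it can be loaded directly with transformers; a minimal sketch (not part of this commit), with an illustrative prompt and sampling settings:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the saved model and tokenizer from the directory added in this commit.
model = AutoModelForCausalLM.from_pretrained("final_model")
tokenizer = AutoTokenizer.from_pretrained("final_model")
model.eval()

# Illustrative prompt and sampling settings.
inputs = tokenizer("Once upon a time", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_length=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```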
final_model/generation_config.json ADDED
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "transformers_version": "4.47.0"
+}
final_model/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
final_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af74ecbdc60bc04c52a8dc7b42a79eecadadcbca7c300cc21e56583ab1c1e0b4
+size 651336704
final_model/special_tokens_map.json ADDED
@@ -0,0 +1,43 @@
+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<repo_name>",
+    "<reponame>",
+    "<file_sep>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<jupyter_script>",
+    "<empty_output>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
final_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
final_model/tokenizer_config.json ADDED
@@ -0,0 +1,170 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<repo_name>",
+    "<reponame>",
+    "<file_sep>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<jupyter_script>",
+    "<empty_output>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}
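The tokenizer config above includes a ChatML-style chat_template; a minimal sketch (not part of this commit) of applying it through the standard transformers API, with an illustrative message:

```python
from transformers import AutoTokenizer

# Load the tokenizer saved in final_model/ and format a chat with its template.
tokenizer = AutoTokenizer.from_pretrained("final_model")
messages = [{"role": "user", "content": "Write a haiku about rain."}]  # illustrative
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
# <|im_start|>user
# Write a haiku about rain.<|im_end|>
# <|im_start|>assistant
```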
final_model/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
inference.py CHANGED
@@ -1,8 +1,8 @@
 import os
 import gradio as gr
 import torch
-from model import SmolLMModule, create_model_config
-from transformers import AutoTokenizer
+from model import SmolLMModule
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import yaml
 import glob
 
@@ -15,71 +15,94 @@ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
 tokenizer.pad_token = tokenizer.eos_token
 
 
-def load_model_from_checkpoint(checkpoint_path):
-    """Load model from checkpoint"""
-    model = SmolLMModule.load_from_checkpoint(checkpoint_path, config=config)
-    model.eval()  # Set to evaluation mode
-    return model
-
-
 def get_available_checkpoints():
-    """Get list of available checkpoints sorted by step number"""
-    checkpoints = glob.glob("checkpoints/*.ckpt")
-    if not checkpoints:
-        return [], []
+    """Get list of available checkpoints and final model"""
+    models = []
+    model_paths = {}
 
-    # Sort by step number
-    def get_step_number(filepath):
+    # Get checkpoints
+    checkpoints = glob.glob("checkpoints/*.ckpt")
+    for ckpt in checkpoints:
         try:
             # Extract step number from the filename
-            filename = os.path.basename(filepath)
-            # Remove .ckpt extension
-            filename = filename.replace(".ckpt", "")
-            # Get the step number
-            if "step=" in filename:
-                return int(filename.split("step=")[1])
-            elif "-step-" in filename:
-                return int(filename.split("-step-")[1])
-            else:
-                return int("".join(filter(str.isdigit, filename)))
-        except (ValueError, IndexError):
+            filename = os.path.basename(ckpt)
+            # Handle the format 'model-step=step=X.ckpt'
+            if "step=step=" in filename:
+                step = int(filename.split("step=step=")[1].split(".")[0])
+                display_name = f"Checkpoint Step {step}"
+                models.append(display_name)
+                model_paths[display_name] = ckpt
+        except (ValueError, IndexError) as e:
+            print(
+                f"Warning: Could not parse checkpoint filename: {filename}, Error: {e}"
+            )
+            continue
+
+    # Add final model if it exists
+    final_model_path = "final_model"
+    if os.path.exists(final_model_path):
+        display_name = "Final Model"
+        models.append(display_name)
+        model_paths[display_name] = final_model_path
+
+    # Sort checkpoints by step number (Final model will be at the end)
+    def get_step_number(name):
+        if name == "Final Model":
+            return float("inf")
+        try:
+            return int(name.split("Step ")[-1])
+        except:
             return 0
 
-    # Sort checkpoints by step number
-    checkpoints.sort(key=get_step_number)
+    models.sort(key=get_step_number)
+
+    if not models:
+        print(
+            "Warning: No checkpoints or final model found in the following locations:"
+        )
+        print("- Checkpoints directory:", os.path.abspath("checkpoints"))
+        print("- Final model directory:", os.path.abspath("final_model"))
+    else:
+        print(f"Found {len(models)} models:")
+        for model in models:
+            print(f"- {model}: {model_paths[model]}")
 
-    # Create display names
-    display_names = [f"Step {get_step_number(x)}" for x in checkpoints]
-    return display_names, checkpoints
+    return models, model_paths
 
 
-def generate_text(
-    prompt, checkpoint_choice, max_length=100, temperature=0.7, top_p=0.9
-):
-    """Generate text based on prompt using selected checkpoint"""
-    # Check if checkpoint is selected
-    if not checkpoint_choice:
-        return "Please select a checkpoint first!"
+def load_model_from_checkpoint(model_path):
+    """Load model from checkpoint or final model directory"""
+    if model_path == "final_model":
+        # Load the final saved model
+        model = SmolLMModule(config)
+        model.model = AutoModelForCausalLM.from_pretrained(model_path)
+    else:
+        # Load from checkpoint
+        model = SmolLMModule.load_from_checkpoint(model_path, config=config)
+
+    model.eval()  # Set to evaluation mode
+    return model
+
+
+def generate_text(prompt, model_choice, max_length=100, temperature=0.7, top_p=0.9):
+    """Generate text based on prompt using selected model"""
+    # Check if model is selected
+    if not model_choice:
+        return "Please select a model checkpoint!"
 
     if not prompt:
         return "Please enter a prompt!"
 
     try:
-        # Get actual checkpoint path
-        step_num = int("".join(filter(str.isdigit, checkpoint_choice)))
-        checkpoints = glob.glob("checkpoints/*.ckpt")
-        checkpoint_path = None
-
-        for ckpt in checkpoints:
-            if str(step_num) in ckpt:
-                checkpoint_path = ckpt
-                break
+        # Get model path from the mapping
+        _, model_paths = get_available_checkpoints()
+        model_path = model_paths.get(model_choice)
 
-        if not checkpoint_path or not os.path.exists(checkpoint_path):
-            return f"Checkpoint for step {step_num} not found!"
+        if not model_path or not os.path.exists(model_path):
+            return f"Model {model_choice} not found!"
 
-        # Load model from checkpoint
-        model = load_model_from_checkpoint(checkpoint_path)
+        # Load model
+        model = load_model_from_checkpoint(model_path)
 
         # Move model to GPU if available
         device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -108,7 +131,7 @@ def generate_text(
         return f"Error during generation: {str(e)}"
 
 
-# Get available checkpoints
+# Get available models
 display_names, _ = get_available_checkpoints()
 
 # Create Gradio interface
@@ -116,17 +139,18 @@ with gr.Blocks(title="SmolLM2 Inference") as demo:
     gr.Markdown("# SmolLM2 Text Generation")
 
     if not display_names:
-        gr.Markdown("⚠️ No checkpoints found! Please train the model first.")
+        gr.Markdown("⚠️ No models found! Please train the model first.")
     else:
         gr.Markdown(
-            f"Found {len(display_names)} checkpoints. Select one and enter a prompt to generate text."
+            f"Found {len(display_names)} models/checkpoints. Select one and enter a prompt to generate text."
         )
+        gr.Markdown("Available models: " + ", ".join(display_names))
 
     with gr.Row():
        with gr.Column():
-            checkpoint_dropdown = gr.Dropdown(
+            model_dropdown = gr.Dropdown(
                choices=display_names,
-                label="Select Checkpoint",
+                label="Select Model",
                value=display_names[-1] if display_names else None,
                interactive=True,
            )
@@ -149,7 +173,7 @@ with gr.Blocks(title="SmolLM2 Inference") as demo:
 
    generate_btn.click(
        fn=generate_text,
-        inputs=[prompt, checkpoint_dropdown, max_length, temperature, top_p],
+        inputs=[prompt, model_dropdown, max_length, temperature, top_p],
        outputs=output,
    )
 
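For reference, the dropdown choices above come from get_available_checkpoints(), which only recognises checkpoint files whose names contain `step=step=`; a small illustration of that parsing (the path is hypothetical):

```python
import os

# Hypothetical checkpoint path in the format the parser above expects,
# i.e. the older ModelCheckpoint(filename="model-step={step}") naming.
ckpt = "checkpoints/model-step=step=1500.ckpt"
filename = os.path.basename(ckpt)
if "step=step=" in filename:
    step = int(filename.split("step=step=")[1].split(".")[0])
    print(f"Checkpoint Step {step}")  # -> Checkpoint Step 1500
```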
model.py CHANGED
@@ -1,203 +1,228 @@
-# import libraries
+import os
+import torch
+import yaml
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaConfig
-from transformers import Trainer
-import pytorch_lightning as pl
-import yaml
-from pytorch_lightning.callbacks import LearningRateMonitor
-from pytorch_lightning.callbacks import RichProgressBar
+from torch.utils.data import DataLoader, IterableDataset
+from pytorch_lightning import Trainer, LightningModule
+from pytorch_lightning.callbacks import (
+    ModelCheckpoint,
+    LearningRateMonitor,
+    RichProgressBar,
+)
 from pytorch_lightning.loggers import TensorBoardLogger
-import torch
-from torch.utils.data import DataLoader
-
-# load dataset
-dataset = load_dataset("HuggingFaceTB/smollm-corpus", "cosmopedia-v2", streaming=True)
-train_dataset = dataset["train"]
-for sample in train_dataset:
-    print(sample)
-    break
-# load tokenizer
-# use tokeniser from https://huggingface.co/HuggingFaceTB/cosmo2-tokenizer
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
-# Set padding token to be the same as EOS token
-tokenizer.pad_token = tokenizer.eos_token
-
-# load config
-# use config from https://huggingface.co/HuggingFaceTB/SmolLM2-135M/blob/main/config_smollm2_135M.yaml
-# config = AutoConfig.from_pretrained("HuggingFaceTB/SmolLM2-135M")
-
-
-def collate_fn(examples):
-    # Tokenize the texts
-    encoding = tokenizer(
-        [example["text"] for example in examples],
-        padding=True,
-        truncation=True,
-        max_length=512,
-        return_tensors="pt",
-    )
-
-    # Create labels (same as input_ids for causal language modeling)
-    encoding["labels"] = encoding["input_ids"].clone()
-
-    return encoding
-
-
-def create_model_config(config):
-    model_config = config["model"]["model_config"]
-    return LlamaConfig(
-        vocab_size=49152,  # From the model architecture
-        hidden_size=model_config["hidden_size"],
-        intermediate_size=model_config["intermediate_size"],
-        num_hidden_layers=model_config["num_hidden_layers"],
-        num_attention_heads=model_config["num_attention_heads"],
-        num_key_value_heads=model_config["num_key_value_heads"],
-        hidden_act=model_config["hidden_act"],
-        max_position_embeddings=model_config["max_position_embeddings"],
-        initializer_range=model_config["initializer_range"],
-        rms_norm_eps=1e-5,  # From the model architecture
-        use_cache=True,
-        pad_token_id=model_config["pad_token_id"],
-        bos_token_id=model_config["bos_token_id"],
-        eos_token_id=model_config["eos_token_id"],
-    )
+from torch.nn.utils.rnn import pad_sequence
+from lightning.pytorch.callbacks.progress.rich_progress import RichProgressBarTheme
+
+# Set environment variable for memory management
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
+
+# Function to log GPU memory usage
+def log_memory_usage(step):
+    if torch.cuda.is_available():
+        print(
+            f"Step {step}: "
+            f"Allocated = {torch.cuda.memory_allocated() / 1e9:.2f} GB, "
+            f"Reserved = {torch.cuda.memory_reserved() / 1e9:.2f} GB"
+        )
+
+
+# Custom Collate Function
+def collate_fn(batch):
+    input_ids = [item["input_ids"] for item in batch]
+    labels = [item["labels"] for item in batch]
+    input_ids = pad_sequence(
+        input_ids, batch_first=True, padding_value=tokenizer.pad_token_id
+    )
+    labels = pad_sequence(
+        labels, batch_first=True, padding_value=tokenizer.pad_token_id
+    )
+    return {"input_ids": input_ids, "labels": labels}
+
+
+# Streaming Dataset
+class StreamingDataset(IterableDataset):
+    def __init__(self, dataset, tokenizer, max_length=2048):
+        self.dataset = dataset
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+
+    def __iter__(self):
+        for example in iter(self.dataset):
+            tokenized = self.tokenizer(
+                example["text"],
+                truncation=True,
+                max_length=self.max_length,
+                return_overflowing_tokens=True,
+                return_tensors="pt",
+            )
+            for chunk in tokenized["input_ids"]:
+                yield {
+                    "input_ids": chunk.squeeze(0),
+                    "labels": chunk.squeeze(0),
+                }
 
 
-# create model
-class SmolLMModule(pl.LightningModule):
+# Lightning Module
+class SmolLMModule(LightningModule):
     def __init__(self, config, learning_rate=1e-4):
         super().__init__()
         self.config = config
         self.learning_rate = learning_rate
-        self.save_hyperparameters()  # Save hyperparameters for resuming
-
-        # Create model from config
-        model_config = create_model_config(config)
+        self.save_hyperparameters()
+
+        model_config = LlamaConfig(
+            vocab_size=49152,
+            hidden_size=config["model"]["model_config"]["hidden_size"],
+            intermediate_size=config["model"]["model_config"]["intermediate_size"],
+            num_hidden_layers=config["model"]["model_config"]["num_hidden_layers"],
+            num_attention_heads=config["model"]["model_config"]["num_attention_heads"],
+            num_key_value_heads=config["model"]["model_config"]["num_key_value_heads"],
+            hidden_act=config["model"]["model_config"]["hidden_act"],
+            max_position_embeddings=config["model"]["model_config"][
+                "max_position_embeddings"
+            ],
+            initializer_range=config["model"]["model_config"]["initializer_range"],
+            rms_norm_eps=1e-5,
+            use_cache=True,
+            pad_token_id=config["model"]["model_config"]["pad_token_id"],
+            bos_token_id=config["model"]["model_config"]["bos_token_id"],
+            eos_token_id=config["model"]["model_config"]["eos_token_id"],
+        )
         self.model = AutoModelForCausalLM.from_config(model_config)
 
-    def forward(self, **inputs):
-        return self.model(**inputs)
-
     def training_step(self, batch, batch_idx):
-        outputs = self.model(**batch)
+        outputs = self.model(input_ids=batch["input_ids"], labels=batch["labels"])
         loss = outputs.loss
-        self.log("train_loss", loss, prog_bar=True)
+        self.log(
+            "train_loss", loss, prog_bar=True, on_step=True, on_epoch=True
+        )  # Log loss
+
+        # Log memory usage
+        if batch_idx % 10 == 0:
+            log_memory_usage(batch_idx)
+
+        # Release intermediate tensors
+        del outputs
+        torch.cuda.empty_cache()
+
         return loss
 
     def configure_optimizers(self):
-        optimizer = torch.optim.AdamW(
+        return torch.optim.AdamW(
            self.model.parameters(),
            lr=self.learning_rate,
            betas=(0.9, 0.95),
            eps=1e-8,
-            weight_decay=0.1,
+            weight_decay=self.config["optimizer"]["weight_decay"],
        )
-        return optimizer
-
-    def on_save_checkpoint(self, checkpoint):
-        # Save additional info if needed
-        checkpoint["step"] = self.global_step
-        checkpoint["model_config"] = self.config
-
-    def on_load_checkpoint(self, checkpoint):
-        # Restore additional info if needed
-        self.global_step = checkpoint["step"]
-        self.config = checkpoint["model_config"]
-
 
-# train model
 
-# save model
-
-# training script
+# Main Script
 if __name__ == "__main__":
-    import os
-    from pytorch_lightning.callbacks import ModelCheckpoint
-
-    # parameters load from config file
-    with open("config_smollm2_135.yaml", "r") as file:
+    # Load config
+    with open("/kaggle/input/yaml-file/config_smollm2_135.yaml", "r") as file:
        config = yaml.safe_load(file)
-    max_steps = 5000  # Total training steps
-
-    # Create checkpoint directory if it doesn't exist
-    checkpoint_dir = "checkpoints"
-    os.makedirs(checkpoint_dir, exist_ok=True)
-
-    # Checkpoint callback
-    checkpoint_callback = ModelCheckpoint(
-        dirpath=checkpoint_dir,
-        filename="model-step={step}",
-        save_top_k=-1,  # Save all checkpoints
-        every_n_train_steps=500,  # Save every 500 steps
-        save_weights_only=False,  # Save the full model state
-    )
 
-    # load tokenizer
+    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
-    # Set padding token to be the same as EOS token
    tokenizer.pad_token = tokenizer.eos_token
 
-    # load dataset
+    # Load dataset
    dataset = load_dataset(
        "HuggingFaceTB/smollm-corpus", "cosmopedia-v2", streaming=True
    )
    train_dataset = dataset["train"]
 
    # Create DataLoader
+    streaming_dataset = StreamingDataset(train_dataset, tokenizer, max_length=2048)
    train_loader = DataLoader(
-        train_dataset,
-        batch_size=4,  # Small batch size for testing
+        streaming_dataset,
+        batch_size=1,  # Reduced batch size
+        num_workers=4,
        collate_fn=collate_fn,
-        num_workers=2,
+        pin_memory=True,
    )
 
-    # create model
-    model = SmolLMModule(config, learning_rate=1e-4)
+    # Create model
+    model = SmolLMModule(
+        config,
+        learning_rate=config["optimizer"]["learning_rate_scheduler"]["learning_rate"],
+    )
 
-    # progress bar
-    progress_bar = RichProgressBar(leave=False, refresh_rate=1, console_kwargs=None)
+    # Initialize logger with version based on start_step
+    logger = TensorBoardLogger("logs", name="smollm2")
 
-    # Find latest checkpoint if exists
-    latest_checkpoint = None
-    if os.path.exists(checkpoint_dir):
-        checkpoints = [f for f in os.listdir(checkpoint_dir) if f.endswith(".ckpt")]
-        if checkpoints:
-            # Sort by step number and get the latest
-            latest_checkpoint = os.path.join(
-                checkpoint_dir,
-                sorted(checkpoints, key=lambda x: int(x.split("-")[1].split(".")[0]))[
-                    -1
-                ],
-            )
-            print(f"Resuming from checkpoint: {latest_checkpoint}")
+    # Checkpoint callback configuration
+    checkpoint_callback = ModelCheckpoint(
+        dirpath="checkpoints",
+        filename="model-{epoch:02d}-{step}-{train_loss:.2f}",  # Include training loss in filename
+        monitor="train_loss",  # Monitor training loss
+        mode="min",  # Lower loss is better
+        save_top_k=3,  # Save the best 3 models
+        save_last=True,  # Additionally save the last model
+        every_n_train_steps=500,  # Save every 500 steps
+        save_weights_only=False,  # Save the full model state
+        auto_insert_metric_name=False,  # Don't insert metric name in filename
+    )
 
-    # create trainer
-    trainer = pl.Trainer(
-        max_steps=max_steps,
+    # Progress bar
+    progress_bar = RichProgressBar(
+        refresh_rate=1,
+        leave=False,
+        theme=RichProgressBarTheme(
+            description="",
+            progress_bar="#6206E0",
+            progress_bar_finished="#6206E0",
+            progress_bar_pulse="#6206E0",
+            batch_progress="",
+            time="dim",
+            processing_speed="dim underline",
+            metrics="italic",
+            metrics_text_delimiter=" ",
+            metrics_format=".3f",
+        ),
+        console_kwargs=None,
+    )
+
+    # Create trainer
+    trainer = Trainer(
+        logger=logger,
+        strategy="ddp",
        accelerator="gpu",
-        devices=1,
-        precision="bf16-mixed",
+        devices=2,
+        precision="16-mixed",
+        max_steps=5000,
+        accumulate_grad_batches=1,
        callbacks=[
            LearningRateMonitor(logging_interval="step"),
            progress_bar,
            checkpoint_callback,
        ],
-        log_every_n_steps=1,
        enable_progress_bar=True,
        enable_model_summary=True,
+        log_every_n_steps=10,
    )
 
-    # train model
-    if latest_checkpoint:
-        # Resume training from checkpoint if it exists
-        trainer.fit(model, train_loader, ckpt_path=latest_checkpoint)
+    # Find latest checkpoint if exists
+    if os.path.exists("checkpoints/last.ckpt"):
+        resume_from_checkpoint = "checkpoints/last.ckpt"
+        print(f"Resuming from checkpoint: {resume_from_checkpoint}")
    else:
-        # Start training from scratch
-        trainer.fit(model, train_loader)
+        resume_from_checkpoint = None
+        print("Starting training from scratch")
+
+    # Train with automatic checkpoint resumption
+    trainer.fit(model, train_loader, ckpt_path=resume_from_checkpoint)
+
+    # After training, print the best model path and score
+    print(f"Best model path: {checkpoint_callback.best_model_path}")
+    print(f"Best train loss: {checkpoint_callback.best_model_score:.4f}")
 
-    # Save final model and tokenizer
-    if trainer.is_global_zero:  # Only save on main process
+    # Save final model
+    if trainer.is_global_zero:
        output_dir = "final_model"
        os.makedirs(output_dir, exist_ok=True)
-        model.model.save_pretrained(os.path.join(output_dir, "model"))
-        tokenizer.save_pretrained(os.path.join(output_dir, "tokenizer"))
+        model.model.save_pretrained(output_dir)
+        tokenizer.save_pretrained(output_dir)
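The training script above reads several nested keys from the YAML config; a sketch of the minimal structure it appears to expect (key names inferred from the accesses in model.py; the model values mirror final_model/config.json from this commit, while the optimizer values are placeholders, not taken from the actual SmolLM2 YAML):

```python
# Hypothetical minimal config dict matching the lookups in model.py;
# real values should come from config_smollm2_135.yaml.
config = {
    "model": {
        "model_config": {
            "hidden_size": 576,
            "intermediate_size": 1536,
            "num_hidden_layers": 30,
            "num_attention_heads": 9,
            "num_key_value_heads": 3,
            "hidden_act": "silu",
            "max_position_embeddings": 2048,
            "initializer_range": 0.041666666666666664,
            "pad_token_id": None,  # placeholder; set to the tokenizer's pad/eos id
            "bos_token_id": 0,
            "eos_token_id": 0,
        }
    },
    "optimizer": {
        "weight_decay": 0.01,  # placeholder value
        "learning_rate_scheduler": {"learning_rate": 3e-4},  # placeholder value
    },
}
```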