gbcfchc committed on
Commit d4d8d34 · verified · 1 Parent(s): e92eca6

Training in progress, step 20

Files changed (4)
  1. README.md +4 -5
  2. config.json +59 -42
  3. model.safetensors +3 -0
  4. training_args.bin +2 -2
README.md CHANGED
@@ -1,17 +1,16 @@
 ---
-base_model: open-r1/Qwen2.5-Math-7B-RoPE-300k
 library_name: transformers
 model_name: OpenR1-Distill-7B
 tags:
 - generated_from_trainer
-- sft
 - trl
+- sft
 licence: license
 ---
 
 # Model Card for OpenR1-Distill-7B
 
-This model is a fine-tuned version of [open-r1/Qwen2.5-Math-7B-RoPE-300k](https://huggingface.co/open-r1/Qwen2.5-Math-7B-RoPE-300k).
+This model is a fine-tuned version of [None](https://huggingface.co/None).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 
 ## Quick start
@@ -27,7 +26,7 @@ print(output["generated_text"])
 
 ## Training procedure
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/eLLM-han2024/huggingface/runs/dmjnfl4m)
+
 
 
 This model was trained with SFT.
@@ -35,7 +34,7 @@ This model was trained with SFT.
 ### Framework versions
 
 - TRL: 0.21.0
-- Transformers: 4.55.2
+- Transformers: 4.52.0
 - Pytorch: 2.6.0
 - Datasets: 4.0.0
 - Tokenizers: 0.21.4
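
For reference, the card's "Quick start" section (its last line, `print(output["generated_text"])`, is visible in the hunk header above) follows TRL's stock model-card template. A minimal sketch of that snippet is below; the repository id is a placeholder assumption (the updated card itself links to [None]), and the prompt and generation settings are illustrative.

```python
# Sketch of a TRL-style "Quick start"; the repo id is a placeholder
# assumption, not a value taken from this commit.
from transformers import pipeline

question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
generator = pipeline("text-generation", model="your-username/OpenR1-Distill-7B", device="cuda")
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
print(output["generated_text"])
```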
config.json CHANGED
@@ -1,58 +1,75 @@
 {
   "architectures": [
-    "Qwen2ForCausalLM"
+    "JetNemotronForCausalLM"
   ],
   "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_jet_nemotron.JetNemotronConfig",
+    "AutoModelForCausalLM": "modeling_jet_nemotron.JetNemotronForCausalLM"
+  },
   "bos_token_id": 151643,
+  "efficient_attention_config": {
+    "jet": {
+      "conv_size": 4,
+      "dconv_generator_reduction": 8,
+      "dconv_implementation": "triton",
+      "expand_v": 2.6666666667,
+      "head_dim": 96,
+      "mode": "chunk",
+      "norm_eps": "1e-5",
+      "num_heads": 12
+    },
+    "swa": {
+      "window_size": 1152
+    }
+  },
   "eos_token_id": 151643,
   "hidden_act": "silu",
-  "hidden_size": 3584,
+  "hidden_size": 1536,
   "initializer_range": 0.02,
-  "intermediate_size": 18944,
+  "intermediate_size": 8960,
   "layer_types": [
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention"
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "attn",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "attn",
+    "swa",
+    "swa",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet"
   ],
-  "max_position_embeddings": 32768,
+  "max_position_embeddings": 131072,
   "max_window_layers": 28,
-  "model_type": "qwen2",
-  "num_attention_heads": 28,
+  "model_type": "jet_nemotron",
+  "num_attention_heads": 12,
   "num_hidden_layers": 28,
-  "num_key_value_heads": 4,
+  "num_key_value_heads": 2,
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
-  "rope_theta": 300000.0,
-  "sliding_window": null,
-  "tie_word_embeddings": false,
-  "torch_dtype": "float32",
-  "transformers_version": "4.55.2",
+  "rope_theta": 1000000.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.0",
   "use_cache": false,
-  "use_sliding_window": false,
-  "vocab_size": 152064
+  "use_mrope": false,
+  "vocab_size": 151936
 }
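
The rewritten config replaces the dense Qwen2 stack with a hybrid JetNemotron layout and registers custom modeling code through "auto_map", so the checkpoint has to be loaded with trust_remote_code=True. A minimal loading sketch, assuming a placeholder repo id:

```python
# Minimal sketch for loading a checkpoint whose config.json wires custom
# classes via "auto_map". The repo id is a placeholder assumption.
from collections import Counter
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "your-username/OpenR1-Distill-7B"  # hypothetical

# trust_remote_code=True lets transformers import configuration_jet_nemotron.py
# and modeling_jet_nemotron.py from the repo, as declared in "auto_map".
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

# Per the diff, the 28-layer hybrid mixes linear-attention ("jet"),
# full-attention ("attn"), and sliding-window ("swa") blocks.
print(Counter(config.layer_types))  # Counter({'jet': 24, 'attn': 2, 'swa': 2})
```

Only 2 of the 28 layers keep full attention and 2 use the 1152-token sliding window; the remaining 24 are linear "jet" blocks, consistent with the diff raising max_position_embeddings from 32768 to 131072.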
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94e573aed430d725e105c791328a08c61f30d189ded9761211d51a131d1ee91c
+size 3929669400
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fdac5900e93d7a750665b32f5877bded32b97ac5f0cfd0860f098cca48e6f393
-size 10936
+oid sha256:5d3d07c22082f5388113b72c838e7a28b001e3b79515139d54cf05dd7df22fea
+size 10872
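
Both binary artifacts are stored as Git LFS pointer files: `oid` is the SHA-256 digest of the real blob and `size` is its length in bytes. A small verification sketch (not part of the commit; the local path is illustrative):

```python
# Check a downloaded blob against its Git LFS pointer: the oid must equal
# the blob's SHA-256 and the size must equal its byte length.
import hashlib

def verify_lfs_pointer(blob_path: str, expected_oid: str, expected_size: int) -> bool:
    digest = hashlib.sha256()
    size = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest() == expected_oid and size == expected_size

# Values from the new model.safetensors pointer above; the path is illustrative.
print(verify_lfs_pointer(
    "model.safetensors",
    "94e573aed430d725e105c791328a08c61f30d189ded9761211d51a131d1ee91c",
    3929669400,
))
```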