gbcfchc committed on
Commit d4d8d34 · verified · 1 Parent(s): e92eca6

Training in progress, step 20

Files changed (4)
  1. README.md +4 -5
  2. config.json +59 -42
  3. model.safetensors +3 -0
  4. training_args.bin +2 -2
README.md CHANGED
@@ -1,17 +1,16 @@
 ---
-base_model: open-r1/Qwen2.5-Math-7B-RoPE-300k
 library_name: transformers
 model_name: OpenR1-Distill-7B
 tags:
 - generated_from_trainer
-- sft
 - trl
+- sft
 licence: license
 ---
 
 # Model Card for OpenR1-Distill-7B
 
-This model is a fine-tuned version of [open-r1/Qwen2.5-Math-7B-RoPE-300k](https://huggingface.co/open-r1/Qwen2.5-Math-7B-RoPE-300k).
+This model is a fine-tuned version of [None](https://huggingface.co/None).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 
 ## Quick start
@@ -27,7 +26,7 @@ print(output["generated_text"])
 
 ## Training procedure
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/eLLM-han2024/huggingface/runs/dmjnfl4m)
+
 
 
 This model was trained with SFT.
@@ -35,7 +34,7 @@ This model was trained with SFT.
 ### Framework versions
 
 - TRL: 0.21.0
-- Transformers: 4.55.2
+- Transformers: 4.52.0
 - Pytorch: 2.6.0
 - Datasets: 4.0.0
 - Tokenizers: 0.21.4
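
For reference, the card's "Quick start" section (its last line, `print(output["generated_text"])`, is visible in the hunk header above) follows TRL's stock model-card template. A minimal sketch of that snippet is below; the repository id is a placeholder assumption (the updated card itself links to [None]), and the prompt and generation settings are illustrative.

```python
# Sketch of a TRL-style "Quick start"; the repo id is a placeholder
# assumption, not a value taken from this commit.
from transformers import pipeline

question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
generator = pipeline("text-generation", model="your-username/OpenR1-Distill-7B", device="cuda")
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
print(output["generated_text"])
```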
config.json CHANGED
@@ -1,58 +1,75 @@
 {
   "architectures": [
-    "Qwen2ForCausalLM"
+    "JetNemotronForCausalLM"
   ],
   "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_jet_nemotron.JetNemotronConfig",
+    "AutoModelForCausalLM": "modeling_jet_nemotron.JetNemotronForCausalLM"
+  },
   "bos_token_id": 151643,
+  "efficient_attention_config": {
+    "jet": {
+      "conv_size": 4,
+      "dconv_generator_reduction": 8,
+      "dconv_implementation": "triton",
+      "expand_v": 2.6666666667,
+      "head_dim": 96,
+      "mode": "chunk",
+      "norm_eps": "1e-5",
+      "num_heads": 12
+    },
+    "swa": {
+      "window_size": 1152
+    }
+  },
   "eos_token_id": 151643,
   "hidden_act": "silu",
-  "hidden_size": 3584,
+  "hidden_size": 1536,
   "initializer_range": 0.02,
-  "intermediate_size": 18944,
+  "intermediate_size": 8960,
   "layer_types": [
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention"
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "attn",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "attn",
+    "swa",
+    "swa",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet",
+    "jet"
   ],
-  "max_position_embeddings": 32768,
+  "max_position_embeddings": 131072,
   "max_window_layers": 28,
-  "model_type": "qwen2",
-  "num_attention_heads": 28,
+  "model_type": "jet_nemotron",
+  "num_attention_heads": 12,
   "num_hidden_layers": 28,
-  "num_key_value_heads": 4,
+  "num_key_value_heads": 2,
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
-  "rope_theta": 300000.0,
-  "sliding_window": null,
-  "tie_word_embeddings": false,
-  "torch_dtype": "float32",
-  "transformers_version": "4.55.2",
+  "rope_theta": 1000000.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.0",
   "use_cache": false,
-  "use_sliding_window": false,
-  "vocab_size": 152064
+  "use_mrope": false,
+  "vocab_size": 151936
 }
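
The rewritten config replaces the dense Qwen2 stack with a hybrid JetNemotron layout and registers custom modeling code through "auto_map", so the checkpoint has to be loaded with trust_remote_code=True. A minimal loading sketch, assuming a placeholder repo id:

```python
# Minimal sketch for loading a checkpoint whose config.json wires custom
# classes via "auto_map". The repo id is a placeholder assumption.
from collections import Counter
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "your-username/OpenR1-Distill-7B"  # hypothetical

# trust_remote_code=True lets transformers import configuration_jet_nemotron.py
# and modeling_jet_nemotron.py from the repo, as declared in "auto_map".
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

# Per the diff, the 28-layer hybrid mixes linear-attention ("jet"),
# full-attention ("attn"), and sliding-window ("swa") blocks.
print(Counter(config.layer_types))  # Counter({'jet': 24, 'attn': 2, 'swa': 2})
```

Only 2 of the 28 layers keep full attention and 2 use the 1152-token sliding window; the remaining 24 are linear "jet" blocks, consistent with the diff raising max_position_embeddings from 32768 to 131072.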
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94e573aed430d725e105c791328a08c61f30d189ded9761211d51a131d1ee91c
+size 3929669400
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fdac5900e93d7a750665b32f5877bded32b97ac5f0cfd0860f098cca48e6f393
-size 10936
+oid sha256:5d3d07c22082f5388113b72c838e7a28b001e3b79515139d54cf05dd7df22fea
+size 10872
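
Both binary artifacts are stored as Git LFS pointer files: `oid` is the SHA-256 digest of the real blob and `size` is its length in bytes. A small verification sketch (not part of the commit; the local path is illustrative):

```python
# Check a downloaded blob against its Git LFS pointer: the oid must equal
# the blob's SHA-256 and the size must equal its byte length.
import hashlib

def verify_lfs_pointer(blob_path: str, expected_oid: str, expected_size: int) -> bool:
    digest = hashlib.sha256()
    size = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest() == expected_oid and size == expected_size

# Values from the new model.safetensors pointer above; the path is illustrative.
print(verify_lfs_pointer(
    "model.safetensors",
    "94e573aed430d725e105c791328a08c61f30d189ded9761211d51a131d1ee91c",
    3929669400,
))
```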