kavyamanohar committed on
Commit
22d3680
·
verified ·
1 Parent(s): 109cd44

Upload fine-tuned Whisper model

Browse files
README.md ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: openai/whisper-medium
5
+ tags:
6
+ - generated_from_trainer
7
+ metrics:
8
+ - wer
9
+ model-index:
10
+ - name: whisper-medium-ml-exp2
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # whisper-medium-ml-exp2
18
+
19
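+ This model is a fine-tuned version of [openai/whisper-medium](https://huggingface.co/openai/whisper-medium) on an unknown dataset. Its generation config is set up for Malayalam transcription (`language: malayalam`, `task: transcribe`).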
20
+ It achieves the following results on the evaluation set at the final training step (15000):
21
+ - Loss: 0.2220
22
+ - Wer: 57.6922
23
+
24
+ ## Model description
25
+
26
+ More information needed
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
+ More information needed
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 1e-05
42
+ - train_batch_size: 16
43
+ - eval_batch_size: 32
44
+ - seed: 42
45
+ - gradient_accumulation_steps: 2
46
+ - total_train_batch_size: 32
47
+ - optimizer: adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
48
+ - lr_scheduler_type: linear
49
+ - lr_scheduler_warmup_steps: 500
50
+ - training_steps: 15000
51
+ - mixed_precision_training: Native AMP
52
+
53
+ ### Training results
54
+
55
+ | Training Loss | Epoch | Step | Validation Loss | Wer |
56
+ |:-------------:|:------:|:-----:|:---------------:|:-------:|
57
+ | 0.3119 | 0.0333 | 500 | 0.4193 | 79.0917 |
58
+ | 0.0405 | 0.0667 | 1000 | 0.4335 | 73.5830 |
59
+ | 0.0355 | 0.1 | 1500 | 0.4332 | 77.2785 |
60
+ | 0.126 | 0.1333 | 2000 | 0.1967 | 58.4021 |
61
+ | 0.0519 | 0.1667 | 2500 | 0.1861 | 58.5671 |
62
+ | 0.0439 | 0.2 | 3000 | 0.1942 | 57.4274 |
63
+ | 0.0534 | 1.0112 | 3500 | 0.1936 | 61.1497 |
64
+ | 0.0214 | 1.0445 | 4000 | 0.2253 | 59.7816 |
65
+ | 0.0129 | 1.0779 | 4500 | 0.2630 | 61.0614 |
66
+ | 0.048 | 1.1112 | 5000 | 0.1780 | 56.3606 |
67
+ | 0.047 | 1.1445 | 5500 | 0.1638 | 52.9951 |
68
+ | 0.0325 | 1.1779 | 6000 | 0.1683 | 54.5512 |
69
+ | 0.0293 | 1.2112 | 6500 | 0.1689 | 57.2451 |
70
+ | 0.028 | 2.0224 | 7000 | 0.2145 | 56.5237 |
71
+ | 0.009 | 2.0557 | 7500 | 0.2227 | 56.3068 |
72
+ | 0.0076 | 2.0891 | 8000 | 0.2750 | 66.0540 |
73
+ | 0.0385 | 2.1224 | 8500 | 0.2178 | 54.4514 |
74
+ | 0.0245 | 2.1557 | 9000 | 0.1721 | 52.0031 |
75
+ | 0.0226 | 2.1891 | 9500 | 0.1741 | 53.7511 |
76
+ | 0.0212 | 3.0003 | 10000 | 0.2001 | 56.1495 |
77
+ | 0.0121 | 3.0336 | 10500 | 0.2322 | 55.4722 |
78
+ | 0.0042 | 3.0669 | 11000 | 0.2403 | 57.6864 |
79
+ | 0.0059 | 3.1003 | 11500 | 0.2953 | 64.0067 |
80
+ | 0.0248 | 3.1336 | 12000 | 0.1744 | 51.3412 |
81
+ | 0.0172 | 3.1669 | 12500 | 0.1872 | 53.5324 |
82
+ | 0.015 | 3.2003 | 13000 | 0.1930 | 54.7028 |
83
+ | 0.0158 | 4.0115 | 13500 | 0.2173 | 60.9636 |
84
+ | 0.0028 | 4.0448 | 14000 | 0.2330 | 53.4921 |
85
+ | 0.0028 | 4.0781 | 14500 | 0.2415 | 53.4767 |
86
+ | 0.0194 | 4.1115 | 15000 | 0.2220 | 57.6922 |
87
+
88
+
89
+ ### Framework versions
90
+
91
+ - Transformers 4.51.1
92
+ - Pytorch 2.6.0+cu124
93
+ - Datasets 3.5.0
94
+ - Tokenizers 0.21.1
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.111466666666667,
3
+ "total_flos": 4.898454489936691e+20,
4
+ "train_loss": 0.03751766018072764,
5
+ "train_runtime": 129332.0882,
6
+ "train_samples_per_second": 3.711,
7
+ "train_steps_per_second": 0.116
8
+ }
generation_config.json ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alignment_heads": [
3
+ [
4
+ 13,
5
+ 15
6
+ ],
7
+ [
8
+ 15,
9
+ 4
10
+ ],
11
+ [
12
+ 15,
13
+ 15
14
+ ],
15
+ [
16
+ 16,
17
+ 1
18
+ ],
19
+ [
20
+ 20,
21
+ 0
22
+ ],
23
+ [
24
+ 23,
25
+ 4
26
+ ]
27
+ ],
28
+ "begin_suppress_tokens": [
29
+ 220,
30
+ 50257
31
+ ],
32
+ "bos_token_id": 50257,
33
+ "decoder_start_token_id": 50258,
34
+ "eos_token_id": 50257,
35
+ "is_multilingual": true,
36
+ "lang_to_id": {
37
+ "<|af|>": 50327,
38
+ "<|am|>": 50334,
39
+ "<|ar|>": 50272,
40
+ "<|as|>": 50350,
41
+ "<|az|>": 50304,
42
+ "<|ba|>": 50355,
43
+ "<|be|>": 50330,
44
+ "<|bg|>": 50292,
45
+ "<|bn|>": 50302,
46
+ "<|bo|>": 50347,
47
+ "<|br|>": 50309,
48
+ "<|bs|>": 50315,
49
+ "<|ca|>": 50270,
50
+ "<|cs|>": 50283,
51
+ "<|cy|>": 50297,
52
+ "<|da|>": 50285,
53
+ "<|de|>": 50261,
54
+ "<|el|>": 50281,
55
+ "<|en|>": 50259,
56
+ "<|es|>": 50262,
57
+ "<|et|>": 50307,
58
+ "<|eu|>": 50310,
59
+ "<|fa|>": 50300,
60
+ "<|fi|>": 50277,
61
+ "<|fo|>": 50338,
62
+ "<|fr|>": 50265,
63
+ "<|gl|>": 50319,
64
+ "<|gu|>": 50333,
65
+ "<|haw|>": 50352,
66
+ "<|ha|>": 50354,
67
+ "<|he|>": 50279,
68
+ "<|hi|>": 50276,
69
+ "<|hr|>": 50291,
70
+ "<|ht|>": 50339,
71
+ "<|hu|>": 50286,
72
+ "<|hy|>": 50312,
73
+ "<|id|>": 50275,
74
+ "<|is|>": 50311,
75
+ "<|it|>": 50274,
76
+ "<|ja|>": 50266,
77
+ "<|jw|>": 50356,
78
+ "<|ka|>": 50329,
79
+ "<|kk|>": 50316,
80
+ "<|km|>": 50323,
81
+ "<|kn|>": 50306,
82
+ "<|ko|>": 50264,
83
+ "<|la|>": 50294,
84
+ "<|lb|>": 50345,
85
+ "<|ln|>": 50353,
86
+ "<|lo|>": 50336,
87
+ "<|lt|>": 50293,
88
+ "<|lv|>": 50301,
89
+ "<|mg|>": 50349,
90
+ "<|mi|>": 50295,
91
+ "<|mk|>": 50308,
92
+ "<|ml|>": 50296,
93
+ "<|mn|>": 50314,
94
+ "<|mr|>": 50320,
95
+ "<|ms|>": 50282,
96
+ "<|mt|>": 50343,
97
+ "<|my|>": 50346,
98
+ "<|ne|>": 50313,
99
+ "<|nl|>": 50271,
100
+ "<|nn|>": 50342,
101
+ "<|no|>": 50288,
102
+ "<|oc|>": 50328,
103
+ "<|pa|>": 50321,
104
+ "<|pl|>": 50269,
105
+ "<|ps|>": 50340,
106
+ "<|pt|>": 50267,
107
+ "<|ro|>": 50284,
108
+ "<|ru|>": 50263,
109
+ "<|sa|>": 50344,
110
+ "<|sd|>": 50332,
111
+ "<|si|>": 50322,
112
+ "<|sk|>": 50298,
113
+ "<|sl|>": 50305,
114
+ "<|sn|>": 50324,
115
+ "<|so|>": 50326,
116
+ "<|sq|>": 50317,
117
+ "<|sr|>": 50303,
118
+ "<|su|>": 50357,
119
+ "<|sv|>": 50273,
120
+ "<|sw|>": 50318,
121
+ "<|ta|>": 50287,
122
+ "<|te|>": 50299,
123
+ "<|tg|>": 50331,
124
+ "<|th|>": 50289,
125
+ "<|tk|>": 50341,
126
+ "<|tl|>": 50348,
127
+ "<|tr|>": 50268,
128
+ "<|tt|>": 50351,
129
+ "<|uk|>": 50280,
130
+ "<|ur|>": 50290,
131
+ "<|uz|>": 50337,
132
+ "<|vi|>": 50278,
133
+ "<|yi|>": 50335,
134
+ "<|yo|>": 50325,
135
+ "<|zh|>": 50260
136
+ },
137
+ "language": "malayalam",
138
+ "max_initial_timestamp_index": 50,
139
+ "max_length": 448,
140
+ "no_timestamps_token_id": 50363,
141
+ "pad_token_id": 50257,
142
+ "prev_sot_token_id": 50361,
143
+ "return_timestamps": false,
144
+ "suppress_tokens": [],
145
+ "task": "transcribe",
146
+ "task_to_id": {
147
+ "transcribe": 50359,
148
+ "translate": 50358
149
+ },
150
+ "transformers_version": "4.51.1"
151
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1007632b1ea26e1c57c2dc85977986fec52a3f7c39e63fda252d9702ac0716b2
3
  size 3055544304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78712a3faaee1db1b1f5ebd73115e2b0473adcbd34eb3372c7892d73efc4bc84
3
  size 3055544304
runs/Apr25_07-27-04_whisper-a100/events.out.tfevents.1745566026.whisper-a100.411822.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01dd7e393e8d07260f32c0c970294f8661acbe26b09805b90a598f5731b2624f
3
- size 20641
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75a68c50eed76c0621998639ec362dc4bcb375915b03872c47d718cb14228dd4
3
+ size 22053
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.111466666666667,
3
+ "total_flos": 4.898454489936691e+20,
4
+ "train_loss": 0.03751766018072764,
5
+ "train_runtime": 129332.0882,
6
+ "train_samples_per_second": 3.711,
7
+ "train_steps_per_second": 0.116
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 12000,
3
+ "best_metric": 51.341187305729306,
4
+ "best_model_checkpoint": "./whisper-medium-ml-exp2/checkpoint-12000",
5
+ "epoch": 4.111466666666667,
6
+ "eval_steps": 500,
7
+ "global_step": 15000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.03333333333333333,
14
+ "grad_norm": 1.354724407196045,
15
+ "learning_rate": 9.940000000000001e-06,
16
+ "loss": 0.3119,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.03333333333333333,
21
+ "eval_loss": 0.4192824959754944,
22
+ "eval_runtime": 3090.4729,
23
+ "eval_samples_per_second": 1.902,
24
+ "eval_steps_per_second": 0.06,
25
+ "eval_wer": 79.09167658006831,
26
+ "step": 500
27
+ },
28
+ {
29
+ "epoch": 0.06666666666666667,
30
+ "grad_norm": 0.926690399646759,
31
+ "learning_rate": 9.657241379310346e-06,
32
+ "loss": 0.0405,
33
+ "step": 1000
34
+ },
35
+ {
36
+ "epoch": 0.06666666666666667,
37
+ "eval_loss": 0.4335130751132965,
38
+ "eval_runtime": 2992.1696,
39
+ "eval_samples_per_second": 1.964,
40
+ "eval_steps_per_second": 0.061,
41
+ "eval_wer": 73.58302313979816,
42
+ "step": 1000
43
+ },
44
+ {
45
+ "epoch": 0.1,
46
+ "grad_norm": 0.8192572593688965,
47
+ "learning_rate": 9.312413793103448e-06,
48
+ "loss": 0.0355,
49
+ "step": 1500
50
+ },
51
+ {
52
+ "epoch": 0.1,
53
+ "eval_loss": 0.43317151069641113,
54
+ "eval_runtime": 3082.3837,
55
+ "eval_samples_per_second": 1.907,
56
+ "eval_steps_per_second": 0.06,
57
+ "eval_wer": 77.27848344142139,
58
+ "step": 1500
59
+ },
60
+ {
61
+ "epoch": 0.13333333333333333,
62
+ "grad_norm": 1.0174602270126343,
63
+ "learning_rate": 8.967586206896553e-06,
64
+ "loss": 0.126,
65
+ "step": 2000
66
+ },
67
+ {
68
+ "epoch": 0.13333333333333333,
69
+ "eval_loss": 0.19669494032859802,
70
+ "eval_runtime": 3003.1726,
71
+ "eval_samples_per_second": 1.957,
72
+ "eval_steps_per_second": 0.061,
73
+ "eval_wer": 58.40208757051306,
74
+ "step": 2000
75
+ },
76
+ {
77
+ "epoch": 0.16666666666666666,
78
+ "grad_norm": 1.0093193054199219,
79
+ "learning_rate": 8.622758620689657e-06,
80
+ "loss": 0.0519,
81
+ "step": 2500
82
+ },
83
+ {
84
+ "epoch": 0.16666666666666666,
85
+ "eval_loss": 0.18607404828071594,
86
+ "eval_runtime": 3062.2455,
87
+ "eval_samples_per_second": 1.92,
88
+ "eval_steps_per_second": 0.06,
89
+ "eval_wer": 58.567097739744426,
90
+ "step": 2500
91
+ },
92
+ {
93
+ "epoch": 0.2,
94
+ "grad_norm": 1.2591651678085327,
95
+ "learning_rate": 8.27793103448276e-06,
96
+ "loss": 0.0439,
97
+ "step": 3000
98
+ },
99
+ {
100
+ "epoch": 0.2,
101
+ "eval_loss": 0.19417671859264374,
102
+ "eval_runtime": 2997.3111,
103
+ "eval_samples_per_second": 1.961,
104
+ "eval_steps_per_second": 0.061,
105
+ "eval_wer": 57.42737633830922,
106
+ "step": 3000
107
+ },
108
+ {
109
+ "epoch": 1.0112,
110
+ "grad_norm": 1.0831571817398071,
111
+ "learning_rate": 7.933103448275864e-06,
112
+ "loss": 0.0534,
113
+ "step": 3500
114
+ },
115
+ {
116
+ "epoch": 1.0112,
117
+ "eval_loss": 0.193574920296669,
118
+ "eval_runtime": 2956.9818,
119
+ "eval_samples_per_second": 1.988,
120
+ "eval_steps_per_second": 0.062,
121
+ "eval_wer": 61.14969876050501,
122
+ "step": 3500
123
+ },
124
+ {
125
+ "epoch": 1.0445333333333333,
126
+ "grad_norm": 1.4980469942092896,
127
+ "learning_rate": 7.588275862068966e-06,
128
+ "loss": 0.0214,
129
+ "step": 4000
130
+ },
131
+ {
132
+ "epoch": 1.0445333333333333,
133
+ "eval_loss": 0.22530874609947205,
134
+ "eval_runtime": 3078.9727,
135
+ "eval_samples_per_second": 1.909,
136
+ "eval_steps_per_second": 0.06,
137
+ "eval_wer": 59.78164933420316,
138
+ "step": 4000
139
+ },
140
+ {
141
+ "epoch": 1.0778666666666668,
142
+ "grad_norm": 0.5621655583381653,
143
+ "learning_rate": 7.243448275862069e-06,
144
+ "loss": 0.0129,
145
+ "step": 4500
146
+ },
147
+ {
148
+ "epoch": 1.0778666666666668,
149
+ "eval_loss": 0.26299336552619934,
150
+ "eval_runtime": 2998.2811,
151
+ "eval_samples_per_second": 1.96,
152
+ "eval_steps_per_second": 0.061,
153
+ "eval_wer": 61.061437507195215,
154
+ "step": 4500
155
+ },
156
+ {
157
+ "epoch": 1.1112,
158
+ "grad_norm": 1.5936987400054932,
159
+ "learning_rate": 6.9e-06,
160
+ "loss": 0.048,
161
+ "step": 5000
162
+ },
163
+ {
164
+ "epoch": 1.1112,
165
+ "eval_loss": 0.17795228958129883,
166
+ "eval_runtime": 2987.0925,
167
+ "eval_samples_per_second": 1.968,
168
+ "eval_steps_per_second": 0.062,
169
+ "eval_wer": 56.3605664069995,
170
+ "step": 5000
171
+ },
172
+ {
173
+ "epoch": 1.1445333333333334,
174
+ "grad_norm": 0.48164331912994385,
175
+ "learning_rate": 6.555172413793104e-06,
176
+ "loss": 0.047,
177
+ "step": 5500
178
+ },
179
+ {
180
+ "epoch": 1.1445333333333334,
181
+ "eval_loss": 0.16377924382686615,
182
+ "eval_runtime": 2984.4656,
183
+ "eval_samples_per_second": 1.97,
184
+ "eval_steps_per_second": 0.062,
185
+ "eval_wer": 52.99512644383898,
186
+ "step": 5500
187
+ },
188
+ {
189
+ "epoch": 1.1778666666666666,
190
+ "grad_norm": 0.45277634263038635,
191
+ "learning_rate": 6.2103448275862075e-06,
192
+ "loss": 0.0325,
193
+ "step": 6000
194
+ },
195
+ {
196
+ "epoch": 1.1778666666666666,
197
+ "eval_loss": 0.16828514635562897,
198
+ "eval_runtime": 3034.653,
199
+ "eval_samples_per_second": 1.937,
200
+ "eval_steps_per_second": 0.061,
201
+ "eval_wer": 54.55121071414867,
202
+ "step": 6000
203
+ },
204
+ {
205
+ "epoch": 1.2112,
206
+ "grad_norm": 0.6970316767692566,
207
+ "learning_rate": 5.865517241379311e-06,
208
+ "loss": 0.0293,
209
+ "step": 6500
210
+ },
211
+ {
212
+ "epoch": 1.2112,
213
+ "eval_loss": 0.16886456310749054,
214
+ "eval_runtime": 3048.6335,
215
+ "eval_samples_per_second": 1.928,
216
+ "eval_steps_per_second": 0.06,
217
+ "eval_wer": 57.24509766299551,
218
+ "step": 6500
219
+ },
220
+ {
221
+ "epoch": 2.0224,
222
+ "grad_norm": 0.521206259727478,
223
+ "learning_rate": 5.520689655172414e-06,
224
+ "loss": 0.028,
225
+ "step": 7000
226
+ },
227
+ {
228
+ "epoch": 2.0224,
229
+ "eval_loss": 0.21454408764839172,
230
+ "eval_runtime": 2924.4034,
231
+ "eval_samples_per_second": 2.01,
232
+ "eval_steps_per_second": 0.063,
233
+ "eval_wer": 56.52365785333282,
234
+ "step": 7000
235
+ },
236
+ {
237
+ "epoch": 2.0557333333333334,
238
+ "grad_norm": 0.5115911364555359,
239
+ "learning_rate": 5.175862068965518e-06,
240
+ "loss": 0.009,
241
+ "step": 7500
242
+ },
243
+ {
244
+ "epoch": 2.0557333333333334,
245
+ "eval_loss": 0.22271297872066498,
246
+ "eval_runtime": 2915.2649,
247
+ "eval_samples_per_second": 2.016,
248
+ "eval_steps_per_second": 0.063,
249
+ "eval_wer": 56.30684216585441,
250
+ "step": 7500
251
+ },
252
+ {
253
+ "epoch": 2.0890666666666666,
254
+ "grad_norm": 0.5929153561592102,
255
+ "learning_rate": 4.831034482758621e-06,
256
+ "loss": 0.0076,
257
+ "step": 8000
258
+ },
259
+ {
260
+ "epoch": 2.0890666666666666,
261
+ "eval_loss": 0.2749842405319214,
262
+ "eval_runtime": 3041.7569,
263
+ "eval_samples_per_second": 1.932,
264
+ "eval_steps_per_second": 0.06,
265
+ "eval_wer": 66.05395448789287,
266
+ "step": 8000
267
+ },
268
+ {
269
+ "epoch": 2.1224,
270
+ "grad_norm": 0.3881845474243164,
271
+ "learning_rate": 4.486206896551725e-06,
272
+ "loss": 0.0385,
273
+ "step": 8500
274
+ },
275
+ {
276
+ "epoch": 2.1224,
277
+ "eval_loss": 0.21780993044376373,
278
+ "eval_runtime": 2981.8862,
279
+ "eval_samples_per_second": 1.971,
280
+ "eval_steps_per_second": 0.062,
281
+ "eval_wer": 54.45143712345063,
282
+ "step": 8500
283
+ },
284
+ {
285
+ "epoch": 2.1557333333333335,
286
+ "grad_norm": 0.688723623752594,
287
+ "learning_rate": 4.141379310344828e-06,
288
+ "loss": 0.0245,
289
+ "step": 9000
290
+ },
291
+ {
292
+ "epoch": 2.1557333333333335,
293
+ "eval_loss": 0.1720988005399704,
294
+ "eval_runtime": 3050.0463,
295
+ "eval_samples_per_second": 1.927,
296
+ "eval_steps_per_second": 0.06,
297
+ "eval_wer": 52.00314670555278,
298
+ "step": 9000
299
+ },
300
+ {
301
+ "epoch": 2.1890666666666667,
302
+ "grad_norm": 0.39908483624458313,
303
+ "learning_rate": 3.7965517241379313e-06,
304
+ "loss": 0.0226,
305
+ "step": 9500
306
+ },
307
+ {
308
+ "epoch": 2.1890666666666667,
309
+ "eval_loss": 0.1741122305393219,
310
+ "eval_runtime": 3209.6808,
311
+ "eval_samples_per_second": 1.831,
312
+ "eval_steps_per_second": 0.057,
313
+ "eval_wer": 53.75110326566638,
314
+ "step": 9500
315
+ },
316
+ {
317
+ "epoch": 3.0002666666666666,
318
+ "grad_norm": 2.697366237640381,
319
+ "learning_rate": 3.4517241379310346e-06,
320
+ "loss": 0.0212,
321
+ "step": 10000
322
+ },
323
+ {
324
+ "epoch": 3.0002666666666666,
325
+ "eval_loss": 0.20012931525707245,
326
+ "eval_runtime": 3160.8774,
327
+ "eval_samples_per_second": 1.86,
328
+ "eval_steps_per_second": 0.058,
329
+ "eval_wer": 56.14950688821521,
330
+ "step": 10000
331
+ },
332
+ {
333
+ "epoch": 3.0336,
334
+ "grad_norm": 0.3792371451854706,
335
+ "learning_rate": 3.1068965517241384e-06,
336
+ "loss": 0.0121,
337
+ "step": 10500
338
+ },
339
+ {
340
+ "epoch": 3.0336,
341
+ "eval_loss": 0.23216772079467773,
342
+ "eval_runtime": 3109.7166,
343
+ "eval_samples_per_second": 1.89,
344
+ "eval_steps_per_second": 0.059,
345
+ "eval_wer": 55.472197705207414,
346
+ "step": 10500
347
+ },
348
+ {
349
+ "epoch": 3.0669333333333335,
350
+ "grad_norm": 0.2577882409095764,
351
+ "learning_rate": 2.7620689655172417e-06,
352
+ "loss": 0.0042,
353
+ "step": 11000
354
+ },
355
+ {
356
+ "epoch": 3.0669333333333335,
357
+ "eval_loss": 0.24030156433582306,
358
+ "eval_runtime": 3127.3687,
359
+ "eval_samples_per_second": 1.88,
360
+ "eval_steps_per_second": 0.059,
361
+ "eval_wer": 57.6864039295445,
362
+ "step": 11000
363
+ },
364
+ {
365
+ "epoch": 3.1002666666666667,
366
+ "grad_norm": 0.24431835114955902,
367
+ "learning_rate": 2.4179310344827587e-06,
368
+ "loss": 0.0059,
369
+ "step": 11500
370
+ },
371
+ {
372
+ "epoch": 3.1002666666666667,
373
+ "eval_loss": 0.2953338325023651,
374
+ "eval_runtime": 3087.8251,
375
+ "eval_samples_per_second": 1.904,
376
+ "eval_steps_per_second": 0.06,
377
+ "eval_wer": 64.00667715568518,
378
+ "step": 11500
379
+ },
380
+ {
381
+ "epoch": 3.1336,
382
+ "grad_norm": 0.6591205596923828,
383
+ "learning_rate": 2.073793103448276e-06,
384
+ "loss": 0.0248,
385
+ "step": 12000
386
+ },
387
+ {
388
+ "epoch": 3.1336,
389
+ "eval_loss": 0.1744297742843628,
390
+ "eval_runtime": 2988.8074,
391
+ "eval_samples_per_second": 1.967,
392
+ "eval_steps_per_second": 0.062,
393
+ "eval_wer": 51.341187305729306,
394
+ "step": 12000
395
+ },
396
+ {
397
+ "epoch": 3.166933333333333,
398
+ "grad_norm": 0.45652803778648376,
399
+ "learning_rate": 1.7289655172413794e-06,
400
+ "loss": 0.0172,
401
+ "step": 12500
402
+ },
403
+ {
404
+ "epoch": 3.166933333333333,
405
+ "eval_loss": 0.18724997341632843,
406
+ "eval_runtime": 3015.7946,
407
+ "eval_samples_per_second": 1.949,
408
+ "eval_steps_per_second": 0.061,
409
+ "eval_wer": 53.53236885528992,
410
+ "step": 12500
411
+ },
412
+ {
413
+ "epoch": 3.200266666666667,
414
+ "grad_norm": 0.42330440878868103,
415
+ "learning_rate": 1.384137931034483e-06,
416
+ "loss": 0.015,
417
+ "step": 13000
418
+ },
419
+ {
420
+ "epoch": 3.200266666666667,
421
+ "eval_loss": 0.19304682314395905,
422
+ "eval_runtime": 3065.8492,
423
+ "eval_samples_per_second": 1.917,
424
+ "eval_steps_per_second": 0.06,
425
+ "eval_wer": 54.702789823093745,
426
+ "step": 13000
427
+ },
428
+ {
429
+ "epoch": 4.011466666666666,
430
+ "grad_norm": 0.7199889421463013,
431
+ "learning_rate": 1.0393103448275863e-06,
432
+ "loss": 0.0158,
433
+ "step": 13500
434
+ },
435
+ {
436
+ "epoch": 4.011466666666666,
437
+ "eval_loss": 0.21734359860420227,
438
+ "eval_runtime": 3066.4573,
439
+ "eval_samples_per_second": 1.917,
440
+ "eval_steps_per_second": 0.06,
441
+ "eval_wer": 60.96358263939522,
442
+ "step": 13500
443
+ },
444
+ {
445
+ "epoch": 4.0448,
446
+ "grad_norm": 0.42120951414108276,
447
+ "learning_rate": 6.944827586206897e-07,
448
+ "loss": 0.0028,
449
+ "step": 14000
450
+ },
451
+ {
452
+ "epoch": 4.0448,
453
+ "eval_loss": 0.23296251893043518,
454
+ "eval_runtime": 2966.4298,
455
+ "eval_samples_per_second": 1.982,
456
+ "eval_steps_per_second": 0.062,
457
+ "eval_wer": 53.49207567443109,
458
+ "step": 14000
459
+ },
460
+ {
461
+ "epoch": 4.078133333333334,
462
+ "grad_norm": 0.202627032995224,
463
+ "learning_rate": 3.496551724137931e-07,
464
+ "loss": 0.0028,
465
+ "step": 14500
466
+ },
467
+ {
468
+ "epoch": 4.078133333333334,
469
+ "eval_loss": 0.24154414236545563,
470
+ "eval_runtime": 2989.4533,
471
+ "eval_samples_per_second": 1.966,
472
+ "eval_steps_per_second": 0.062,
473
+ "eval_wer": 53.47672589124679,
474
+ "step": 14500
475
+ },
476
+ {
477
+ "epoch": 4.111466666666667,
478
+ "grad_norm": 2.03951358795166,
479
+ "learning_rate": 4.827586206896552e-09,
480
+ "loss": 0.0194,
481
+ "step": 15000
482
+ },
483
+ {
484
+ "epoch": 4.111466666666667,
485
+ "eval_loss": 0.22199244797229767,
486
+ "eval_runtime": 3069.065,
487
+ "eval_samples_per_second": 1.915,
488
+ "eval_steps_per_second": 0.06,
489
+ "eval_wer": 57.69216009823861,
490
+ "step": 15000
491
+ },
492
+ {
493
+ "epoch": 4.111466666666667,
494
+ "step": 15000,
495
+ "total_flos": 4.898454489936691e+20,
496
+ "train_loss": 0.03751766018072764,
497
+ "train_runtime": 129332.0882,
498
+ "train_samples_per_second": 3.711,
499
+ "train_steps_per_second": 0.116
500
+ }
501
+ ],
502
+ "logging_steps": 500,
503
+ "max_steps": 15000,
504
+ "num_input_tokens_seen": 0,
505
+ "num_train_epochs": 9223372036854775807,
506
+ "save_steps": 1000,
507
+ "stateful_callbacks": {
508
+ "TrainerControl": {
509
+ "args": {
510
+ "should_epoch_stop": false,
511
+ "should_evaluate": false,
512
+ "should_log": false,
513
+ "should_save": true,
514
+ "should_training_stop": true
515
+ },
516
+ "attributes": {}
517
+ }
518
+ },
519
+ "total_flos": 4.898454489936691e+20,
520
+ "train_batch_size": 16,
521
+ "trial_name": null,
522
+ "trial_params": null
523
+ }