jumelet committed (verified)
Commit 1dde150 · 1 Parent(s): c3f7d71

Add main & ema weights for jpn

Files changed (3):
  1. README.md +5 -5
  2. config.json +4 -4
  3. original_project_config.json +4 -4
README.md CHANGED
@@ -27,13 +27,13 @@ ema, main
  {
  "attention_probs_dropout_prob": 0.1,
  "hidden_dropout_prob": 0.1,
- "hidden_size": 384,
- "intermediate_size": 1280,
+ "hidden_size": 768,
+ "intermediate_size": 2560,
  "max_position_embeddings": 512,
  "position_bucket_size": 32,
- "num_attention_heads": 6,
+ "num_attention_heads": 12,
  "num_hidden_layers": 12,
- "vocab_size": 8192,
+ "vocab_size": 16384,
  "layer_norm_eps": 1e-05,
  "force_causal_mask": true,
  "classifier_dropout": 0.1,
@@ -68,6 +68,6 @@ print(outputs.logits)
  ```

  ## Notes
- - Converted on 2025-10-07T00:19:03.971199+00:00
+ - Converted on 2025-10-07T01:14:06.712805+00:00
  - Weights are the exact trained parameters; no new layers were initialized.
  - Requires `trust_remote_code=True` due to custom architecture.
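The hunk above only shows the tail of the README's usage snippet (it ends at `print(outputs.logits)`) plus the note that loading requires `trust_remote_code=True`. As a minimal sketch of such a loading call: the repo id below is a hypothetical placeholder and the choice of Auto class is an assumption, not taken from this commit.

```python
# Minimal loading sketch (assumptions: the repo id is a placeholder and the
# custom gpt_bert code exposes a masked-LM head through the Auto classes).
from transformers import AutoTokenizer, AutoModelForMaskedLM

repo_id = "org/gpt-bert-jpn"  # hypothetical placeholder, not the real repo id

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)

# Encode a short Japanese sentence and inspect the logits, as in the README snippet.
inputs = tokenizer("こんにちは、世界。", return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits)
```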
config.json CHANGED
@@ -18,16 +18,16 @@
  "eos_token_id": 2,
  "force_causal_mask": true,
  "hidden_dropout_prob": 0.1,
- "hidden_size": 384,
- "intermediate_size": 1280,
+ "hidden_size": 768,
+ "intermediate_size": 2560,
  "layer_norm_eps": 1e-05,
  "mask_token_id": 4,
  "max_position_embeddings": 512,
  "model_type": "gpt_bert",
- "num_attention_heads": 6,
+ "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "pad_token_id": 3,
  "position_bucket_size": 32,
- "vocab_size": 8192
+ "vocab_size": 16384
  }
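The same four hyperparameters are bumped here: `hidden_size` 384 → 768, `intermediate_size` 1280 → 2560, `num_attention_heads` 6 → 12 (so the per-head dimension stays 64), and `vocab_size` 8192 → 16384. A short sketch for checking the updated config after download, using the same hypothetical placeholder repo id as above:

```python
# Sketch: confirm the fields changed in this commit (placeholder repo id).
from transformers import AutoConfig

config = AutoConfig.from_pretrained("org/gpt-bert-jpn", trust_remote_code=True)

assert config.hidden_size == 768
assert config.intermediate_size == 2560
assert config.num_attention_heads == 12
assert config.vocab_size == 16384
print("per-head dimension:", config.hidden_size // config.num_attention_heads)  # 64
```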
original_project_config.json CHANGED
@@ -1,13 +1,13 @@
  {
  "attention_probs_dropout_prob": 0.1,
  "hidden_dropout_prob": 0.1,
- "hidden_size": 384,
- "intermediate_size": 1280,
+ "hidden_size": 768,
+ "intermediate_size": 2560,
  "max_position_embeddings": 512,
  "position_bucket_size": 32,
- "num_attention_heads": 6,
+ "num_attention_heads": 12,
  "num_hidden_layers": 12,
- "vocab_size": 8192,
+ "vocab_size": 16384,
  "layer_norm_eps": 1e-05,
  "force_causal_mask": true,
  "classifier_dropout": 0.1,