Aratako/reward-test-modernbert-310m
Browse files
- README.md +180 -0
- config.json +54 -0
- model.safetensors +3 -0
- special_tokens_map.json +51 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +171 -0
- training_args.bin +3 -0
README.md
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
library_name: transformers
|
3 |
+
license: mit
|
4 |
+
base_model: sbintuitions/modernbert-ja-310m
|
5 |
+
tags:
|
6 |
+
- generated_from_trainer
|
7 |
+
metrics:
|
8 |
+
- pearsonr
|
9 |
+
- spearmanr
|
10 |
+
model-index:
|
11 |
+
- name: test-clf-modernbert-310m
|
12 |
+
results: []
|
13 |
+
---
|
14 |
+
|
15 |
+
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
16 |
+
should probably proofread and complete it, then remove this comment. -->
|
17 |
+
|
18 |
+
# test-clf-modernbert-310m
|
19 |
+
|
20 |
+
This model is a fine-tuned version of [sbintuitions/modernbert-ja-310m](https://huggingface.co/sbintuitions/modernbert-ja-310m) on an unknown dataset.
|
21 |
+
It achieves the following results on the evaluation set:
|
22 |
+
- Loss: 1.0892
|
23 |
+
- Mae: 0.7812
|
24 |
+
- R2: 0.3990
|
25 |
+
- Pearsonr: 0.6383
|
26 |
+
- Spearmanr: 0.6272
|
27 |
+
|
28 |
+
## Model description
|
29 |
+
|
30 |
+
More information needed
|
31 |
+
|
32 |
+
## Intended uses & limitations
|
33 |
+
|
34 |
+
More information needed
|
35 |
+
|
36 |
+
## Training and evaluation data
|
37 |
+
|
38 |
+
More information needed
|
39 |
+
|
40 |
+
## Training procedure
|
41 |
+
|
42 |
+
### Training hyperparameters
|
43 |
+
|
44 |
+
The following hyperparameters were used during training:
|
45 |
+
- learning_rate: 1e-05
|
46 |
+
- train_batch_size: 8
|
47 |
+
- eval_batch_size: 8
|
48 |
+
- seed: 42
|
49 |
+
- gradient_accumulation_steps: 2
|
50 |
+
- total_train_batch_size: 16
|
51 |
+
- optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
|
52 |
+
- lr_scheduler_type: cosine_with_min_lr
|
53 |
+
- lr_scheduler_warmup_ratio: 0.1
|
54 |
+
- num_epochs: 5
|
55 |
+
|
56 |
+
### Training results
|
57 |
+
|
58 |
+
| Training Loss | Epoch | Step | Validation Loss | Mae | R2 | Pearsonr | Spearmanr |
|
59 |
+
|:-------------:|:------:|:----:|:---------------:|:------:|:-------:|:--------:|:---------:|
|
60 |
+
| 38.8642 | 0.0440 | 30 | 7.6612 | 2.1780 | -2.7511 | 0.0043 | -0.0027 |
|
61 |
+
| 10.4776 | 0.0880 | 60 | 2.3989 | 1.1914 | -0.1745 | 0.2921 | 0.2652 |
|
62 |
+
| 3.6459 | 0.1320 | 90 | 2.3098 | 1.2163 | -0.1310 | 0.4206 | 0.4163 |
|
63 |
+
| 5.807 | 0.1760 | 120 | 2.0340 | 1.1310 | 0.0041 | 0.4436 | 0.4267 |
|
64 |
+
| 5.2356 | 0.2199 | 150 | 1.9798 | 1.0620 | 0.0306 | 0.4974 | 0.4648 |
|
65 |
+
| 4.4142 | 0.2639 | 180 | 1.7722 | 1.0213 | 0.1323 | 0.5154 | 0.4874 |
|
66 |
+
| 4.9085 | 0.3079 | 210 | 6.8826 | 2.3366 | -2.3699 | 0.5232 | 0.5094 |
|
67 |
+
| 8.0726 | 0.3519 | 240 | 1.4623 | 0.9226 | 0.2840 | 0.5386 | 0.5242 |
|
68 |
+
| 29.2783 | 0.3959 | 270 | 4.0754 | 1.6163 | -0.9954 | 0.4770 | 0.4772 |
|
69 |
+
| 5.8973 | 0.4399 | 300 | 3.0100 | 1.3312 | -0.4738 | 0.5204 | 0.5089 |
|
70 |
+
| 3.3493 | 0.4839 | 330 | 1.4475 | 0.8710 | 0.2913 | 0.5574 | 0.5520 |
|
71 |
+
| 6.7682 | 0.5279 | 360 | 1.3851 | 0.8808 | 0.3218 | 0.5715 | 0.5572 |
|
72 |
+
| 4.3158 | 0.5718 | 390 | 2.2720 | 1.2907 | -0.1124 | 0.5504 | 0.5330 |
|
73 |
+
| 15.823 | 0.6158 | 420 | 4.1442 | 1.7054 | -1.0291 | 0.5797 | 0.5572 |
|
74 |
+
| 8.0344 | 0.6598 | 450 | 2.7629 | 1.3644 | -0.3528 | 0.5669 | 0.5553 |
|
75 |
+
| 3.171 | 0.7038 | 480 | 2.0582 | 1.1288 | -0.0078 | 0.5012 | 0.5405 |
|
76 |
+
| 6.7538 | 0.7478 | 510 | 1.6033 | 1.0240 | 0.2150 | 0.5934 | 0.5790 |
|
77 |
+
| 6.1151 | 0.7918 | 540 | 1.6594 | 1.0670 | 0.1875 | 0.5697 | 0.5401 |
|
78 |
+
| 2.5472 | 0.8358 | 570 | 1.7069 | 1.0674 | 0.1643 | 0.5920 | 0.5763 |
|
79 |
+
| 3.9392 | 0.8798 | 600 | 2.1113 | 1.2292 | -0.0337 | 0.5871 | 0.5767 |
|
80 |
+
| 3.9147 | 0.9238 | 630 | 1.3118 | 0.8620 | 0.3577 | 0.6115 | 0.5876 |
|
81 |
+
| 5.7769 | 0.9677 | 660 | 2.6878 | 1.2831 | -0.3160 | 0.5764 | 0.5612 |
|
82 |
+
| 5.0716 | 1.0117 | 690 | 2.1133 | 1.0984 | -0.0347 | 0.6059 | 0.5900 |
|
83 |
+
| 4.7065 | 1.0557 | 720 | 2.6758 | 1.3888 | -0.3101 | 0.6116 | 0.5974 |
|
84 |
+
| 1.6835 | 1.0997 | 750 | 1.2992 | 0.8625 | 0.3639 | 0.6090 | 0.5826 |
|
85 |
+
| 5.2112 | 1.1437 | 780 | 1.8851 | 1.1165 | 0.0770 | 0.5974 | 0.5784 |
|
86 |
+
| 2.7997 | 1.1877 | 810 | 1.4227 | 0.9234 | 0.3034 | 0.6077 | 0.5810 |
|
87 |
+
| 1.9417 | 1.2317 | 840 | 1.5027 | 0.9326 | 0.2642 | 0.6310 | 0.6065 |
|
88 |
+
| 2.8662 | 1.2757 | 870 | 1.3368 | 0.8925 | 0.3454 | 0.6140 | 0.5774 |
|
89 |
+
| 4.2357 | 1.3196 | 900 | 2.6313 | 1.4141 | -0.2883 | 0.6385 | 0.6103 |
|
90 |
+
| 7.8053 | 1.3636 | 930 | 1.6020 | 0.9218 | 0.2156 | 0.6347 | 0.6080 |
|
91 |
+
| 1.1231 | 1.4076 | 960 | 1.4656 | 0.9488 | 0.2824 | 0.6385 | 0.6122 |
|
92 |
+
| 5.6334 | 1.4516 | 990 | 1.3516 | 0.9137 | 0.3382 | 0.6426 | 0.6221 |
|
93 |
+
| 4.371 | 1.4956 | 1020 | 2.6421 | 1.4260 | -0.2937 | 0.6369 | 0.6152 |
|
94 |
+
| 3.9286 | 1.5396 | 1050 | 1.4988 | 0.9515 | 0.2661 | 0.6398 | 0.6191 |
|
95 |
+
| 2.2357 | 1.5836 | 1080 | 1.3611 | 0.9070 | 0.3336 | 0.6290 | 0.6100 |
|
96 |
+
| 7.9489 | 1.6276 | 1110 | 1.2121 | 0.8059 | 0.4065 | 0.6418 | 0.6175 |
|
97 |
+
| 6.065 | 1.6716 | 1140 | 1.2714 | 0.8813 | 0.3775 | 0.6513 | 0.6241 |
|
98 |
+
| 2.1338 | 1.7155 | 1170 | 1.2413 | 0.8370 | 0.3922 | 0.6338 | 0.6065 |
|
99 |
+
| 2.5689 | 1.7595 | 1200 | 1.7681 | 1.0914 | 0.1343 | 0.6437 | 0.6228 |
|
100 |
+
| 1.4487 | 1.8035 | 1230 | 1.9605 | 1.1252 | 0.0401 | 0.6136 | 0.5836 |
|
101 |
+
| 2.2018 | 1.8475 | 1260 | 2.9671 | 1.5227 | -0.4528 | 0.6329 | 0.6100 |
|
102 |
+
| 2.8964 | 1.8915 | 1290 | 1.6779 | 1.0542 | 0.1784 | 0.6384 | 0.6163 |
|
103 |
+
| 2.1872 | 1.9355 | 1320 | 1.2393 | 0.8072 | 0.3932 | 0.6459 | 0.6272 |
|
104 |
+
| 3.2919 | 1.9795 | 1350 | 2.7018 | 1.4239 | -0.3229 | 0.6401 | 0.6227 |
|
105 |
+
| 2.5316 | 2.0235 | 1380 | 1.3240 | 0.8902 | 0.3517 | 0.6484 | 0.6285 |
|
106 |
+
| 2.0354 | 2.0674 | 1410 | 1.4146 | 0.9048 | 0.3074 | 0.6344 | 0.6130 |
|
107 |
+
| 2.9549 | 2.1114 | 1440 | 1.2957 | 0.8381 | 0.3656 | 0.6393 | 0.6228 |
|
108 |
+
| 3.5482 | 2.1554 | 1470 | 1.2744 | 0.8478 | 0.3760 | 0.6287 | 0.6077 |
|
109 |
+
| 2.3728 | 2.1994 | 1500 | 1.6528 | 1.0318 | 0.1907 | 0.6351 | 0.6166 |
|
110 |
+
| 2.9036 | 2.2434 | 1530 | 1.6116 | 1.0098 | 0.2109 | 0.6387 | 0.6141 |
|
111 |
+
| 2.4741 | 2.2874 | 1560 | 1.5921 | 1.0000 | 0.2204 | 0.6528 | 0.6346 |
|
112 |
+
| 1.3401 | 2.3314 | 1590 | 1.2849 | 0.8326 | 0.3709 | 0.6425 | 0.6294 |
|
113 |
+
| 2.1981 | 2.3754 | 1620 | 2.0894 | 1.1972 | -0.0230 | 0.6428 | 0.6306 |
|
114 |
+
| 3.6077 | 2.4194 | 1650 | 1.2730 | 0.8461 | 0.3767 | 0.6411 | 0.6263 |
|
115 |
+
| 1.2494 | 2.4633 | 1680 | 1.3331 | 0.8805 | 0.3473 | 0.6520 | 0.6388 |
|
116 |
+
| 1.6448 | 2.5073 | 1710 | 1.8776 | 1.1258 | 0.0807 | 0.6539 | 0.6358 |
|
117 |
+
| 1.6004 | 2.5513 | 1740 | 1.6464 | 1.0332 | 0.1939 | 0.6457 | 0.6231 |
|
118 |
+
| 2.6825 | 2.5953 | 1770 | 1.2436 | 0.8325 | 0.3911 | 0.6517 | 0.6305 |
|
119 |
+
| 4.1015 | 2.6393 | 1800 | 1.8048 | 1.1235 | 0.1163 | 0.6490 | 0.6281 |
|
120 |
+
| 2.3947 | 2.6833 | 1830 | 2.1353 | 1.2060 | -0.0455 | 0.6513 | 0.6283 |
|
121 |
+
| 3.6517 | 2.7273 | 1860 | 2.2012 | 1.2143 | -0.0778 | 0.6511 | 0.6259 |
|
122 |
+
| 1.283 | 2.7713 | 1890 | 1.4102 | 0.9454 | 0.3095 | 0.6475 | 0.6209 |
|
123 |
+
| 3.372 | 2.8152 | 1920 | 1.2497 | 0.8385 | 0.3881 | 0.6544 | 0.6310 |
|
124 |
+
| 0.9015 | 2.8592 | 1950 | 1.5059 | 0.9694 | 0.2627 | 0.6439 | 0.6275 |
|
125 |
+
| 2.1263 | 2.9032 | 1980 | 1.2574 | 0.8277 | 0.3844 | 0.6561 | 0.6392 |
|
126 |
+
| 1.7678 | 2.9472 | 2010 | 1.2511 | 0.8340 | 0.3874 | 0.6547 | 0.6378 |
|
127 |
+
| 0.8637 | 2.9912 | 2040 | 1.3555 | 0.8935 | 0.3363 | 0.6452 | 0.6275 |
|
128 |
+
| 1.1866 | 3.0352 | 2070 | 1.2389 | 0.8230 | 0.3934 | 0.6519 | 0.6355 |
|
129 |
+
| 1.521 | 3.0792 | 2100 | 1.3950 | 0.9128 | 0.3170 | 0.6416 | 0.6268 |
|
130 |
+
| 1.3431 | 3.1232 | 2130 | 1.3883 | 0.9162 | 0.3203 | 0.6406 | 0.6282 |
|
131 |
+
| 1.6443 | 3.1672 | 2160 | 1.2446 | 0.8213 | 0.3906 | 0.6430 | 0.6284 |
|
132 |
+
| 2.2007 | 3.2111 | 2190 | 1.4758 | 0.9392 | 0.2774 | 0.6456 | 0.6316 |
|
133 |
+
| 1.24 | 3.2551 | 2220 | 1.5468 | 0.9892 | 0.2426 | 0.6458 | 0.6308 |
|
134 |
+
| 0.7113 | 3.2991 | 2250 | 1.2618 | 0.8316 | 0.3822 | 0.6454 | 0.6275 |
|
135 |
+
| 1.9999 | 3.3431 | 2280 | 1.6327 | 1.0221 | 0.2006 | 0.6493 | 0.6304 |
|
136 |
+
| 0.4573 | 3.3871 | 2310 | 1.2183 | 0.8150 | 0.4035 | 0.6497 | 0.6301 |
|
137 |
+
| 0.1997 | 3.4311 | 2340 | 1.2584 | 0.8476 | 0.3838 | 0.6401 | 0.6219 |
|
138 |
+
| 0.6893 | 3.4751 | 2370 | 1.3907 | 0.9077 | 0.3191 | 0.6507 | 0.6344 |
|
139 |
+
| 2.5815 | 3.5191 | 2400 | 1.5668 | 0.9990 | 0.2329 | 0.6503 | 0.6342 |
|
140 |
+
| 0.5047 | 3.5630 | 2430 | 1.2605 | 0.8514 | 0.3828 | 0.6490 | 0.6313 |
|
141 |
+
| 0.6636 | 3.6070 | 2460 | 1.4618 | 0.9461 | 0.2843 | 0.6492 | 0.6363 |
|
142 |
+
| 0.6637 | 3.6510 | 2490 | 1.4765 | 0.9607 | 0.2770 | 0.6476 | 0.6356 |
|
143 |
+
| 0.9363 | 3.6950 | 2520 | 1.2501 | 0.8259 | 0.3879 | 0.6498 | 0.6337 |
|
144 |
+
| 0.7925 | 3.7390 | 2550 | 1.3660 | 0.8958 | 0.3312 | 0.6462 | 0.6318 |
|
145 |
+
| 1.8824 | 3.7830 | 2580 | 1.3078 | 0.8686 | 0.3597 | 0.6446 | 0.6312 |
|
146 |
+
| 1.4881 | 3.8270 | 2610 | 1.6678 | 1.0378 | 0.1834 | 0.6427 | 0.6292 |
|
147 |
+
| 1.2663 | 3.8710 | 2640 | 2.0540 | 1.1969 | -0.0057 | 0.6404 | 0.6242 |
|
148 |
+
| 0.9128 | 3.9150 | 2670 | 1.2595 | 0.8179 | 0.3833 | 0.6438 | 0.6273 |
|
149 |
+
| 1.3534 | 3.9589 | 2700 | 1.3228 | 0.8648 | 0.3523 | 0.6383 | 0.6224 |
|
150 |
+
| 0.3248 | 4.0029 | 2730 | 1.6017 | 0.9971 | 0.2157 | 0.6424 | 0.6260 |
|
151 |
+
| 0.4408 | 4.0469 | 2760 | 1.2523 | 0.8347 | 0.3868 | 0.6474 | 0.6290 |
|
152 |
+
| 0.6593 | 4.0909 | 2790 | 1.2593 | 0.8396 | 0.3834 | 0.6453 | 0.6277 |
|
153 |
+
| 0.5935 | 4.1349 | 2820 | 1.3069 | 0.8725 | 0.3601 | 0.6438 | 0.6277 |
|
154 |
+
| 0.5308 | 4.1789 | 2850 | 1.2745 | 0.8521 | 0.3760 | 0.6449 | 0.6290 |
|
155 |
+
| 0.94 | 4.2229 | 2880 | 1.3047 | 0.8737 | 0.3612 | 0.6448 | 0.6289 |
|
156 |
+
| 0.6516 | 4.2669 | 2910 | 1.4950 | 0.9587 | 0.2680 | 0.6452 | 0.6315 |
|
157 |
+
| 0.1789 | 4.3109 | 2940 | 1.3578 | 0.8991 | 0.3352 | 0.6453 | 0.6288 |
|
158 |
+
| 0.5594 | 4.3548 | 2970 | 1.4207 | 0.9304 | 0.3044 | 0.6458 | 0.6298 |
|
159 |
+
| 0.3357 | 4.3988 | 3000 | 1.5353 | 0.9849 | 0.2483 | 0.6452 | 0.6282 |
|
160 |
+
| 0.1883 | 4.4428 | 3030 | 1.4177 | 0.9274 | 0.3059 | 0.6483 | 0.6326 |
|
161 |
+
| 0.3584 | 4.4868 | 3060 | 1.3492 | 0.8908 | 0.3394 | 0.6498 | 0.6348 |
|
162 |
+
| 0.51 | 4.5308 | 3090 | 1.3724 | 0.9032 | 0.3280 | 0.6479 | 0.6324 |
|
163 |
+
| 0.2909 | 4.5748 | 3120 | 1.3617 | 0.8998 | 0.3333 | 0.6460 | 0.6302 |
|
164 |
+
| 0.4247 | 4.6188 | 3150 | 1.3533 | 0.8985 | 0.3374 | 0.6485 | 0.6334 |
|
165 |
+
| 0.5367 | 4.6628 | 3180 | 1.3397 | 0.8856 | 0.3441 | 0.6456 | 0.6312 |
|
166 |
+
| 0.4184 | 4.7067 | 3210 | 1.3487 | 0.8928 | 0.3396 | 0.6458 | 0.6306 |
|
167 |
+
| 0.2521 | 4.7507 | 3240 | 1.3022 | 0.8580 | 0.3624 | 0.6462 | 0.6307 |
|
168 |
+
| 0.2434 | 4.7947 | 3270 | 1.5001 | 0.9638 | 0.2655 | 0.6450 | 0.6305 |
|
169 |
+
| 0.2547 | 4.8387 | 3300 | 1.3812 | 0.9053 | 0.3237 | 0.6452 | 0.6300 |
|
170 |
+
| 0.9901 | 4.8827 | 3330 | 1.4053 | 0.9147 | 0.3119 | 0.6449 | 0.6292 |
|
171 |
+
| 0.1669 | 4.9267 | 3360 | 1.3150 | 0.8729 | 0.3561 | 0.6473 | 0.6319 |
|
172 |
+
| 0.3208 | 4.9707 | 3390 | 1.2870 | 0.8580 | 0.3698 | 0.6495 | 0.6346 |
|
173 |
+
|
174 |
+
|
175 |
+
### Framework versions
|
176 |
+
|
177 |
+
- Transformers 4.49.0
|
178 |
+
- Pytorch 2.4.1+cu124
|
179 |
+
- Datasets 3.3.2
|
180 |
+
- Tokenizers 0.21.0
|
config.json
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "sbintuitions/modernbert-ja-310m",
|
3 |
+
"architectures": [
|
4 |
+
"ModernBertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 1,
|
9 |
+
"classifier_activation": "gelu",
|
10 |
+
"classifier_bias": false,
|
11 |
+
"classifier_dropout": 0.0,
|
12 |
+
"classifier_pooling": "cls",
|
13 |
+
"cls_token_id": 6,
|
14 |
+
"decoder_bias": true,
|
15 |
+
"deterministic_flash_attn": false,
|
16 |
+
"embedding_dropout": 0.0,
|
17 |
+
"eos_token_id": 2,
|
18 |
+
"global_attn_every_n_layers": 3,
|
19 |
+
"global_rope_theta": 160000.0,
|
20 |
+
"gradient_checkpointing": false,
|
21 |
+
"hidden_activation": "gelu",
|
22 |
+
"hidden_size": 768,
|
23 |
+
"id2label": {
|
24 |
+
"0": "LABEL_0"
|
25 |
+
},
|
26 |
+
"initializer_cutoff_factor": 2.0,
|
27 |
+
"initializer_range": 0.02,
|
28 |
+
"intermediate_size": 3072,
|
29 |
+
"label2id": {
|
30 |
+
"LABEL_0": 0
|
31 |
+
},
|
32 |
+
"layer_norm_eps": 1e-05,
|
33 |
+
"local_attention": 128,
|
34 |
+
"local_rope_theta": 10000.0,
|
35 |
+
"max_position_embeddings": 8192,
|
36 |
+
"mlp_bias": false,
|
37 |
+
"mlp_dropout": 0.0,
|
38 |
+
"model_type": "modernbert",
|
39 |
+
"norm_bias": false,
|
40 |
+
"norm_eps": 1e-05,
|
41 |
+
"num_attention_heads": 12,
|
42 |
+
"num_hidden_layers": 25,
|
43 |
+
"pad_token_id": 3,
|
44 |
+
"position_embedding_type": "rope",
|
45 |
+
"problem_type": "regression",
|
46 |
+
"reference_compile": false,
|
47 |
+
"repad_logits_with_grad": false,
|
48 |
+
"sep_token_id": 4,
|
49 |
+
"sparse_pred_ignore_index": -100,
|
50 |
+
"sparse_prediction": false,
|
51 |
+
"torch_dtype": "float32",
|
52 |
+
"transformers_version": "4.49.0",
|
53 |
+
"vocab_size": 102400
|
54 |
+
}
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cd76f4f7167b35737fc2d0d6cccf011045201ccac5bb44cac094ec6235b1b5bb
|
3 |
+
size 1260829436
|
special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"cls_token": {
|
10 |
+
"content": "<cls>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"eos_token": {
|
17 |
+
"content": "</s>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"mask_token": {
|
24 |
+
"content": "<mask>",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"pad_token": {
|
31 |
+
"content": "<pad>",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
},
|
37 |
+
"sep_token": {
|
38 |
+
"content": "<sep>",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": false,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false
|
43 |
+
},
|
44 |
+
"unk_token": {
|
45 |
+
"content": "<unk>",
|
46 |
+
"lstrip": false,
|
47 |
+
"normalized": false,
|
48 |
+
"rstrip": false,
|
49 |
+
"single_word": false
|
50 |
+
}
|
51 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:008293028e1a9d9a1038d9b63d989a2319797dfeaa03f171093a57b33a3a8277
|
3 |
+
size 1831879
|
tokenizer_config.json
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": true,
|
3 |
+
"add_dummy_prefix_space": false,
|
4 |
+
"add_eos_token": true,
|
5 |
+
"add_prefix_space": false,
|
6 |
+
"added_tokens_decoder": {
|
7 |
+
"0": {
|
8 |
+
"content": "<unk>",
|
9 |
+
"lstrip": false,
|
10 |
+
"normalized": false,
|
11 |
+
"rstrip": false,
|
12 |
+
"single_word": false,
|
13 |
+
"special": true
|
14 |
+
},
|
15 |
+
"1": {
|
16 |
+
"content": "<s>",
|
17 |
+
"lstrip": false,
|
18 |
+
"normalized": false,
|
19 |
+
"rstrip": false,
|
20 |
+
"single_word": false,
|
21 |
+
"special": true
|
22 |
+
},
|
23 |
+
"2": {
|
24 |
+
"content": "</s>",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false,
|
29 |
+
"special": true
|
30 |
+
},
|
31 |
+
"3": {
|
32 |
+
"content": "<pad>",
|
33 |
+
"lstrip": false,
|
34 |
+
"normalized": false,
|
35 |
+
"rstrip": false,
|
36 |
+
"single_word": false,
|
37 |
+
"special": true
|
38 |
+
},
|
39 |
+
"4": {
|
40 |
+
"content": "<sep>",
|
41 |
+
"lstrip": false,
|
42 |
+
"normalized": false,
|
43 |
+
"rstrip": false,
|
44 |
+
"single_word": false,
|
45 |
+
"special": true
|
46 |
+
},
|
47 |
+
"5": {
|
48 |
+
"content": "<mask>",
|
49 |
+
"lstrip": false,
|
50 |
+
"normalized": false,
|
51 |
+
"rstrip": false,
|
52 |
+
"single_word": false,
|
53 |
+
"special": true
|
54 |
+
},
|
55 |
+
"6": {
|
56 |
+
"content": "<cls>",
|
57 |
+
"lstrip": false,
|
58 |
+
"normalized": false,
|
59 |
+
"rstrip": false,
|
60 |
+
"single_word": false,
|
61 |
+
"special": true
|
62 |
+
},
|
63 |
+
"7": {
|
64 |
+
"content": "<|system|>",
|
65 |
+
"lstrip": false,
|
66 |
+
"normalized": false,
|
67 |
+
"rstrip": false,
|
68 |
+
"single_word": false,
|
69 |
+
"special": false
|
70 |
+
},
|
71 |
+
"8": {
|
72 |
+
"content": "<|assistant|>",
|
73 |
+
"lstrip": false,
|
74 |
+
"normalized": false,
|
75 |
+
"rstrip": false,
|
76 |
+
"single_word": false,
|
77 |
+
"special": false
|
78 |
+
},
|
79 |
+
"9": {
|
80 |
+
"content": "<|user|>",
|
81 |
+
"lstrip": false,
|
82 |
+
"normalized": false,
|
83 |
+
"rstrip": false,
|
84 |
+
"single_word": false,
|
85 |
+
"special": false
|
86 |
+
},
|
87 |
+
"10": {
|
88 |
+
"content": "<|available_tools|>",
|
89 |
+
"lstrip": false,
|
90 |
+
"normalized": false,
|
91 |
+
"rstrip": false,
|
92 |
+
"single_word": false,
|
93 |
+
"special": false
|
94 |
+
},
|
95 |
+
"11": {
|
96 |
+
"content": "<|tool_calls|>",
|
97 |
+
"lstrip": false,
|
98 |
+
"normalized": false,
|
99 |
+
"rstrip": false,
|
100 |
+
"single_word": false,
|
101 |
+
"special": false
|
102 |
+
},
|
103 |
+
"12": {
|
104 |
+
"content": "<|tool_results|>",
|
105 |
+
"lstrip": false,
|
106 |
+
"normalized": false,
|
107 |
+
"rstrip": false,
|
108 |
+
"single_word": false,
|
109 |
+
"special": false
|
110 |
+
},
|
111 |
+
"13": {
|
112 |
+
"content": "<|code|>",
|
113 |
+
"lstrip": false,
|
114 |
+
"normalized": false,
|
115 |
+
"rstrip": false,
|
116 |
+
"single_word": false,
|
117 |
+
"special": false
|
118 |
+
},
|
119 |
+
"14": {
|
120 |
+
"content": "<|file|>",
|
121 |
+
"lstrip": false,
|
122 |
+
"normalized": false,
|
123 |
+
"rstrip": false,
|
124 |
+
"single_word": false,
|
125 |
+
"special": false
|
126 |
+
},
|
127 |
+
"102397": {
|
128 |
+
"content": "<|prefix|>",
|
129 |
+
"lstrip": false,
|
130 |
+
"normalized": false,
|
131 |
+
"rstrip": false,
|
132 |
+
"single_word": false,
|
133 |
+
"special": false
|
134 |
+
},
|
135 |
+
"102398": {
|
136 |
+
"content": "<|suffix|>",
|
137 |
+
"lstrip": false,
|
138 |
+
"normalized": false,
|
139 |
+
"rstrip": false,
|
140 |
+
"single_word": false,
|
141 |
+
"special": false
|
142 |
+
},
|
143 |
+
"102399": {
|
144 |
+
"content": "<|middle|>",
|
145 |
+
"lstrip": false,
|
146 |
+
"normalized": false,
|
147 |
+
"rstrip": false,
|
148 |
+
"single_word": false,
|
149 |
+
"special": false
|
150 |
+
}
|
151 |
+
},
|
152 |
+
"bos_token": "<s>",
|
153 |
+
"clean_up_tokenization_spaces": false,
|
154 |
+
"cls_token": "<cls>",
|
155 |
+
"do_lower_case": false,
|
156 |
+
"eos_token": "</s>",
|
157 |
+
"extra_ids": 0,
|
158 |
+
"extra_special_tokens": {},
|
159 |
+
"keep_accents": true,
|
160 |
+
"legacy": false,
|
161 |
+
"mask_token": "<mask>",
|
162 |
+
"model_max_length": 8192,
|
163 |
+
"pad_token": "<pad>",
|
164 |
+
"padding_side": "right",
|
165 |
+
"sep_token": "<sep>",
|
166 |
+
"sp_model_kwargs": {},
|
167 |
+
"spaces_between_special_tokens": false,
|
168 |
+
"tokenizer_class": "LlamaTokenizer",
|
169 |
+
"unk_token": "<unk>",
|
170 |
+
"use_default_system_prompt": false
|
171 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0cda7fe8d4e3571c0787229e9ffe72e78b8441aee9250ed55040f3b46cfae637
|
3 |
+
size 5432
|