Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round10.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round12.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round15.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round17.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round2.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round20.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round5.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round7.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_trainer_state.json +378 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round10.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round12.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round15.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round17.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round2.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round20.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round5.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round7.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_trainer_state.json +378 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round10.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round12.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round15.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round17.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round2.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round20.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round5.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round7.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_trainer_state.json +378 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round10.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round12.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round15.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round17.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round2.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round20.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round5.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round7.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_trainer_state.json +378 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round10.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round12.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round15.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round17.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round2.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round20.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round5.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round7.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_trainer_state.json +378 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/5_client_model_round10.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/5_client_model_round12.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/5_client_model_round15.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/5_client_model_round17.pth +3 -0
- client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/5_client_model_round2.pth +3 -0
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round10.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35db35281882d1f226fb057ce1dc1d10268ce76953bee3787606804b39d316ef
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round12.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a4320beb5e2c354434681ed2d3b2dcdd78d536aedc3fba5f9f1e06943ec2e7c9
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round15.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8028b19b5f86ee47adabeb517b0cb8e6f55250d34905080c06d2df87bc90e326
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round17.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:399ebbb40d73ecda84215e6074344b500ffade76f87bfea8a0d6d62346a108d5
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round2.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a73b669622db0d158ac1056f7ed6b435324a3c4b5f24e9a3bde9565770f36e5
|
3 |
+
size 369837282
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round20.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4562188f7b74a13d10b41b89b9eddab396cfb79214a4849a8637db03c488f361
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round5.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c21d1d0f631ea5682fb5747b169d1f49dea3a6af343afe5d51396cda5580892
|
3 |
+
size 369837282
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_client_model_round7.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ddd22f5b455184254d90300e38de573ee5573bc8ffd8cb32135fe7f168557e7b
|
3 |
+
size 369837282
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/0_trainer_state.json
ADDED
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 1.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 97,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.020618556701030927,
|
13 |
+
"grad_norm": 4.899675369262695,
|
14 |
+
"learning_rate": 2e-05,
|
15 |
+
"loss": 0.4572,
|
16 |
+
"step": 2
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.041237113402061855,
|
20 |
+
"grad_norm": 0.364077627658844,
|
21 |
+
"learning_rate": 2e-05,
|
22 |
+
"loss": 0.1723,
|
23 |
+
"step": 4
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.061855670103092786,
|
27 |
+
"grad_norm": 1.9865665435791016,
|
28 |
+
"learning_rate": 2e-05,
|
29 |
+
"loss": 0.6005,
|
30 |
+
"step": 6
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.08247422680412371,
|
34 |
+
"grad_norm": 4.352550029754639,
|
35 |
+
"learning_rate": 2e-05,
|
36 |
+
"loss": 1.0405,
|
37 |
+
"step": 8
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.10309278350515463,
|
41 |
+
"grad_norm": 1.1260665655136108,
|
42 |
+
"learning_rate": 2e-05,
|
43 |
+
"loss": 0.2097,
|
44 |
+
"step": 10
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.12371134020618557,
|
48 |
+
"grad_norm": 4.107581615447998,
|
49 |
+
"learning_rate": 2e-05,
|
50 |
+
"loss": 1.3789,
|
51 |
+
"step": 12
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.14432989690721648,
|
55 |
+
"grad_norm": 0.47090861201286316,
|
56 |
+
"learning_rate": 2e-05,
|
57 |
+
"loss": 0.381,
|
58 |
+
"step": 14
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.16494845360824742,
|
62 |
+
"grad_norm": 0.8781032562255859,
|
63 |
+
"learning_rate": 2e-05,
|
64 |
+
"loss": 0.1236,
|
65 |
+
"step": 16
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.18556701030927836,
|
69 |
+
"grad_norm": 8.501681327819824,
|
70 |
+
"learning_rate": 2e-05,
|
71 |
+
"loss": 1.1681,
|
72 |
+
"step": 18
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.20618556701030927,
|
76 |
+
"grad_norm": 1.3251969814300537,
|
77 |
+
"learning_rate": 2e-05,
|
78 |
+
"loss": 0.6099,
|
79 |
+
"step": 20
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.2268041237113402,
|
83 |
+
"grad_norm": 3.748138189315796,
|
84 |
+
"learning_rate": 2e-05,
|
85 |
+
"loss": 1.0833,
|
86 |
+
"step": 22
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.24742268041237114,
|
90 |
+
"grad_norm": 0.729949414730072,
|
91 |
+
"learning_rate": 2e-05,
|
92 |
+
"loss": 0.1654,
|
93 |
+
"step": 24
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.26804123711340205,
|
97 |
+
"grad_norm": 1.2475837469100952,
|
98 |
+
"learning_rate": 2e-05,
|
99 |
+
"loss": 0.4468,
|
100 |
+
"step": 26
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.28865979381443296,
|
104 |
+
"grad_norm": 0.3473127484321594,
|
105 |
+
"learning_rate": 2e-05,
|
106 |
+
"loss": 0.2435,
|
107 |
+
"step": 28
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.30927835051546393,
|
111 |
+
"grad_norm": 4.668135166168213,
|
112 |
+
"learning_rate": 2e-05,
|
113 |
+
"loss": 1.338,
|
114 |
+
"step": 30
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.32989690721649484,
|
118 |
+
"grad_norm": 2.5174388885498047,
|
119 |
+
"learning_rate": 2e-05,
|
120 |
+
"loss": 0.2791,
|
121 |
+
"step": 32
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.35051546391752575,
|
125 |
+
"grad_norm": 2.1460764408111572,
|
126 |
+
"learning_rate": 2e-05,
|
127 |
+
"loss": 0.4064,
|
128 |
+
"step": 34
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.3711340206185567,
|
132 |
+
"grad_norm": 3.378509998321533,
|
133 |
+
"learning_rate": 2e-05,
|
134 |
+
"loss": 0.7993,
|
135 |
+
"step": 36
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.3917525773195876,
|
139 |
+
"grad_norm": 1.6487655639648438,
|
140 |
+
"learning_rate": 2e-05,
|
141 |
+
"loss": 0.1094,
|
142 |
+
"step": 38
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.41237113402061853,
|
146 |
+
"grad_norm": 2.8206214904785156,
|
147 |
+
"learning_rate": 2e-05,
|
148 |
+
"loss": 1.4364,
|
149 |
+
"step": 40
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.4329896907216495,
|
153 |
+
"grad_norm": 2.229614496231079,
|
154 |
+
"learning_rate": 2e-05,
|
155 |
+
"loss": 0.3985,
|
156 |
+
"step": 42
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.4536082474226804,
|
160 |
+
"grad_norm": 1.4115321636199951,
|
161 |
+
"learning_rate": 2e-05,
|
162 |
+
"loss": 0.1919,
|
163 |
+
"step": 44
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.4742268041237113,
|
167 |
+
"grad_norm": 2.823225498199463,
|
168 |
+
"learning_rate": 2e-05,
|
169 |
+
"loss": 0.3599,
|
170 |
+
"step": 46
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.4948453608247423,
|
174 |
+
"grad_norm": 0.7314802408218384,
|
175 |
+
"learning_rate": 2e-05,
|
176 |
+
"loss": 0.9801,
|
177 |
+
"step": 48
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.5154639175257731,
|
181 |
+
"grad_norm": 1.0195244550704956,
|
182 |
+
"learning_rate": 2e-05,
|
183 |
+
"loss": 0.0946,
|
184 |
+
"step": 50
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.5360824742268041,
|
188 |
+
"grad_norm": 2.806351661682129,
|
189 |
+
"learning_rate": 2e-05,
|
190 |
+
"loss": 0.3121,
|
191 |
+
"step": 52
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.5567010309278351,
|
195 |
+
"grad_norm": 0.7020225524902344,
|
196 |
+
"learning_rate": 2e-05,
|
197 |
+
"loss": 1.0508,
|
198 |
+
"step": 54
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.5773195876288659,
|
202 |
+
"grad_norm": 1.1141129732131958,
|
203 |
+
"learning_rate": 2e-05,
|
204 |
+
"loss": 0.1508,
|
205 |
+
"step": 56
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.5979381443298969,
|
209 |
+
"grad_norm": 2.0665926933288574,
|
210 |
+
"learning_rate": 2e-05,
|
211 |
+
"loss": 0.1938,
|
212 |
+
"step": 58
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.6185567010309279,
|
216 |
+
"grad_norm": 0.09458617866039276,
|
217 |
+
"learning_rate": 2e-05,
|
218 |
+
"loss": 0.5867,
|
219 |
+
"step": 60
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 0.6391752577319587,
|
223 |
+
"grad_norm": 1.0511982440948486,
|
224 |
+
"learning_rate": 2e-05,
|
225 |
+
"loss": 2.7297,
|
226 |
+
"step": 62
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 0.6597938144329897,
|
230 |
+
"grad_norm": 8.42614459991455,
|
231 |
+
"learning_rate": 2e-05,
|
232 |
+
"loss": 1.3084,
|
233 |
+
"step": 64
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 0.6804123711340206,
|
237 |
+
"grad_norm": 4.046963691711426,
|
238 |
+
"learning_rate": 2e-05,
|
239 |
+
"loss": 0.8881,
|
240 |
+
"step": 66
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"epoch": 0.7010309278350515,
|
244 |
+
"grad_norm": 2.4904868602752686,
|
245 |
+
"learning_rate": 2e-05,
|
246 |
+
"loss": 0.3384,
|
247 |
+
"step": 68
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"epoch": 0.7216494845360825,
|
251 |
+
"grad_norm": 1.1527413129806519,
|
252 |
+
"learning_rate": 2e-05,
|
253 |
+
"loss": 0.3475,
|
254 |
+
"step": 70
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"epoch": 0.7422680412371134,
|
258 |
+
"grad_norm": 1.3058439493179321,
|
259 |
+
"learning_rate": 2e-05,
|
260 |
+
"loss": 0.5549,
|
261 |
+
"step": 72
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"epoch": 0.7628865979381443,
|
265 |
+
"grad_norm": 0.345022976398468,
|
266 |
+
"learning_rate": 2e-05,
|
267 |
+
"loss": 0.869,
|
268 |
+
"step": 74
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"epoch": 0.7835051546391752,
|
272 |
+
"grad_norm": 0.7008552551269531,
|
273 |
+
"learning_rate": 2e-05,
|
274 |
+
"loss": 0.2608,
|
275 |
+
"step": 76
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"epoch": 0.8041237113402062,
|
279 |
+
"grad_norm": 3.1025912761688232,
|
280 |
+
"learning_rate": 2e-05,
|
281 |
+
"loss": 0.6226,
|
282 |
+
"step": 78
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"epoch": 0.8247422680412371,
|
286 |
+
"grad_norm": 3.710395336151123,
|
287 |
+
"learning_rate": 2e-05,
|
288 |
+
"loss": 0.9768,
|
289 |
+
"step": 80
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"epoch": 0.845360824742268,
|
293 |
+
"grad_norm": 0.3900620937347412,
|
294 |
+
"learning_rate": 2e-05,
|
295 |
+
"loss": 0.1941,
|
296 |
+
"step": 82
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"epoch": 0.865979381443299,
|
300 |
+
"grad_norm": 0.23837120831012726,
|
301 |
+
"learning_rate": 2e-05,
|
302 |
+
"loss": 0.2005,
|
303 |
+
"step": 84
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"epoch": 0.8865979381443299,
|
307 |
+
"grad_norm": 5.000380516052246,
|
308 |
+
"learning_rate": 2e-05,
|
309 |
+
"loss": 0.593,
|
310 |
+
"step": 86
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"epoch": 0.9072164948453608,
|
314 |
+
"grad_norm": 2.397953748703003,
|
315 |
+
"learning_rate": 2e-05,
|
316 |
+
"loss": 1.7147,
|
317 |
+
"step": 88
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"epoch": 0.9278350515463918,
|
321 |
+
"grad_norm": 0.20643925666809082,
|
322 |
+
"learning_rate": 2e-05,
|
323 |
+
"loss": 0.0407,
|
324 |
+
"step": 90
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"epoch": 0.9484536082474226,
|
328 |
+
"grad_norm": 3.6777753829956055,
|
329 |
+
"learning_rate": 2e-05,
|
330 |
+
"loss": 0.9327,
|
331 |
+
"step": 92
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"epoch": 0.9690721649484536,
|
335 |
+
"grad_norm": 1.6937133073806763,
|
336 |
+
"learning_rate": 2e-05,
|
337 |
+
"loss": 0.1758,
|
338 |
+
"step": 94
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"epoch": 0.9896907216494846,
|
342 |
+
"grad_norm": 1.2445141077041626,
|
343 |
+
"learning_rate": 2e-05,
|
344 |
+
"loss": 1.7757,
|
345 |
+
"step": 96
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"epoch": 1.0,
|
349 |
+
"step": 97,
|
350 |
+
"total_flos": 2126123943067648.0,
|
351 |
+
"train_loss": 0.6465311345365858,
|
352 |
+
"train_runtime": 271.5469,
|
353 |
+
"train_samples_per_second": 1.429,
|
354 |
+
"train_steps_per_second": 0.357
|
355 |
+
}
|
356 |
+
],
|
357 |
+
"logging_steps": 2,
|
358 |
+
"max_steps": 97,
|
359 |
+
"num_input_tokens_seen": 0,
|
360 |
+
"num_train_epochs": 1,
|
361 |
+
"save_steps": 500,
|
362 |
+
"stateful_callbacks": {
|
363 |
+
"TrainerControl": {
|
364 |
+
"args": {
|
365 |
+
"should_epoch_stop": false,
|
366 |
+
"should_evaluate": false,
|
367 |
+
"should_log": false,
|
368 |
+
"should_save": false,
|
369 |
+
"should_training_stop": false
|
370 |
+
},
|
371 |
+
"attributes": {}
|
372 |
+
}
|
373 |
+
},
|
374 |
+
"total_flos": 2126123943067648.0,
|
375 |
+
"train_batch_size": 1,
|
376 |
+
"trial_name": null,
|
377 |
+
"trial_params": null
|
378 |
+
}
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round10.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5c8dec18e9ef0ae448deccaea823d6e0a2cca1306d431c07c7038536434f2756
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round12.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:004ea8c34b31af667eb1fb2bdd883169fb9ff673b0cc75384faaf1f7c3cb71bb
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round15.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ec5da721e1b6c599c6c777932e40e1686f2e690d65f61fafe38ca32a3ce82f4c
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round17.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bed02013c2ad5dc7d708c4242ec2ee481920b2c93c7dc9031ed9e9511ea59c47
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round2.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bdb1c926f6643fd1fd6146b0365d0a68c6ec0ac9c150cc3d0e260b305cb7db58
|
3 |
+
size 369837282
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round20.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e0131f3ad1b687e7f496607cd4e14c1663182a9d79a2923b60a4aa694467d4f
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round5.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0fc25cbb650a8a7622723b96c175ad75da07b0f5f4eceaf959431699826cf152
|
3 |
+
size 369837282
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_client_model_round7.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9b064918db2915e992ca655eb974f163135c2d01f6c4cb93d91da3ddb56a3189
|
3 |
+
size 369837282
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/1_trainer_state.json
ADDED
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 1.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 97,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.020618556701030927,
|
13 |
+
"grad_norm": 0.0385405458509922,
|
14 |
+
"learning_rate": 2e-05,
|
15 |
+
"loss": 0.0186,
|
16 |
+
"step": 2
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.041237113402061855,
|
20 |
+
"grad_norm": 0.06613821536302567,
|
21 |
+
"learning_rate": 2e-05,
|
22 |
+
"loss": 0.0849,
|
23 |
+
"step": 4
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.061855670103092786,
|
27 |
+
"grad_norm": 0.02067434787750244,
|
28 |
+
"learning_rate": 2e-05,
|
29 |
+
"loss": 0.0054,
|
30 |
+
"step": 6
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.08247422680412371,
|
34 |
+
"grad_norm": 0.06695201992988586,
|
35 |
+
"learning_rate": 2e-05,
|
36 |
+
"loss": 0.0057,
|
37 |
+
"step": 8
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.10309278350515463,
|
41 |
+
"grad_norm": 0.15570124983787537,
|
42 |
+
"learning_rate": 2e-05,
|
43 |
+
"loss": 0.0129,
|
44 |
+
"step": 10
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.12371134020618557,
|
48 |
+
"grad_norm": 0.05093451589345932,
|
49 |
+
"learning_rate": 2e-05,
|
50 |
+
"loss": 0.0056,
|
51 |
+
"step": 12
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.14432989690721648,
|
55 |
+
"grad_norm": 0.023744968697428703,
|
56 |
+
"learning_rate": 2e-05,
|
57 |
+
"loss": 0.0124,
|
58 |
+
"step": 14
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.16494845360824742,
|
62 |
+
"grad_norm": 0.00312516069971025,
|
63 |
+
"learning_rate": 2e-05,
|
64 |
+
"loss": 0.0054,
|
65 |
+
"step": 16
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.18556701030927836,
|
69 |
+
"grad_norm": 0.006616171449422836,
|
70 |
+
"learning_rate": 2e-05,
|
71 |
+
"loss": 0.0262,
|
72 |
+
"step": 18
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.20618556701030927,
|
76 |
+
"grad_norm": 0.005298080388456583,
|
77 |
+
"learning_rate": 2e-05,
|
78 |
+
"loss": 0.0043,
|
79 |
+
"step": 20
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.2268041237113402,
|
83 |
+
"grad_norm": 0.027790764346718788,
|
84 |
+
"learning_rate": 2e-05,
|
85 |
+
"loss": 0.0025,
|
86 |
+
"step": 22
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.24742268041237114,
|
90 |
+
"grad_norm": 0.036290887743234634,
|
91 |
+
"learning_rate": 2e-05,
|
92 |
+
"loss": 0.0025,
|
93 |
+
"step": 24
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.26804123711340205,
|
97 |
+
"grad_norm": 0.002953270450234413,
|
98 |
+
"learning_rate": 2e-05,
|
99 |
+
"loss": 0.0039,
|
100 |
+
"step": 26
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.28865979381443296,
|
104 |
+
"grad_norm": 2.2551519870758057,
|
105 |
+
"learning_rate": 2e-05,
|
106 |
+
"loss": 0.2375,
|
107 |
+
"step": 28
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.30927835051546393,
|
111 |
+
"grad_norm": 0.18529418110847473,
|
112 |
+
"learning_rate": 2e-05,
|
113 |
+
"loss": 0.0103,
|
114 |
+
"step": 30
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.32989690721649484,
|
118 |
+
"grad_norm": 1.886093258857727,
|
119 |
+
"learning_rate": 2e-05,
|
120 |
+
"loss": 0.1782,
|
121 |
+
"step": 32
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.35051546391752575,
|
125 |
+
"grad_norm": 0.22934368252754211,
|
126 |
+
"learning_rate": 2e-05,
|
127 |
+
"loss": 0.0129,
|
128 |
+
"step": 34
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.3711340206185567,
|
132 |
+
"grad_norm": 2.155086040496826,
|
133 |
+
"learning_rate": 2e-05,
|
134 |
+
"loss": 0.2115,
|
135 |
+
"step": 36
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.3917525773195876,
|
139 |
+
"grad_norm": 0.00891521479934454,
|
140 |
+
"learning_rate": 2e-05,
|
141 |
+
"loss": 0.0413,
|
142 |
+
"step": 38
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.41237113402061853,
|
146 |
+
"grad_norm": 0.008472035638988018,
|
147 |
+
"learning_rate": 2e-05,
|
148 |
+
"loss": 0.0038,
|
149 |
+
"step": 40
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.4329896907216495,
|
153 |
+
"grad_norm": 0.37362828850746155,
|
154 |
+
"learning_rate": 2e-05,
|
155 |
+
"loss": 0.2332,
|
156 |
+
"step": 42
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.4536082474226804,
|
160 |
+
"grad_norm": 0.011929775588214397,
|
161 |
+
"learning_rate": 2e-05,
|
162 |
+
"loss": 0.001,
|
163 |
+
"step": 44
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.4742268041237113,
|
167 |
+
"grad_norm": 0.006870917044579983,
|
168 |
+
"learning_rate": 2e-05,
|
169 |
+
"loss": 0.0206,
|
170 |
+
"step": 46
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.4948453608247423,
|
174 |
+
"grad_norm": 0.016721265390515327,
|
175 |
+
"learning_rate": 2e-05,
|
176 |
+
"loss": 0.0028,
|
177 |
+
"step": 48
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.5154639175257731,
|
181 |
+
"grad_norm": 0.024881532415747643,
|
182 |
+
"learning_rate": 2e-05,
|
183 |
+
"loss": 0.0027,
|
184 |
+
"step": 50
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.5360824742268041,
|
188 |
+
"grad_norm": 0.1209266409277916,
|
189 |
+
"learning_rate": 2e-05,
|
190 |
+
"loss": 0.8265,
|
191 |
+
"step": 52
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.5567010309278351,
|
195 |
+
"grad_norm": 0.05023783817887306,
|
196 |
+
"learning_rate": 2e-05,
|
197 |
+
"loss": 0.1705,
|
198 |
+
"step": 54
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.5773195876288659,
|
202 |
+
"grad_norm": 0.026383670046925545,
|
203 |
+
"learning_rate": 2e-05,
|
204 |
+
"loss": 0.0019,
|
205 |
+
"step": 56
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.5979381443298969,
|
209 |
+
"grad_norm": 0.0019155156332999468,
|
210 |
+
"learning_rate": 2e-05,
|
211 |
+
"loss": 0.0006,
|
212 |
+
"step": 58
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.6185567010309279,
|
216 |
+
"grad_norm": 0.8070590496063232,
|
217 |
+
"learning_rate": 2e-05,
|
218 |
+
"loss": 0.3128,
|
219 |
+
"step": 60
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 0.6391752577319587,
|
223 |
+
"grad_norm": 3.526737689971924,
|
224 |
+
"learning_rate": 2e-05,
|
225 |
+
"loss": 0.0995,
|
226 |
+
"step": 62
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 0.6597938144329897,
|
230 |
+
"grad_norm": 1.8756895065307617,
|
231 |
+
"learning_rate": 2e-05,
|
232 |
+
"loss": 0.2206,
|
233 |
+
"step": 64
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 0.6804123711340206,
|
237 |
+
"grad_norm": 0.004317080602049828,
|
238 |
+
"learning_rate": 2e-05,
|
239 |
+
"loss": 0.1284,
|
240 |
+
"step": 66
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"epoch": 0.7010309278350515,
|
244 |
+
"grad_norm": 0.28428155183792114,
|
245 |
+
"learning_rate": 2e-05,
|
246 |
+
"loss": 0.0199,
|
247 |
+
"step": 68
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"epoch": 0.7216494845360825,
|
251 |
+
"grad_norm": 0.006947671063244343,
|
252 |
+
"learning_rate": 2e-05,
|
253 |
+
"loss": 0.0336,
|
254 |
+
"step": 70
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"epoch": 0.7422680412371134,
|
258 |
+
"grad_norm": 0.08691083639860153,
|
259 |
+
"learning_rate": 2e-05,
|
260 |
+
"loss": 0.0093,
|
261 |
+
"step": 72
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"epoch": 0.7628865979381443,
|
265 |
+
"grad_norm": 0.00405128812417388,
|
266 |
+
"learning_rate": 2e-05,
|
267 |
+
"loss": 0.3603,
|
268 |
+
"step": 74
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"epoch": 0.7835051546391752,
|
272 |
+
"grad_norm": 0.00506645767018199,
|
273 |
+
"learning_rate": 2e-05,
|
274 |
+
"loss": 0.0006,
|
275 |
+
"step": 76
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"epoch": 0.8041237113402062,
|
279 |
+
"grad_norm": 0.02798837423324585,
|
280 |
+
"learning_rate": 2e-05,
|
281 |
+
"loss": 0.0018,
|
282 |
+
"step": 78
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"epoch": 0.8247422680412371,
|
286 |
+
"grad_norm": 0.011478321626782417,
|
287 |
+
"learning_rate": 2e-05,
|
288 |
+
"loss": 0.0014,
|
289 |
+
"step": 80
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"epoch": 0.845360824742268,
|
293 |
+
"grad_norm": 0.15240181982517242,
|
294 |
+
"learning_rate": 2e-05,
|
295 |
+
"loss": 0.106,
|
296 |
+
"step": 82
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"epoch": 0.865979381443299,
|
300 |
+
"grad_norm": 0.08322691917419434,
|
301 |
+
"learning_rate": 2e-05,
|
302 |
+
"loss": 0.0052,
|
303 |
+
"step": 84
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"epoch": 0.8865979381443299,
|
307 |
+
"grad_norm": 6.921684265136719,
|
308 |
+
"learning_rate": 2e-05,
|
309 |
+
"loss": 0.4361,
|
310 |
+
"step": 86
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"epoch": 0.9072164948453608,
|
314 |
+
"grad_norm": 0.00928011815994978,
|
315 |
+
"learning_rate": 2e-05,
|
316 |
+
"loss": 0.0099,
|
317 |
+
"step": 88
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"epoch": 0.9278350515463918,
|
321 |
+
"grad_norm": 1.7090333700180054,
|
322 |
+
"learning_rate": 2e-05,
|
323 |
+
"loss": 0.1191,
|
324 |
+
"step": 90
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"epoch": 0.9484536082474226,
|
328 |
+
"grad_norm": 0.10394242405891418,
|
329 |
+
"learning_rate": 2e-05,
|
330 |
+
"loss": 0.0202,
|
331 |
+
"step": 92
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"epoch": 0.9690721649484536,
|
335 |
+
"grad_norm": 0.04304055869579315,
|
336 |
+
"learning_rate": 2e-05,
|
337 |
+
"loss": 0.0018,
|
338 |
+
"step": 94
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"epoch": 0.9896907216494846,
|
342 |
+
"grad_norm": 0.20713742077350616,
|
343 |
+
"learning_rate": 2e-05,
|
344 |
+
"loss": 0.0132,
|
345 |
+
"step": 96
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"epoch": 1.0,
|
349 |
+
"step": 97,
|
350 |
+
"total_flos": 2143940014768128.0,
|
351 |
+
"train_loss": 0.08341487873460829,
|
352 |
+
"train_runtime": 270.9164,
|
353 |
+
"train_samples_per_second": 1.432,
|
354 |
+
"train_steps_per_second": 0.358
|
355 |
+
}
|
356 |
+
],
|
357 |
+
"logging_steps": 2,
|
358 |
+
"max_steps": 97,
|
359 |
+
"num_input_tokens_seen": 0,
|
360 |
+
"num_train_epochs": 1,
|
361 |
+
"save_steps": 500,
|
362 |
+
"stateful_callbacks": {
|
363 |
+
"TrainerControl": {
|
364 |
+
"args": {
|
365 |
+
"should_epoch_stop": false,
|
366 |
+
"should_evaluate": false,
|
367 |
+
"should_log": false,
|
368 |
+
"should_save": false,
|
369 |
+
"should_training_stop": false
|
370 |
+
},
|
371 |
+
"attributes": {}
|
372 |
+
}
|
373 |
+
},
|
374 |
+
"total_flos": 2143940014768128.0,
|
375 |
+
"train_batch_size": 1,
|
376 |
+
"trial_name": null,
|
377 |
+
"trial_params": null
|
378 |
+
}
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round10.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c8f7f9d641a604368bf8d0e7f00091e0be9859ac869d4b8054b390fa12177e3f
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round12.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d2d403dbdef0fd93809aadc60b9f7ae19c09b3cf80504dcaca4e989520caa280
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round15.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ed0479ad680594c42187a2d37e97e45cb9ab21768df8b1d2ad47b75fc9363a3e
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round17.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c599dfa725ee23a478848ed8752f9bbe83b3dd481eb78ebf78eb6f1d05c68aac
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round2.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:52c020b0f544305ea9115132a268be550f348fc8138874cca2695fad374b6040
|
3 |
+
size 794706058
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round20.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3ac62e7a55b58292ccb5668dfb8b1e461a0c99f652b9a0735b19dc2923299501
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round5.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:478bf976c320e8c5c811bb0fede5b8ddae2aaeb213094fef2bad1f8edd4ae8ff
|
3 |
+
size 794706058
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_client_model_round7.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ec3fcadb5bde76ead584df8fbc7c7fc89c48ca61b07f74e41f427c531412c6dd
|
3 |
+
size 794706058
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/2_trainer_state.json
ADDED
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 1.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 97,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.020618556701030927,
|
13 |
+
"grad_norm": 0.07272805273532867,
|
14 |
+
"learning_rate": 2e-05,
|
15 |
+
"loss": 0.1917,
|
16 |
+
"step": 2
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.041237113402061855,
|
20 |
+
"grad_norm": 0.7147791385650635,
|
21 |
+
"learning_rate": 2e-05,
|
22 |
+
"loss": 0.7121,
|
23 |
+
"step": 4
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.061855670103092786,
|
27 |
+
"grad_norm": 3.7243456840515137,
|
28 |
+
"learning_rate": 2e-05,
|
29 |
+
"loss": 1.3252,
|
30 |
+
"step": 6
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.08247422680412371,
|
34 |
+
"grad_norm": 1.936738133430481,
|
35 |
+
"learning_rate": 2e-05,
|
36 |
+
"loss": 1.1401,
|
37 |
+
"step": 8
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.10309278350515463,
|
41 |
+
"grad_norm": 1.6566787958145142,
|
42 |
+
"learning_rate": 2e-05,
|
43 |
+
"loss": 0.7242,
|
44 |
+
"step": 10
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.12371134020618557,
|
48 |
+
"grad_norm": 2.212766408920288,
|
49 |
+
"learning_rate": 2e-05,
|
50 |
+
"loss": 0.4594,
|
51 |
+
"step": 12
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.14432989690721648,
|
55 |
+
"grad_norm": 0.575031042098999,
|
56 |
+
"learning_rate": 2e-05,
|
57 |
+
"loss": 0.4458,
|
58 |
+
"step": 14
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.16494845360824742,
|
62 |
+
"grad_norm": 0.3744853138923645,
|
63 |
+
"learning_rate": 2e-05,
|
64 |
+
"loss": 0.1401,
|
65 |
+
"step": 16
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.18556701030927836,
|
69 |
+
"grad_norm": 2.374159812927246,
|
70 |
+
"learning_rate": 2e-05,
|
71 |
+
"loss": 2.0944,
|
72 |
+
"step": 18
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.20618556701030927,
|
76 |
+
"grad_norm": 2.35343337059021,
|
77 |
+
"learning_rate": 2e-05,
|
78 |
+
"loss": 1.1046,
|
79 |
+
"step": 20
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.2268041237113402,
|
83 |
+
"grad_norm": 1.7141731977462769,
|
84 |
+
"learning_rate": 2e-05,
|
85 |
+
"loss": 0.4615,
|
86 |
+
"step": 22
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.24742268041237114,
|
90 |
+
"grad_norm": 0.3044542968273163,
|
91 |
+
"learning_rate": 2e-05,
|
92 |
+
"loss": 0.1688,
|
93 |
+
"step": 24
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.26804123711340205,
|
97 |
+
"grad_norm": 2.698765754699707,
|
98 |
+
"learning_rate": 2e-05,
|
99 |
+
"loss": 0.9064,
|
100 |
+
"step": 26
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.28865979381443296,
|
104 |
+
"grad_norm": 0.7965202927589417,
|
105 |
+
"learning_rate": 2e-05,
|
106 |
+
"loss": 0.1694,
|
107 |
+
"step": 28
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.30927835051546393,
|
111 |
+
"grad_norm": 1.8384367227554321,
|
112 |
+
"learning_rate": 2e-05,
|
113 |
+
"loss": 1.0756,
|
114 |
+
"step": 30
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.32989690721649484,
|
118 |
+
"grad_norm": 0.3519437909126282,
|
119 |
+
"learning_rate": 2e-05,
|
120 |
+
"loss": 0.2709,
|
121 |
+
"step": 32
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.35051546391752575,
|
125 |
+
"grad_norm": 2.9105238914489746,
|
126 |
+
"learning_rate": 2e-05,
|
127 |
+
"loss": 1.6573,
|
128 |
+
"step": 34
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.3711340206185567,
|
132 |
+
"grad_norm": 0.6958692669868469,
|
133 |
+
"learning_rate": 2e-05,
|
134 |
+
"loss": 0.3584,
|
135 |
+
"step": 36
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.3917525773195876,
|
139 |
+
"grad_norm": 0.4390473961830139,
|
140 |
+
"learning_rate": 2e-05,
|
141 |
+
"loss": 0.2845,
|
142 |
+
"step": 38
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.41237113402061853,
|
146 |
+
"grad_norm": 0.6055613160133362,
|
147 |
+
"learning_rate": 2e-05,
|
148 |
+
"loss": 0.3944,
|
149 |
+
"step": 40
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.4329896907216495,
|
153 |
+
"grad_norm": 0.7648600935935974,
|
154 |
+
"learning_rate": 2e-05,
|
155 |
+
"loss": 0.3706,
|
156 |
+
"step": 42
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.4536082474226804,
|
160 |
+
"grad_norm": 2.231801748275757,
|
161 |
+
"learning_rate": 2e-05,
|
162 |
+
"loss": 1.2567,
|
163 |
+
"step": 44
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.4742268041237113,
|
167 |
+
"grad_norm": 0.49553102254867554,
|
168 |
+
"learning_rate": 2e-05,
|
169 |
+
"loss": 0.2562,
|
170 |
+
"step": 46
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.4948453608247423,
|
174 |
+
"grad_norm": 0.6220466494560242,
|
175 |
+
"learning_rate": 2e-05,
|
176 |
+
"loss": 0.8419,
|
177 |
+
"step": 48
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.5154639175257731,
|
181 |
+
"grad_norm": 0.9273183941841125,
|
182 |
+
"learning_rate": 2e-05,
|
183 |
+
"loss": 0.3708,
|
184 |
+
"step": 50
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.5360824742268041,
|
188 |
+
"grad_norm": 1.8283573389053345,
|
189 |
+
"learning_rate": 2e-05,
|
190 |
+
"loss": 0.433,
|
191 |
+
"step": 52
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.5567010309278351,
|
195 |
+
"grad_norm": 1.1112104654312134,
|
196 |
+
"learning_rate": 2e-05,
|
197 |
+
"loss": 0.366,
|
198 |
+
"step": 54
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.5773195876288659,
|
202 |
+
"grad_norm": 0.7584994435310364,
|
203 |
+
"learning_rate": 2e-05,
|
204 |
+
"loss": 0.2889,
|
205 |
+
"step": 56
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.5979381443298969,
|
209 |
+
"grad_norm": 2.2555992603302,
|
210 |
+
"learning_rate": 2e-05,
|
211 |
+
"loss": 0.8371,
|
212 |
+
"step": 58
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.6185567010309279,
|
216 |
+
"grad_norm": 0.22240255773067474,
|
217 |
+
"learning_rate": 2e-05,
|
218 |
+
"loss": 0.3454,
|
219 |
+
"step": 60
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 0.6391752577319587,
|
223 |
+
"grad_norm": 0.875527024269104,
|
224 |
+
"learning_rate": 2e-05,
|
225 |
+
"loss": 0.6807,
|
226 |
+
"step": 62
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 0.6597938144329897,
|
230 |
+
"grad_norm": 1.1137892007827759,
|
231 |
+
"learning_rate": 2e-05,
|
232 |
+
"loss": 0.3103,
|
233 |
+
"step": 64
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 0.6804123711340206,
|
237 |
+
"grad_norm": 0.6312543749809265,
|
238 |
+
"learning_rate": 2e-05,
|
239 |
+
"loss": 0.4288,
|
240 |
+
"step": 66
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"epoch": 0.7010309278350515,
|
244 |
+
"grad_norm": 0.4749165177345276,
|
245 |
+
"learning_rate": 2e-05,
|
246 |
+
"loss": 0.1926,
|
247 |
+
"step": 68
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"epoch": 0.7216494845360825,
|
251 |
+
"grad_norm": 0.6268261671066284,
|
252 |
+
"learning_rate": 2e-05,
|
253 |
+
"loss": 0.3698,
|
254 |
+
"step": 70
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"epoch": 0.7422680412371134,
|
258 |
+
"grad_norm": 2.9238345623016357,
|
259 |
+
"learning_rate": 2e-05,
|
260 |
+
"loss": 1.5883,
|
261 |
+
"step": 72
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"epoch": 0.7628865979381443,
|
265 |
+
"grad_norm": 2.378034830093384,
|
266 |
+
"learning_rate": 2e-05,
|
267 |
+
"loss": 1.1422,
|
268 |
+
"step": 74
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"epoch": 0.7835051546391752,
|
272 |
+
"grad_norm": 0.7543140053749084,
|
273 |
+
"learning_rate": 2e-05,
|
274 |
+
"loss": 0.4975,
|
275 |
+
"step": 76
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"epoch": 0.8041237113402062,
|
279 |
+
"grad_norm": 2.226060152053833,
|
280 |
+
"learning_rate": 2e-05,
|
281 |
+
"loss": 0.4604,
|
282 |
+
"step": 78
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"epoch": 0.8247422680412371,
|
286 |
+
"grad_norm": 0.058547962456941605,
|
287 |
+
"learning_rate": 2e-05,
|
288 |
+
"loss": 0.9361,
|
289 |
+
"step": 80
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"epoch": 0.845360824742268,
|
293 |
+
"grad_norm": 0.3901329040527344,
|
294 |
+
"learning_rate": 2e-05,
|
295 |
+
"loss": 0.1839,
|
296 |
+
"step": 82
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"epoch": 0.865979381443299,
|
300 |
+
"grad_norm": 0.6894042491912842,
|
301 |
+
"learning_rate": 2e-05,
|
302 |
+
"loss": 0.5983,
|
303 |
+
"step": 84
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"epoch": 0.8865979381443299,
|
307 |
+
"grad_norm": 0.11256992816925049,
|
308 |
+
"learning_rate": 2e-05,
|
309 |
+
"loss": 1.1,
|
310 |
+
"step": 86
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"epoch": 0.9072164948453608,
|
314 |
+
"grad_norm": 1.3978410959243774,
|
315 |
+
"learning_rate": 2e-05,
|
316 |
+
"loss": 0.7952,
|
317 |
+
"step": 88
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"epoch": 0.9278350515463918,
|
321 |
+
"grad_norm": 1.851767659187317,
|
322 |
+
"learning_rate": 2e-05,
|
323 |
+
"loss": 0.7418,
|
324 |
+
"step": 90
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"epoch": 0.9484536082474226,
|
328 |
+
"grad_norm": 0.6821492910385132,
|
329 |
+
"learning_rate": 2e-05,
|
330 |
+
"loss": 0.2547,
|
331 |
+
"step": 92
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"epoch": 0.9690721649484536,
|
335 |
+
"grad_norm": 0.10563112795352936,
|
336 |
+
"learning_rate": 2e-05,
|
337 |
+
"loss": 0.1207,
|
338 |
+
"step": 94
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"epoch": 0.9896907216494846,
|
342 |
+
"grad_norm": 1.140236735343933,
|
343 |
+
"learning_rate": 2e-05,
|
344 |
+
"loss": 1.6027,
|
345 |
+
"step": 96
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"epoch": 1.0,
|
349 |
+
"step": 97,
|
350 |
+
"total_flos": 5055426831843328.0,
|
351 |
+
"train_loss": 0.6651097327163539,
|
352 |
+
"train_runtime": 434.3501,
|
353 |
+
"train_samples_per_second": 0.893,
|
354 |
+
"train_steps_per_second": 0.223
|
355 |
+
}
|
356 |
+
],
|
357 |
+
"logging_steps": 2,
|
358 |
+
"max_steps": 97,
|
359 |
+
"num_input_tokens_seen": 0,
|
360 |
+
"num_train_epochs": 1,
|
361 |
+
"save_steps": 500,
|
362 |
+
"stateful_callbacks": {
|
363 |
+
"TrainerControl": {
|
364 |
+
"args": {
|
365 |
+
"should_epoch_stop": false,
|
366 |
+
"should_evaluate": false,
|
367 |
+
"should_log": false,
|
368 |
+
"should_save": false,
|
369 |
+
"should_training_stop": false
|
370 |
+
},
|
371 |
+
"attributes": {}
|
372 |
+
}
|
373 |
+
},
|
374 |
+
"total_flos": 5055426831843328.0,
|
375 |
+
"train_batch_size": 1,
|
376 |
+
"trial_name": null,
|
377 |
+
"trial_params": null
|
378 |
+
}
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round10.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7b8821c785135aa7681f831baee4c8faa6da0bf34abcf61a74c878a10678fc5a
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round12.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:18fa74d12e99610bc0ba01fcc22126ffaa1e7889f91ec44904c61c9a84bb0197
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round15.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:076e147af133bf2506e43a1eaf6d6c2f0188696b68debb8e594ea3dc13963f30
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round17.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:405a042f3c8bba329b51f7d518f0dfc03f6e32f11911d86934dd301ce82bb2f6
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round2.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c0be93ffc19967965abe2fef04a4f9b6c4b7bf10e0659f09f5704bbbc23cc55
|
3 |
+
size 369837282
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round20.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5ff9c1bac57727b36b4f19b0a5528833b27575970e5abb9481eef6061d65761d
|
3 |
+
size 369838470
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round5.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:988261b37d63335deb0ab84f3c8d6341bbb6c1433375cf8e6799d997da572465
|
3 |
+
size 369837282
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_client_model_round7.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b4cf5a7e90df055317a883596df00673fbebe9169a545fa5a2123618e7c68207
|
3 |
+
size 369837282
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/3_trainer_state.json
ADDED
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 1.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 97,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.020618556701030927,
|
13 |
+
"grad_norm": 3.5025784969329834,
|
14 |
+
"learning_rate": 2e-05,
|
15 |
+
"loss": 0.6125,
|
16 |
+
"step": 2
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.041237113402061855,
|
20 |
+
"grad_norm": 1.6424089670181274,
|
21 |
+
"learning_rate": 2e-05,
|
22 |
+
"loss": 1.3566,
|
23 |
+
"step": 4
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.061855670103092786,
|
27 |
+
"grad_norm": 4.314863681793213,
|
28 |
+
"learning_rate": 2e-05,
|
29 |
+
"loss": 1.3257,
|
30 |
+
"step": 6
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.08247422680412371,
|
34 |
+
"grad_norm": 1.1254048347473145,
|
35 |
+
"learning_rate": 2e-05,
|
36 |
+
"loss": 2.5388,
|
37 |
+
"step": 8
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.10309278350515463,
|
41 |
+
"grad_norm": 1.760192632675171,
|
42 |
+
"learning_rate": 2e-05,
|
43 |
+
"loss": 1.3429,
|
44 |
+
"step": 10
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.12371134020618557,
|
48 |
+
"grad_norm": 2.004739761352539,
|
49 |
+
"learning_rate": 2e-05,
|
50 |
+
"loss": 1.2,
|
51 |
+
"step": 12
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.14432989690721648,
|
55 |
+
"grad_norm": 1.0539671182632446,
|
56 |
+
"learning_rate": 2e-05,
|
57 |
+
"loss": 0.6582,
|
58 |
+
"step": 14
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.16494845360824742,
|
62 |
+
"grad_norm": 4.009338855743408,
|
63 |
+
"learning_rate": 2e-05,
|
64 |
+
"loss": 1.0883,
|
65 |
+
"step": 16
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.18556701030927836,
|
69 |
+
"grad_norm": 4.920736312866211,
|
70 |
+
"learning_rate": 2e-05,
|
71 |
+
"loss": 0.8855,
|
72 |
+
"step": 18
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.20618556701030927,
|
76 |
+
"grad_norm": 2.5970003604888916,
|
77 |
+
"learning_rate": 2e-05,
|
78 |
+
"loss": 1.0158,
|
79 |
+
"step": 20
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.2268041237113402,
|
83 |
+
"grad_norm": 2.9520134925842285,
|
84 |
+
"learning_rate": 2e-05,
|
85 |
+
"loss": 0.726,
|
86 |
+
"step": 22
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.24742268041237114,
|
90 |
+
"grad_norm": 1.7787227630615234,
|
91 |
+
"learning_rate": 2e-05,
|
92 |
+
"loss": 1.116,
|
93 |
+
"step": 24
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.26804123711340205,
|
97 |
+
"grad_norm": 3.36810040473938,
|
98 |
+
"learning_rate": 2e-05,
|
99 |
+
"loss": 1.2355,
|
100 |
+
"step": 26
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.28865979381443296,
|
104 |
+
"grad_norm": 0.9433608651161194,
|
105 |
+
"learning_rate": 2e-05,
|
106 |
+
"loss": 0.6505,
|
107 |
+
"step": 28
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.30927835051546393,
|
111 |
+
"grad_norm": 2.1001124382019043,
|
112 |
+
"learning_rate": 2e-05,
|
113 |
+
"loss": 0.7527,
|
114 |
+
"step": 30
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.32989690721649484,
|
118 |
+
"grad_norm": 2.966327428817749,
|
119 |
+
"learning_rate": 2e-05,
|
120 |
+
"loss": 1.2433,
|
121 |
+
"step": 32
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.35051546391752575,
|
125 |
+
"grad_norm": 3.646400213241577,
|
126 |
+
"learning_rate": 2e-05,
|
127 |
+
"loss": 1.3473,
|
128 |
+
"step": 34
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.3711340206185567,
|
132 |
+
"grad_norm": 1.8998469114303589,
|
133 |
+
"learning_rate": 2e-05,
|
134 |
+
"loss": 0.7622,
|
135 |
+
"step": 36
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.3917525773195876,
|
139 |
+
"grad_norm": 0.9765092134475708,
|
140 |
+
"learning_rate": 2e-05,
|
141 |
+
"loss": 0.7605,
|
142 |
+
"step": 38
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.41237113402061853,
|
146 |
+
"grad_norm": 2.9249725341796875,
|
147 |
+
"learning_rate": 2e-05,
|
148 |
+
"loss": 0.8117,
|
149 |
+
"step": 40
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.4329896907216495,
|
153 |
+
"grad_norm": 1.4821889400482178,
|
154 |
+
"learning_rate": 2e-05,
|
155 |
+
"loss": 0.4398,
|
156 |
+
"step": 42
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.4536082474226804,
|
160 |
+
"grad_norm": 1.9430099725723267,
|
161 |
+
"learning_rate": 2e-05,
|
162 |
+
"loss": 0.6923,
|
163 |
+
"step": 44
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.4742268041237113,
|
167 |
+
"grad_norm": 1.3418391942977905,
|
168 |
+
"learning_rate": 2e-05,
|
169 |
+
"loss": 0.4962,
|
170 |
+
"step": 46
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.4948453608247423,
|
174 |
+
"grad_norm": 1.5497465133666992,
|
175 |
+
"learning_rate": 2e-05,
|
176 |
+
"loss": 0.7783,
|
177 |
+
"step": 48
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.5154639175257731,
|
181 |
+
"grad_norm": 4.256335258483887,
|
182 |
+
"learning_rate": 2e-05,
|
183 |
+
"loss": 1.774,
|
184 |
+
"step": 50
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.5360824742268041,
|
188 |
+
"grad_norm": 1.0099101066589355,
|
189 |
+
"learning_rate": 2e-05,
|
190 |
+
"loss": 1.0536,
|
191 |
+
"step": 52
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.5567010309278351,
|
195 |
+
"grad_norm": 1.356179118156433,
|
196 |
+
"learning_rate": 2e-05,
|
197 |
+
"loss": 0.9636,
|
198 |
+
"step": 54
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.5773195876288659,
|
202 |
+
"grad_norm": 4.572417259216309,
|
203 |
+
"learning_rate": 2e-05,
|
204 |
+
"loss": 1.398,
|
205 |
+
"step": 56
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.5979381443298969,
|
209 |
+
"grad_norm": 4.678338050842285,
|
210 |
+
"learning_rate": 2e-05,
|
211 |
+
"loss": 2.1435,
|
212 |
+
"step": 58
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.6185567010309279,
|
216 |
+
"grad_norm": 3.575714588165283,
|
217 |
+
"learning_rate": 2e-05,
|
218 |
+
"loss": 1.9645,
|
219 |
+
"step": 60
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 0.6391752577319587,
|
223 |
+
"grad_norm": 2.5277392864227295,
|
224 |
+
"learning_rate": 2e-05,
|
225 |
+
"loss": 0.6022,
|
226 |
+
"step": 62
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 0.6597938144329897,
|
230 |
+
"grad_norm": 3.385993003845215,
|
231 |
+
"learning_rate": 2e-05,
|
232 |
+
"loss": 0.7407,
|
233 |
+
"step": 64
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 0.6804123711340206,
|
237 |
+
"grad_norm": 5.360639572143555,
|
238 |
+
"learning_rate": 2e-05,
|
239 |
+
"loss": 2.6118,
|
240 |
+
"step": 66
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"epoch": 0.7010309278350515,
|
244 |
+
"grad_norm": 6.285584926605225,
|
245 |
+
"learning_rate": 2e-05,
|
246 |
+
"loss": 1.1991,
|
247 |
+
"step": 68
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"epoch": 0.7216494845360825,
|
251 |
+
"grad_norm": 3.385345697402954,
|
252 |
+
"learning_rate": 2e-05,
|
253 |
+
"loss": 1.0076,
|
254 |
+
"step": 70
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"epoch": 0.7422680412371134,
|
258 |
+
"grad_norm": 2.7984771728515625,
|
259 |
+
"learning_rate": 2e-05,
|
260 |
+
"loss": 0.8717,
|
261 |
+
"step": 72
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"epoch": 0.7628865979381443,
|
265 |
+
"grad_norm": 1.8088620901107788,
|
266 |
+
"learning_rate": 2e-05,
|
267 |
+
"loss": 1.9135,
|
268 |
+
"step": 74
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"epoch": 0.7835051546391752,
|
272 |
+
"grad_norm": 1.2181267738342285,
|
273 |
+
"learning_rate": 2e-05,
|
274 |
+
"loss": 1.112,
|
275 |
+
"step": 76
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"epoch": 0.8041237113402062,
|
279 |
+
"grad_norm": 2.389383316040039,
|
280 |
+
"learning_rate": 2e-05,
|
281 |
+
"loss": 0.896,
|
282 |
+
"step": 78
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"epoch": 0.8247422680412371,
|
286 |
+
"grad_norm": 1.2696701288223267,
|
287 |
+
"learning_rate": 2e-05,
|
288 |
+
"loss": 0.5581,
|
289 |
+
"step": 80
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"epoch": 0.845360824742268,
|
293 |
+
"grad_norm": 4.296065330505371,
|
294 |
+
"learning_rate": 2e-05,
|
295 |
+
"loss": 1.6235,
|
296 |
+
"step": 82
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"epoch": 0.865979381443299,
|
300 |
+
"grad_norm": 5.560433387756348,
|
301 |
+
"learning_rate": 2e-05,
|
302 |
+
"loss": 1.7516,
|
303 |
+
"step": 84
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"epoch": 0.8865979381443299,
|
307 |
+
"grad_norm": 1.6488584280014038,
|
308 |
+
"learning_rate": 2e-05,
|
309 |
+
"loss": 0.7037,
|
310 |
+
"step": 86
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"epoch": 0.9072164948453608,
|
314 |
+
"grad_norm": 1.2957154512405396,
|
315 |
+
"learning_rate": 2e-05,
|
316 |
+
"loss": 0.5997,
|
317 |
+
"step": 88
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"epoch": 0.9278350515463918,
|
321 |
+
"grad_norm": 2.889270305633545,
|
322 |
+
"learning_rate": 2e-05,
|
323 |
+
"loss": 1.9081,
|
324 |
+
"step": 90
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"epoch": 0.9484536082474226,
|
328 |
+
"grad_norm": 3.077247381210327,
|
329 |
+
"learning_rate": 2e-05,
|
330 |
+
"loss": 0.8997,
|
331 |
+
"step": 92
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"epoch": 0.9690721649484536,
|
335 |
+
"grad_norm": 4.6420979499816895,
|
336 |
+
"learning_rate": 2e-05,
|
337 |
+
"loss": 0.8319,
|
338 |
+
"step": 94
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"epoch": 0.9896907216494846,
|
342 |
+
"grad_norm": 2.454413890838623,
|
343 |
+
"learning_rate": 2e-05,
|
344 |
+
"loss": 0.9373,
|
345 |
+
"step": 96
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"epoch": 1.0,
|
349 |
+
"step": 97,
|
350 |
+
"total_flos": 2171896458117120.0,
|
351 |
+
"train_loss": 1.122876275445997,
|
352 |
+
"train_runtime": 270.0103,
|
353 |
+
"train_samples_per_second": 1.437,
|
354 |
+
"train_steps_per_second": 0.359
|
355 |
+
}
|
356 |
+
],
|
357 |
+
"logging_steps": 2,
|
358 |
+
"max_steps": 97,
|
359 |
+
"num_input_tokens_seen": 0,
|
360 |
+
"num_train_epochs": 1,
|
361 |
+
"save_steps": 500,
|
362 |
+
"stateful_callbacks": {
|
363 |
+
"TrainerControl": {
|
364 |
+
"args": {
|
365 |
+
"should_epoch_stop": false,
|
366 |
+
"should_evaluate": false,
|
367 |
+
"should_log": false,
|
368 |
+
"should_save": false,
|
369 |
+
"should_training_stop": false
|
370 |
+
},
|
371 |
+
"attributes": {}
|
372 |
+
}
|
373 |
+
},
|
374 |
+
"total_flos": 2171896458117120.0,
|
375 |
+
"train_batch_size": 1,
|
376 |
+
"trial_name": null,
|
377 |
+
"trial_params": null
|
378 |
+
}
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round10.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:23387506e7ecf8f3251f0025829a56baa1bf2f769ac9bfa4fdcc90e01a070768
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round12.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0acdddfcb37d7b530feeec8001857fec2a659c79f2cf1f4c55c44a9f66e5ae63
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round15.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cede0a0fcd2006ffaa32ece4315df4e5adb2865e91e0f13e75df63d528db0cb2
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round17.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6eb53b8d130aba5244f53331294cf05558d5bfa3cc6c8190cc13b230d1d64a46
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round2.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5c08a07e6cd3316c5ec6b2ccd96a9617354d3e87001a2805ad3d3e3f83f88291
|
3 |
+
size 794706058
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round20.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0592e6c0f8a48ad175749a2b569c9f722087166f0892e342960007825ba90a03
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round5.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4df776933e0bbc0cdc681ab9592b09d9fe10f92c523f0383b464947c0e593677
|
3 |
+
size 794706058
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_client_model_round7.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7f820f6c817a919f8f9d184c0b292b5664efef2f06579de05ae74f87c093aa64
|
3 |
+
size 794706058
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/4_trainer_state.json
ADDED
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 1.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 97,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.020618556701030927,
|
13 |
+
"grad_norm": 2.518491506576538,
|
14 |
+
"learning_rate": 2e-05,
|
15 |
+
"loss": 0.8582,
|
16 |
+
"step": 2
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.041237113402061855,
|
20 |
+
"grad_norm": 1.6445577144622803,
|
21 |
+
"learning_rate": 2e-05,
|
22 |
+
"loss": 1.1082,
|
23 |
+
"step": 4
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.061855670103092786,
|
27 |
+
"grad_norm": 1.276530146598816,
|
28 |
+
"learning_rate": 2e-05,
|
29 |
+
"loss": 0.9062,
|
30 |
+
"step": 6
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.08247422680412371,
|
34 |
+
"grad_norm": 1.4562397003173828,
|
35 |
+
"learning_rate": 2e-05,
|
36 |
+
"loss": 1.2294,
|
37 |
+
"step": 8
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.10309278350515463,
|
41 |
+
"grad_norm": 1.4720100164413452,
|
42 |
+
"learning_rate": 2e-05,
|
43 |
+
"loss": 0.4722,
|
44 |
+
"step": 10
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.12371134020618557,
|
48 |
+
"grad_norm": 0.8212082386016846,
|
49 |
+
"learning_rate": 2e-05,
|
50 |
+
"loss": 0.3804,
|
51 |
+
"step": 12
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.14432989690721648,
|
55 |
+
"grad_norm": 1.2318272590637207,
|
56 |
+
"learning_rate": 2e-05,
|
57 |
+
"loss": 0.4925,
|
58 |
+
"step": 14
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.16494845360824742,
|
62 |
+
"grad_norm": 1.513741374015808,
|
63 |
+
"learning_rate": 2e-05,
|
64 |
+
"loss": 0.9536,
|
65 |
+
"step": 16
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.18556701030927836,
|
69 |
+
"grad_norm": 0.5804106593132019,
|
70 |
+
"learning_rate": 2e-05,
|
71 |
+
"loss": 0.8147,
|
72 |
+
"step": 18
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.20618556701030927,
|
76 |
+
"grad_norm": 1.2517503499984741,
|
77 |
+
"learning_rate": 2e-05,
|
78 |
+
"loss": 0.4827,
|
79 |
+
"step": 20
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.2268041237113402,
|
83 |
+
"grad_norm": 0.6721848249435425,
|
84 |
+
"learning_rate": 2e-05,
|
85 |
+
"loss": 0.3412,
|
86 |
+
"step": 22
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.24742268041237114,
|
90 |
+
"grad_norm": 1.1781041622161865,
|
91 |
+
"learning_rate": 2e-05,
|
92 |
+
"loss": 0.872,
|
93 |
+
"step": 24
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.26804123711340205,
|
97 |
+
"grad_norm": 1.1887950897216797,
|
98 |
+
"learning_rate": 2e-05,
|
99 |
+
"loss": 0.5626,
|
100 |
+
"step": 26
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.28865979381443296,
|
104 |
+
"grad_norm": 1.4811246395111084,
|
105 |
+
"learning_rate": 2e-05,
|
106 |
+
"loss": 0.4312,
|
107 |
+
"step": 28
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.30927835051546393,
|
111 |
+
"grad_norm": 1.3950275182724,
|
112 |
+
"learning_rate": 2e-05,
|
113 |
+
"loss": 0.5554,
|
114 |
+
"step": 30
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.32989690721649484,
|
118 |
+
"grad_norm": 0.8101319670677185,
|
119 |
+
"learning_rate": 2e-05,
|
120 |
+
"loss": 0.2638,
|
121 |
+
"step": 32
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.35051546391752575,
|
125 |
+
"grad_norm": 1.3041068315505981,
|
126 |
+
"learning_rate": 2e-05,
|
127 |
+
"loss": 1.154,
|
128 |
+
"step": 34
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.3711340206185567,
|
132 |
+
"grad_norm": 1.0701684951782227,
|
133 |
+
"learning_rate": 2e-05,
|
134 |
+
"loss": 0.7192,
|
135 |
+
"step": 36
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.3917525773195876,
|
139 |
+
"grad_norm": 1.6231181621551514,
|
140 |
+
"learning_rate": 2e-05,
|
141 |
+
"loss": 0.7025,
|
142 |
+
"step": 38
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.41237113402061853,
|
146 |
+
"grad_norm": 1.8748000860214233,
|
147 |
+
"learning_rate": 2e-05,
|
148 |
+
"loss": 1.6573,
|
149 |
+
"step": 40
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.4329896907216495,
|
153 |
+
"grad_norm": 0.7263919711112976,
|
154 |
+
"learning_rate": 2e-05,
|
155 |
+
"loss": 0.8162,
|
156 |
+
"step": 42
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.4536082474226804,
|
160 |
+
"grad_norm": 1.2520265579223633,
|
161 |
+
"learning_rate": 2e-05,
|
162 |
+
"loss": 0.8266,
|
163 |
+
"step": 44
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.4742268041237113,
|
167 |
+
"grad_norm": 0.34067195653915405,
|
168 |
+
"learning_rate": 2e-05,
|
169 |
+
"loss": 0.4451,
|
170 |
+
"step": 46
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.4948453608247423,
|
174 |
+
"grad_norm": 1.4540058374404907,
|
175 |
+
"learning_rate": 2e-05,
|
176 |
+
"loss": 0.6771,
|
177 |
+
"step": 48
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.5154639175257731,
|
181 |
+
"grad_norm": 1.0151292085647583,
|
182 |
+
"learning_rate": 2e-05,
|
183 |
+
"loss": 0.8007,
|
184 |
+
"step": 50
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.5360824742268041,
|
188 |
+
"grad_norm": 1.1358588933944702,
|
189 |
+
"learning_rate": 2e-05,
|
190 |
+
"loss": 0.2688,
|
191 |
+
"step": 52
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.5567010309278351,
|
195 |
+
"grad_norm": 0.9416270852088928,
|
196 |
+
"learning_rate": 2e-05,
|
197 |
+
"loss": 0.431,
|
198 |
+
"step": 54
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.5773195876288659,
|
202 |
+
"grad_norm": 1.288041591644287,
|
203 |
+
"learning_rate": 2e-05,
|
204 |
+
"loss": 0.3753,
|
205 |
+
"step": 56
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.5979381443298969,
|
209 |
+
"grad_norm": 0.13528066873550415,
|
210 |
+
"learning_rate": 2e-05,
|
211 |
+
"loss": 0.1003,
|
212 |
+
"step": 58
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.6185567010309279,
|
216 |
+
"grad_norm": 1.8311398029327393,
|
217 |
+
"learning_rate": 2e-05,
|
218 |
+
"loss": 0.8292,
|
219 |
+
"step": 60
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 0.6391752577319587,
|
223 |
+
"grad_norm": 2.2910356521606445,
|
224 |
+
"learning_rate": 2e-05,
|
225 |
+
"loss": 0.7834,
|
226 |
+
"step": 62
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 0.6597938144329897,
|
230 |
+
"grad_norm": 0.40395867824554443,
|
231 |
+
"learning_rate": 2e-05,
|
232 |
+
"loss": 0.5579,
|
233 |
+
"step": 64
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 0.6804123711340206,
|
237 |
+
"grad_norm": 0.6555685997009277,
|
238 |
+
"learning_rate": 2e-05,
|
239 |
+
"loss": 1.3677,
|
240 |
+
"step": 66
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"epoch": 0.7010309278350515,
|
244 |
+
"grad_norm": 0.7282531261444092,
|
245 |
+
"learning_rate": 2e-05,
|
246 |
+
"loss": 0.8838,
|
247 |
+
"step": 68
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"epoch": 0.7216494845360825,
|
251 |
+
"grad_norm": 1.5124473571777344,
|
252 |
+
"learning_rate": 2e-05,
|
253 |
+
"loss": 0.5091,
|
254 |
+
"step": 70
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"epoch": 0.7422680412371134,
|
258 |
+
"grad_norm": 1.059186339378357,
|
259 |
+
"learning_rate": 2e-05,
|
260 |
+
"loss": 0.228,
|
261 |
+
"step": 72
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"epoch": 0.7628865979381443,
|
265 |
+
"grad_norm": 1.1664392948150635,
|
266 |
+
"learning_rate": 2e-05,
|
267 |
+
"loss": 0.641,
|
268 |
+
"step": 74
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"epoch": 0.7835051546391752,
|
272 |
+
"grad_norm": 0.772824764251709,
|
273 |
+
"learning_rate": 2e-05,
|
274 |
+
"loss": 0.1895,
|
275 |
+
"step": 76
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"epoch": 0.8041237113402062,
|
279 |
+
"grad_norm": 0.9583086371421814,
|
280 |
+
"learning_rate": 2e-05,
|
281 |
+
"loss": 0.5631,
|
282 |
+
"step": 78
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"epoch": 0.8247422680412371,
|
286 |
+
"grad_norm": 2.7325258255004883,
|
287 |
+
"learning_rate": 2e-05,
|
288 |
+
"loss": 1.264,
|
289 |
+
"step": 80
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"epoch": 0.845360824742268,
|
293 |
+
"grad_norm": 0.539401650428772,
|
294 |
+
"learning_rate": 2e-05,
|
295 |
+
"loss": 0.22,
|
296 |
+
"step": 82
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"epoch": 0.865979381443299,
|
300 |
+
"grad_norm": 1.19405996799469,
|
301 |
+
"learning_rate": 2e-05,
|
302 |
+
"loss": 0.6883,
|
303 |
+
"step": 84
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"epoch": 0.8865979381443299,
|
307 |
+
"grad_norm": 1.0464004278182983,
|
308 |
+
"learning_rate": 2e-05,
|
309 |
+
"loss": 0.5503,
|
310 |
+
"step": 86
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"epoch": 0.9072164948453608,
|
314 |
+
"grad_norm": 2.1325461864471436,
|
315 |
+
"learning_rate": 2e-05,
|
316 |
+
"loss": 1.0865,
|
317 |
+
"step": 88
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"epoch": 0.9278350515463918,
|
321 |
+
"grad_norm": 2.034447431564331,
|
322 |
+
"learning_rate": 2e-05,
|
323 |
+
"loss": 0.6971,
|
324 |
+
"step": 90
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"epoch": 0.9484536082474226,
|
328 |
+
"grad_norm": 1.3602426052093506,
|
329 |
+
"learning_rate": 2e-05,
|
330 |
+
"loss": 1.1031,
|
331 |
+
"step": 92
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"epoch": 0.9690721649484536,
|
335 |
+
"grad_norm": 0.8754698634147644,
|
336 |
+
"learning_rate": 2e-05,
|
337 |
+
"loss": 0.374,
|
338 |
+
"step": 94
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"epoch": 0.9896907216494846,
|
342 |
+
"grad_norm": 1.953580617904663,
|
343 |
+
"learning_rate": 2e-05,
|
344 |
+
"loss": 0.7177,
|
345 |
+
"step": 96
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"epoch": 1.0,
|
349 |
+
"step": 97,
|
350 |
+
"total_flos": 5852858146619392.0,
|
351 |
+
"train_loss": 0.6841668984324661,
|
352 |
+
"train_runtime": 435.2797,
|
353 |
+
"train_samples_per_second": 0.891,
|
354 |
+
"train_steps_per_second": 0.223
|
355 |
+
}
|
356 |
+
],
|
357 |
+
"logging_steps": 2,
|
358 |
+
"max_steps": 97,
|
359 |
+
"num_input_tokens_seen": 0,
|
360 |
+
"num_train_epochs": 1,
|
361 |
+
"save_steps": 500,
|
362 |
+
"stateful_callbacks": {
|
363 |
+
"TrainerControl": {
|
364 |
+
"args": {
|
365 |
+
"should_epoch_stop": false,
|
366 |
+
"should_evaluate": false,
|
367 |
+
"should_log": false,
|
368 |
+
"should_save": false,
|
369 |
+
"should_training_stop": false
|
370 |
+
},
|
371 |
+
"attributes": {}
|
372 |
+
}
|
373 |
+
},
|
374 |
+
"total_flos": 5852858146619392.0,
|
375 |
+
"train_batch_size": 1,
|
376 |
+
"trial_name": null,
|
377 |
+
"trial_params": null
|
378 |
+
}
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/5_client_model_round10.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:872bf8692b034370c1c4c0f005922a1a8759668f6bd24ab1b7a8b31e15575065
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/5_client_model_round12.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:136f7c89ceb0d2c2c8c383833ddae35cdb2e3237ee03fcb28b38f411bfd09a50
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/5_client_model_round15.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b964fe05e5bbbea46bfb3de2d1319ba929f4bbf38e97ba47c990b8674af5305d
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/5_client_model_round17.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:adf2d32495d65af59172b9ad8c17f0e066399ac7f6494c618e3b788278339274
|
3 |
+
size 794708086
|
client_states_feddualMultipqfullfreeze_homoAgg_moe_NOCONT_bs4_saveoptim_lr2e-5_sc315_4tasks_5rounds_fixitr97_T0125_decay099/5_client_model_round2.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:88ffa0b5cd6e5c1d97b79db546d8c18cacf42c828b5cba45f2b950fea5897c56
|
3 |
+
size 794706058
|