sxyao committed
Commit 96bba5f · verified · 1 Parent(s): be19dc0

bugfix in loading and data processing

config.json CHANGED
@@ -34,7 +34,7 @@
   "rope_theta": 500000.0,
   "tie_word_embeddings": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.47.1",
+  "transformers_version": "4.45.2",
   "use_cache": true,
   "vocab_size": 128256
 }
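The only change to config.json is the recorded transformers_version, which moves from 4.47.1 to 4.45.2, presumably matching the environment in which the fixed checkpoint was re-saved. A minimal, non-authoritative check along these lines can flag a mismatch between the locally installed library and the version recorded in the checkpoint (the repo id below is a placeholder, not taken from this commit):

```python
# Hedged sketch: compare the installed transformers version against the
# "transformers_version" recorded in the checkpoint's config.json.
import json

import transformers
from huggingface_hub import hf_hub_download

REPO_ID = "your-namespace/your-model"  # placeholder; substitute the actual repo id

config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")
with open(config_path) as f:
    recorded = json.load(f).get("transformers_version")

print(f"checkpoint saved with transformers=={recorded}; installed: {transformers.__version__}")
```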
generation_config.json CHANGED
@@ -8,5 +8,5 @@
   ],
   "temperature": 0.6,
   "top_p": 0.9,
-  "transformers_version": "4.47.1"
+  "transformers_version": "4.45.2"
 }
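generation_config.json receives the same version bookkeeping change; the sampling defaults (temperature 0.6, top_p 0.9) are untouched. As a rough usage sketch, with a placeholder repo id and the parameters passed explicitly only for illustration (they already ship in the generation config):

```python
# Hedged sketch: sampling with the defaults recorded in generation_config.json.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

REPO_ID = "your-namespace/your-model"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(REPO_ID)
model = AutoModelForCausalLM.from_pretrained(REPO_ID, torch_dtype=torch.float32)

inputs = tokenizer("The quick brown fox", return_tensors="pt")
# do_sample=True is required for temperature/top_p to take effect.
output = model.generate(**inputs, do_sample=True, temperature=0.6, top_p=0.9, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```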
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3c3d55e6bbab3c2d2030bda1984d74ec22f23734927494f6780dd2cac85a342a
+oid sha256:d63117c4a0c4f755f31e7b16f956ddfd3d6efc4d1b1084f3da431d5831d47277
 size 4943274328
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d72811afaf35edb59ed41df2e4ae7c481e25bed3a4a5459c41264eaefefcf742
+oid sha256:29a18fc6ed6ce7e2ce393b4d692e182e3aae9e85a17d1b62520dc07b25529726
 size 1050673280
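Both safetensors shards point at new LFS objects (new sha256 oids, identical byte sizes), i.e. the weights were re-exported. If needed, a downloaded shard can be checked against the oid in its LFS pointer, since that oid is the SHA-256 of the file contents (placeholder repo id below):

```python
# Hedged sketch: hash a downloaded shard and compare it with the sha256 oid
# recorded in the git-lfs pointer shown above.
import hashlib

from huggingface_hub import hf_hub_download

path = hf_hub_download("your-namespace/your-model", "model-00001-of-00002.safetensors")  # placeholder repo id

digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)
print(digest.hexdigest())  # expected to start with d63117c4 for the updated first shard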
model.safetensors.index.json CHANGED
@@ -3,7 +3,6 @@
   "total_size": 5993930752
   },
   "weight_map": {
-  "lm_head.weight": "model-00002-of-00002.safetensors",
   "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
   "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
   "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
tokenizer_config.json CHANGED
@@ -2053,7 +2053,6 @@
  "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|eot_id|>",
-  "extra_special_tokens": {},
   "model_input_names": [
     "input_ids",
     "attention_mask"
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.912280701754386,
+  "epoch": 2.9473684210526314,
   "eval_steps": 500,
   "global_step": 84,
   "is_hyper_param_search": false
@@ -10,124 +10,124 @@
   "log_history": [
   {
   "epoch": 0.17543859649122806,
-  "grad_norm": 8.686567306518555,
+  "grad_norm": 8.90099811553955,
   "learning_rate": 1.996992941167792e-05,
-  "loss": 1.0892,
+  "loss": 1.1137,
   "step": 5
   },
   {
   "epoch": 0.3508771929824561,
-  "grad_norm": 2.756136178970337,
+  "grad_norm": 2.5175933837890625,
   "learning_rate": 1.9633708786158803e-05,
-  "loss": 0.6606,
+  "loss": 0.6094,
   "step": 10
   },
   {
   "epoch": 0.5263157894736842,
-  "grad_norm": 2.106504201889038,
+  "grad_norm": 2.091320276260376,
   "learning_rate": 1.8936326403234125e-05,
-  "loss": 0.5273,
+  "loss": 0.5038,
   "step": 15
   },
   {
   "epoch": 0.7017543859649122,
-  "grad_norm": 1.923587679862976,
+  "grad_norm": 1.8124364614486694,
   "learning_rate": 1.7903926695187595e-05,
-  "loss": 0.5017,
+  "loss": 0.484,
   "step": 20
   },
   {
   "epoch": 0.8771929824561403,
-  "grad_norm": 1.7514512538909912,
+  "grad_norm": 1.689386248588562,
   "learning_rate": 1.657521368569064e-05,
-  "loss": 0.4245,
+  "loss": 0.4102,
   "step": 25
   },
   {
-  "epoch": 1.0350877192982457,
-  "grad_norm": 1.9517344236373901,
+  "epoch": 1.0526315789473684,
+  "grad_norm": 1.687546730041504,
   "learning_rate": 1.5000000000000002e-05,
-  "loss": 0.4167,
+  "loss": 0.3906,
   "step": 30
   },
   {
-  "epoch": 1.2105263157894737,
-  "grad_norm": 1.7591421604156494,
+  "epoch": 1.2280701754385965,
+  "grad_norm": 1.8159677982330322,
   "learning_rate": 1.3237339420583213e-05,
-  "loss": 0.3214,
+  "loss": 0.3151,
   "step": 35
   },
   {
-  "epoch": 1.3859649122807016,
-  "grad_norm": 1.8223237991333008,
+  "epoch": 1.4035087719298245,
+  "grad_norm": 1.6340556144714355,
   "learning_rate": 1.1353312997501313e-05,
-  "loss": 0.3087,
+  "loss": 0.2973,
   "step": 40
   },
   {
-  "epoch": 1.5614035087719298,
-  "grad_norm": 1.5914912223815918,
+  "epoch": 1.5789473684210527,
+  "grad_norm": 1.5886197090148926,
   "learning_rate": 9.418551710895243e-06,
-  "loss": 0.2984,
+  "loss": 0.2915,
   "step": 45
   },
   {
-  "epoch": 1.736842105263158,
-  "grad_norm": 1.5275812149047852,
+  "epoch": 1.7543859649122808,
+  "grad_norm": 1.5932583808898926,
   "learning_rate": 7.505588559420188e-06,
-  "loss": 0.2997,
+  "loss": 0.2892,
   "step": 50
   },
   {
-  "epoch": 1.912280701754386,
-  "grad_norm": 1.5449497699737549,
+  "epoch": 1.9298245614035088,
+  "grad_norm": 1.5423833131790161,
   "learning_rate": 5.686139343187468e-06,
-  "loss": 0.2863,
+  "loss": 0.2769,
   "step": 55
   },
   {
-  "epoch": 2.0701754385964914,
-  "grad_norm": 1.4619590044021606,
+  "epoch": 2.1052631578947367,
+  "grad_norm": 1.4709863662719727,
   "learning_rate": 4.028414082972141e-06,
-  "loss": 0.2595,
+  "loss": 0.2353,
   "step": 60
   },
   {
-  "epoch": 2.245614035087719,
-  "grad_norm": 1.3976554870605469,
+  "epoch": 2.280701754385965,
+  "grad_norm": 1.43242347240448,
   "learning_rate": 2.594559868909956e-06,
-  "loss": 0.205,
+  "loss": 0.1994,
   "step": 65
   },
   {
-  "epoch": 2.4210526315789473,
-  "grad_norm": 1.6166017055511475,
+  "epoch": 2.456140350877193,
+  "grad_norm": 1.5272055864334106,
   "learning_rate": 1.4383310046973365e-06,
-  "loss": 0.2003,
+  "loss": 0.1864,
   "step": 70
   },
   {
-  "epoch": 2.5964912280701755,
-  "grad_norm": 1.5817341804504395,
+  "epoch": 2.6315789473684212,
+  "grad_norm": 1.5854390859603882,
   "learning_rate": 6.030737921409169e-07,
-  "loss": 0.1876,
+  "loss": 0.187,
   "step": 75
   },
   {
-  "epoch": 2.7719298245614032,
-  "grad_norm": 1.4169119596481323,
+  "epoch": 2.807017543859649,
+  "grad_norm": 1.4928103685379028,
   "learning_rate": 1.201015052319099e-07,
-  "loss": 0.1991,
+  "loss": 0.1925,
   "step": 80
   },
   {
-  "epoch": 2.912280701754386,
+  "epoch": 2.9473684210526314,
   "step": 84,
-  "total_flos": 3.1760499590823936e+16,
-  "train_loss": 0.37719883024692535,
-  "train_runtime": 851.0638,
-  "train_samples_per_second": 6.43,
-  "train_steps_per_second": 0.099
+  "total_flos": 3.214315621240013e+16,
+  "train_loss": 0.3641331955080941,
+  "train_runtime": 776.361,
+  "train_samples_per_second": 7.048,
+  "train_steps_per_second": 0.108
   }
   ],
   "logging_steps": 5,
@@ -147,7 +147,7 @@
   "attributes": {}
   }
   },
-  "total_flos": 3.1760499590823936e+16,
+  "total_flos": 3.214315621240013e+16,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:59f37abe75e212d61bc93af6a7d30d384fb5e3a575bc779bf07f5bada58d559f
-size 5624
+oid sha256:f97226cf1064ea048cf3dca3c9d390f0d7afd696e6425ee860d6d6fa14058046
+size 5496