codebyzeb committed (verified)
Commit e820a03 · 1 Parent(s): 89d84a7

Training in progress, step 20000

.gitignore ADDED
@@ -0,0 +1 @@
+ checkpoint-*/
babyslm/lexical.txt ADDED
The diff for this file is too large to render. See raw diff
 
babyslm/syntactic.txt ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.3,
+   "bos_token_id": 3,
+   "embd_pdrop": 0.3,
+   "eos_token_id": 3,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_embd": 128,
+   "n_head": 4,
+   "n_inner": 512,
+   "n_layer": 2,
+   "n_positions": 256,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.3,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.44.2",
+   "use_cache": true,
+   "vocab_size": 47
+ }
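For orientation, this config describes a very small GPT-2: 2 layers, 4 attention heads, 128-dimensional embeddings, and a 47-symbol phoneme vocabulary. A minimal sketch of rebuilding it with transformers (the config is constructed inline here rather than downloaded, since the repo id is not shown in this diff):

```python
from transformers import GPT2Config, GPT2LMHeadModel

# Rebuild the model described by config.json above: a 2-layer, 4-head GPT-2
# with 128-dim embeddings and a 47-symbol phoneme vocabulary.
config = GPT2Config(
    activation_function="gelu_new",
    bos_token_id=3,
    eos_token_id=3,
    n_embd=128,
    n_head=4,
    n_inner=512,
    n_layer=2,
    n_positions=256,
    resid_pdrop=0.3,
    embd_pdrop=0.3,
    attn_pdrop=0.3,
    vocab_size=47,
)
model = GPT2LMHeadModel(config)

# ~436K parameters, consistent with the ~1.7 MB float32 model.safetensors
# below and with the "gpt2_400k" experiment name in the hydra config.
print(sum(p.numel() for p in model.parameters()))
```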
hydra_config_1736968159.8744686.yaml ADDED
@@ -0,0 +1,45 @@
+ experiment:
+   seed: 42
+   name: gpt2_400k-full-03
+   group: childes-size-english
+   dry_run: false
+   offline_run: false
+   evaluate_segmentation: true
+   evaluate_babyslm: true
+   blimp_tasks: null
+   resume_checkpoint_path: null
+   resume_run_id: null
+ dataset:
+   name: phonemetransformers/CHILDES
+   subconfig: English
+   text_column: phonemized_utterance
+   is_phonemes: true
+   max_age: 120
+   remove_child_utterances: true
+ tokenizer:
+   name: phonemetransformers/CHILDES-English-phoneme-tokenizer
+ data_preprocessing:
+   max_input_length: 128
+   join_utts: static
+   remove_word_boundaries: true
+   subsample: null
+   subsample_type: tokens
+ model:
+   name: gpt2_lm
+   model_kwargs:
+     n_layer: 2
+     n_head: 4
+     n_embd: 128
+     n_positions: 256
+     n_inner: 512
+     resid_pdrop: 0.3
+     embd_pdrop: 0.3
+     attn_pdrop: 0.3
+ trainer:
+   batch_size: 32
+   lr: 0.001
+   num_warmup_steps: 60000
+   max_training_steps: 200000
+   logging_steps: 2000
+   save_steps: 20000
+   eval_steps: 20000
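A sketch of the data pipeline this config points at, using the dataset and tokenizer ids taken from the YAML above. The training entry point itself is not part of this commit, and the `train` split name is an assumption:

```python
from datasets import load_dataset
from transformers import AutoTokenizer

# Dataset and tokenizer ids come from the hydra config above.
dataset = load_dataset("phonemetransformers/CHILDES", "English")
tokenizer = AutoTokenizer.from_pretrained(
    "phonemetransformers/CHILDES-English-phoneme-tokenizer"
)

# The config trains on the phonemized_utterance column;
# "train" as the split name is assumed here.
sample = dataset["train"][0]["phonemized_utterance"]
print(tokenizer.tokenize(sample))
```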
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f99581e77eb40867c9e5620c3aeb88a5f7318476d053fb5992c79fa35ad13112
+ size 1745032
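This is a Git LFS pointer, not the weights themselves; the actual file is about 1.7 MB of float32 tensors. Once pulled, the state dict can be inspected with the safetensors library, for example:

```python
from safetensors.torch import load_file

# model.safetensors is stored via Git LFS; after `git lfs pull`,
# the tensors can be listed directly from the real file.
state_dict = load_file("model.safetensors")
for name, tensor in state_dict.items():
    print(name, tuple(tensor.shape))
```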
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "UTT_BOUNDARY",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "UTT_BOUNDARY",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "PAD",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "UNK",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,171 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": {
+     "strategy": "BatchLongest",
+     "direction": "Right",
+     "pad_to_multiple_of": null,
+     "pad_id": 1,
+     "pad_type_id": 0,
+     "pad_token": "PAD"
+   },
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "UNK",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "PAD",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "WORD_BOUNDARY",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "UTT_BOUNDARY",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": {
+     "type": "Sequence",
+     "normalizers": [
+       {
+         "type": "Strip",
+         "strip_left": true,
+         "strip_right": true
+       }
+     ]
+   },
+   "pre_tokenizer": {
+     "type": "Whitespace"
+   },
+   "post_processor": {
+     "type": "TemplateProcessing",
+     "single": [
+       {
+         "SpecialToken": {
+           "id": "UTT_BOUNDARY",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       }
+     ],
+     "pair": [
+       {
+         "SpecialToken": {
+           "id": "UTT_BOUNDARY",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "UTT_BOUNDARY",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "B",
+           "type_id": 1
+         }
+       }
+     ],
+     "special_tokens": {
+       "UTT_BOUNDARY": {
+         "id": "UTT_BOUNDARY",
+         "ids": [
+           3
+         ],
+         "tokens": [
+           "UTT_BOUNDARY"
+         ]
+       }
+     }
+   },
+   "decoder": null,
+   "model": {
+     "type": "WordLevel",
+     "vocab": {
+       "UNK": 0,
+       "PAD": 1,
+       "WORD_BOUNDARY": 2,
+       "UTT_BOUNDARY": 3,
+       "d̠ʒ": 4,
+       "ʌ": 5,
+       "s": 6,
+       "t": 7,
+       "l": 8,
+       "aɪ": 9,
+       "k": 10,
+       "j": 11,
+       "ʊ": 12,
+       "ɹ": 13,
+       "b": 14,
+       "æ": 15,
+       "h": 16,
+       "oʊ": 17,
+       "m": 18,
+       "iː": 19,
+       "ð": 20,
+       "ɛ": 21,
+       "z": 22,
+       "f": 23,
+       "eɪ": 24,
+       "w": 25,
+       "ɪ": 26,
+       "ɡ": 27,
+       "ɑ": 28,
+       "ə": 29,
+       "p": 30,
+       "uː": 31,
+       "i": 32,
+       "θ": 33,
+       "ŋ": 34,
+       "ɔ": 35,
+       "ɔɪ": 36,
+       "n": 37,
+       "d": 38,
+       "aʊ": 39,
+       "v": 40,
+       "ɜː": 41,
+       "t̠ʃ": 42,
+       "ʃ": 43,
+       "iə": 44,
+       "ʒ": 45,
+       "x": 46
+     },
+     "unk_token": "UNK"
+   }
+ }
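In short: a WordLevel model over 43 IPA phonemes plus 4 special tokens, whitespace pre-tokenization, and a TemplateProcessing step that prepends UTT_BOUNDARY to every sequence. A small usage sketch with the tokenizers library (the example phoneme string is illustrative):

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")

# Input is space-separated phonemes; the post-processor above
# prepends the UTT_BOUNDARY special token (id 3).
enc = tok.encode("h ə l oʊ")
print(enc.tokens)  # ['UTT_BOUNDARY', 'h', 'ə', 'l', 'oʊ']
print(enc.ids)     # [3, 16, 29, 8, 17]
```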
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "UNK",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "PAD",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "WORD_BOUNDARY",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "UTT_BOUNDARY",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "UTT_BOUNDARY",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "UTT_BOUNDARY",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "PAD",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "UNK"
+ }
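Note that bos_token and eos_token are both UTT_BOUNDARY, matching the bos_token_id and eos_token_id of 3 in config.json. A sketch of checking this through transformers (loading from a local checkout of this repo is assumed):

```python
from transformers import AutoTokenizer

# "." stands in for a local clone of this model repo.
tokenizer = AutoTokenizer.from_pretrained(".")

# Both sequence boundaries map to the same token, id 3 in config.json.
print(tokenizer.bos_token, tokenizer.eos_token)  # UTT_BOUNDARY UTT_BOUNDARY
print(tokenizer.convert_tokens_to_ids("UTT_BOUNDARY"))  # 3
```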
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50a525aaa9964ccd74835944c8e0295a4577f3674198ad5fe556036fbda748b7
+ size 5368
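Another Git LFS pointer. A training_args.bin file is conventionally a pickled transformers TrainingArguments object, so once pulled it can be inspected with torch.load (passing weights_only=False, which recent PyTorch requires for pickled non-tensor objects):

```python
import torch

# training_args.bin is a pickled object, not a tensor file,
# so weights_only=False is needed on recent PyTorch versions.
args = torch.load("training_args.bin", weights_only=False)
print(args)
```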
vocab.json ADDED
@@ -0,0 +1 @@
+ {"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"d̠ʒ":4,"ʌ":5,"s":6,"t":7,"l":8,"aɪ":9,"k":10,"j":11,"ʊ":12,"ɹ":13,"b":14,"æ":15,"h":16,"oʊ":17,"m":18,"iː":19,"ð":20,"ɛ":21,"z":22,"f":23,"eɪ":24,"w":25,"ɪ":26,"ɡ":27,"ɑ":28,"ə":29,"p":30,"uː":31,"i":32,"θ":33,"ŋ":34,"ɔ":35,"ɔɪ":36,"n":37,"d":38,"aʊ":39,"v":40,"ɜː":41,"t̠ʃ":42,"ʃ":43,"iə":44,"ʒ":45,"x":46}
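This is the same phoneme-to-id map as in tokenizer.json, stored flat. Inverting it gives an easy way to read model outputs back as IPA symbols, for example:

```python
import json

with open("vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)

# Invert the phoneme-to-id map to decode ids back into IPA symbols.
id_to_phoneme = {i: p for p, i in vocab.items()}
print([id_to_phoneme[i] for i in [3, 16, 29, 8, 17]])
# ['UTT_BOUNDARY', 'h', 'ə', 'l', 'oʊ']
```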