diff --git a/Cantonese/config.json b/Cantonese/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c9a0ae235310bfb1b5e90297697c7a5cc5169f34 --- /dev/null +++ b/Cantonese/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 152 +} diff --git a/Cantonese/generation_config.json b/Cantonese/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/Cantonese/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/Cantonese/model.safetensors b/Cantonese/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..75059ba80789d147eb884b893ebd2ad0a97913b8 --- /dev/null +++ b/Cantonese/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e80e5a76e879718c865ae642eac461076dbe1cd2c3245f386ffe6f9109d2a987 +size 3387304 diff --git a/Cantonese/special_tokens_map.json b/Cantonese/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/Cantonese/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/Cantonese/tokenizer.json b/Cantonese/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4740600ed1b2e8271f303caee92ec24bfed0ec39 --- /dev/null +++ b/Cantonese/tokenizer.json @@ -0,0 +1,269 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + 
} + ] + }, + "pre_tokenizer": { + "type": "WhitespaceSplit" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "j": 4, + "ɐ˥": 5, + "t": 6, + "k": 7, + "ɐu˧˥": 8, + "i˨": 9, + "n": 10, + "i˧˩̰": 11, + "y˨": 12, + "s": 13, + "ɐ˨": 14, + "p": 15, + "ts": 16, + "ɐu˥": 17, + "ɪ̞˧˥": 18, + "ŋ": 19, + "ɵ˧": 20, + "a̞˧": 21, + "l": 22, + "ʊ̟˥": 23, + "a̞˧˩̰": 24, + "ɛ˥": 25, + "ei˩˧": 26, + "w": 27, + "a̞˨": 28, + "ɐi˧˥": 29, + "a̞˧˥": 30, + "m̩˧˥": 31, + "m": 32, + "ou˥": 33, + "ei˥": 34, + "i˧": 35, + "ɔ̽˧˥": 36, + "tʰ": 37, + "i˥": 38, + "f": 39, + "aːĭ˧": 40, + "h": 41, + "ɵy˧": 42, + "a̞˥": 43, + "ei˧˩̰": 44, + "ou˨": 45, + "ɔ̽˧": 46, + "ɐi˧˩̰": 47, + "u˧": 48, + "ɔːĭ˥": 49, + "ɐu˨": 50, + "ei˧˥": 51, + "ɐi˨": 52, + "ʊ̟˧˩̰": 53, + "ʊ̟˨": 54, + "a̞˩˧": 55, + "ou˧˥": 56, + "aːĭ˧˥": 57, + "ɔ̽˨": 58, + "ɛ˩˧": 59, + "ɪ̞˨": 60, + "iːŭ˧": 61, + "ɛ˧˩̰": 62, + "m̩˧˩̰": 63, + "ɵ˧˥": 64, + "ei˧": 65, + "ɐu˧˩̰": 66, + "m̩˧": 67, + "ɐ˧˥": 68, + "ɐu˩˧": 69, + "ɐi˥": 70, + "ɔ̽˥": 71, + "ɔ̽˧˩̰": 72, + "ɔːĭ˧": 73, + "ou˩˧": 74, + "m̩˥": 75, + "ɐ˧": 76, + "tsʰ": 77, + "ɛ˧˥": 78, + "i˧˥": 79, + "ɔ̽˩˧": 80, + "kʰ": 81, + "ɐ˧˩̰": 82, + "aːŭ˧˥": 83, + "pʰ": 84, + "aːĭ˧˩̰": 85, + "ɵy˩˧": 86, + "ɛ˧": 87, + "u˧˥": 88, + "ɛ˨": 89, + "ʊ̟˧": 90, + "u˥": 91, + "m̩˩˧": 92, + "aːŭ˧": 93, + "œ̞˩˧": 94, + "i˩˧": 95, + "ɪ̞˧˩̰": 96, + "u˨": 97, + "ɪ̞˥": 98, + "iːŭ˧˩̰": 99, + "œ̞˧˥": 100, + "y˧": 101, + "uːĭ˩˧": 102, + "uːĭ˥": 103, + "ɵy˧˥": 104, + "y˧˩̰": 105, + "ɔːĭ˧˥": 106, + "ɛ": 107, + "ou˧": 108, + "ei˨": 109, + "ɵ˥": 110, + "u˧˩̰": 111, + "y˥": 112, + "œ̞˥": 113, + "œ̞˧˩̰": 114, + "aːĭ˨": 115, + "ɐ˩˧": 116, + "œ̞˧": 117, + "uːĭ˧˥": 118, + "ɐu˧": 119, + "ɐi˩˧": 120, + "ɐi˧": 121, + "ou˧˩̰": 122, + "aːĭ˥": 123, + "aːŭ˥": 124, + "ŋ˩˧": 125, + "y˧˥": 126, + "iːŭ˥": 127, + "ɔːĭ˨": 128, + "ʊ̟˧˥": 129, + "iːŭ˧˥": 130, + "ɵy˥": 131, + "ɔːĭ˧˩̰": 132, + "uːĭ˧": 133, + "ɵy˧˩̰": 134, + "œ̞˨": 135, + "m̩˨": 136, + "aːŭ˧˩̰": 137, + "y˩˧": 138, + "aːŭ˩˧": 139, + "aːĭ˩˧": 140, + "uːĭ˨": 141, + "ɵy˨": 142, + "aːŭ˨": 143, + "ɪ̞˧": 144, + "ɵ˨": 145, + "iːŭ˩˧": 146, + "iːŭ˨": 147, + "ɵ˧˩̰": 148, + "uːĭ˧˩̰": 149, + "u˩˧": 150, + "ŋ˧˩̰": 151 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Cantonese/tokenizer_config.json b/Cantonese/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Cantonese/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Cantonese/training_args.bin b/Cantonese/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f45a22c6c8a3085a8eda842a3f4d99e5f0a3805 --- /dev/null +++ b/Cantonese/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b843b2d3e207797e1396680aa7485a6a0bdcef4077450466fa42f9c477b48e +size 5368 diff --git a/Cantonese/vocab.json b/Cantonese/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..e1046969f94bc52e30222072b560ca895d699c9c --- /dev/null +++ b/Cantonese/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"j":4,"ɐ˥":5,"t":6,"k":7,"ɐu˧˥":8,"i˨":9,"n":10,"i˧˩̰":11,"y˨":12,"s":13,"ɐ˨":14,"p":15,"ts":16,"ɐu˥":17,"ɪ̞˧˥":18,"ŋ":19,"ɵ˧":20,"a̞˧":21,"l":22,"ʊ̟˥":23,"a̞˧˩̰":24,"ɛ˥":25,"ei˩˧":26,"w":27,"a̞˨":28,"ɐi˧˥":29,"a̞˧˥":30,"m̩˧˥":31,"m":32,"ou˥":33,"ei˥":34,"i˧":35,"ɔ̽˧˥":36,"tʰ":37,"i˥":38,"f":39,"aːĭ˧":40,"h":41,"ɵy˧":42,"a̞˥":43,"ei˧˩̰":44,"ou˨":45,"ɔ̽˧":46,"ɐi˧˩̰":47,"u˧":48,"ɔːĭ˥":49,"ɐu˨":50,"ei˧˥":51,"ɐi˨":52,"ʊ̟˧˩̰":53,"ʊ̟˨":54,"a̞˩˧":55,"ou˧˥":56,"aːĭ˧˥":57,"ɔ̽˨":58,"ɛ˩˧":59,"ɪ̞˨":60,"iːŭ˧":61,"ɛ˧˩̰":62,"m̩˧˩̰":63,"ɵ˧˥":64,"ei˧":65,"ɐu˧˩̰":66,"m̩˧":67,"ɐ˧˥":68,"ɐu˩˧":69,"ɐi˥":70,"ɔ̽˥":71,"ɔ̽˧˩̰":72,"ɔːĭ˧":73,"ou˩˧":74,"m̩˥":75,"ɐ˧":76,"tsʰ":77,"ɛ˧˥":78,"i˧˥":79,"ɔ̽˩˧":80,"kʰ":81,"ɐ˧˩̰":82,"aːŭ˧˥":83,"pʰ":84,"aːĭ˧˩̰":85,"ɵy˩˧":86,"ɛ˧":87,"u˧˥":88,"ɛ˨":89,"ʊ̟˧":90,"u˥":91,"m̩˩˧":92,"aːŭ˧":93,"œ̞˩˧":94,"i˩˧":95,"ɪ̞˧˩̰":96,"u˨":97,"ɪ̞˥":98,"iːŭ˧˩̰":99,"œ̞˧˥":100,"y˧":101,"uːĭ˩˧":102,"uːĭ˥":103,"ɵy˧˥":104,"y˧˩̰":105,"ɔːĭ˧˥":106,"ɛ":107,"ou˧":108,"ei˨":109,"ɵ˥":110,"u˧˩̰":111,"y˥":112,"œ̞˥":113,"œ̞˧˩̰":114,"aːĭ˨":115,"ɐ˩˧":116,"œ̞˧":117,"uːĭ˧˥":118,"ɐu˧":119,"ɐi˩˧":120,"ɐi˧":121,"ou˧˩̰":122,"aːĭ˥":123,"aːŭ˥":124,"ŋ˩˧":125,"y˧˥":126,"iːŭ˥":127,"ɔːĭ˨":128,"ʊ̟˧˥":129,"iːŭ˧˥":130,"ɵy˥":131,"ɔːĭ˧˩̰":132,"uːĭ˧":133,"ɵy˧˩̰":134,"œ̞˨":135,"m̩˨":136,"aːŭ˧˩̰":137,"y˩˧":138,"aːŭ˩˧":139,"aːĭ˩˧":140,"uːĭ˨":141,"ɵy˨":142,"aːŭ˨":143,"ɪ̞˧":144,"ɵ˨":145,"iːŭ˩˧":146,"iːŭ˨":147,"ɵ˧˩̰":148,"uːĭ˧˩̰":149,"u˩˧":150,"ŋ˧˩̰":151} \ No newline at end of file diff --git a/Dutch/config.json b/Dutch/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fcad897f909f0d55043f19ebae5277157fbff30d --- /dev/null +++ b/Dutch/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 50 +} diff --git 
a/Dutch/generation_config.json b/Dutch/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/Dutch/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/Dutch/model.safetensors b/Dutch/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8676143d8e263bbe8bce9b90985796af75347ce9 --- /dev/null +++ b/Dutch/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d997b37f113b5e7583ce581f013938c4cb1f2d94d081bc4d7d1cfca9f09b1acc +size 3335080 diff --git a/Dutch/special_tokens_map.json b/Dutch/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/Dutch/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/Dutch/tokenizer.json b/Dutch/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..8301a1f0cd6e7d4df60263df3ad8cf94f491363a --- /dev/null +++ b/Dutch/tokenizer.json @@ -0,0 +1,167 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "z": 4, + "oː": 5, + "j": 6, + "ãː": 7, + "ɦ": 8, + "ɾ": 9, + "d": 10, + "i": 11, + "ɛ": 12, + "p": 13, + "ɪ": 14, + "k": 15, + "ɑ": 16, + "l": 17, + "ɛː": 18, + "n": 19, + "s": 20, + "v": 21, + "ə": 22, + "ɛi": 23, + 
"ʋ": 24, + "t": 25, + "m": 26, + "ɣ": 27, + "ʏ": 28, + "ɔ": 29, + "x": 30, + "u": 31, + "f": 32, + "ŋ": 33, + "øː": 34, + "b": 35, + "ɔː": 36, + "ʌu": 37, + "y": 38, + "œy": 39, + "tʲ": 40, + "w": 41, + "ʃ": 42, + "t̠ʃ": 43, + "ɲ": 44, + "ʒ": 45, + "iː": 46, + "ɡ": 47, + "d̠ʒ": 48, + "ã": 49 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Dutch/tokenizer_config.json b/Dutch/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Dutch/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Dutch/training_args.bin b/Dutch/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0a3ffdbd53fe2fae9c26dd1b54081f2f467a683f --- /dev/null +++ b/Dutch/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3079aa2abf5484337ccf8ff4e6bddef13c05a4e0558bf07f5790c65d85b7e185 +size 5368 diff --git a/Dutch/vocab.json b/Dutch/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..e32f88f0726392ee8f6b60a3baa5159263318ee0 --- /dev/null +++ b/Dutch/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"z":4,"oː":5,"j":6,"ãː":7,"ɦ":8,"ɾ":9,"d":10,"i":11,"ɛ":12,"p":13,"ɪ":14,"k":15,"ɑ":16,"l":17,"ɛː":18,"n":19,"s":20,"v":21,"ə":22,"ɛi":23,"ʋ":24,"t":25,"m":26,"ɣ":27,"ʏ":28,"ɔ":29,"x":30,"u":31,"f":32,"ŋ":33,"øː":34,"b":35,"ɔː":36,"ʌu":37,"y":38,"œy":39,"tʲ":40,"w":41,"ʃ":42,"t̠ʃ":43,"ɲ":44,"ʒ":45,"iː":46,"ɡ":47,"d̠ʒ":48,"ã":49} \ No newline at end of file diff --git a/EnglishNA/config.json b/EnglishNA/config.json new file mode 100644 index 0000000000000000000000000000000000000000..dd60edb6379a51087a604b67f14f4773c7a53482 --- /dev/null +++ b/EnglishNA/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 47 +} diff --git a/EnglishNA/generation_config.json b/EnglishNA/generation_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/EnglishNA/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/EnglishNA/model.safetensors b/EnglishNA/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d409f2f5830881f3616bc1ebf78e5c2c7fee9e33 --- /dev/null +++ b/EnglishNA/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:273fc2318bc0e4e81c4a414587514493ce5f67caf190c1c9cd3e8967a3eade00 +size 3333544 diff --git a/EnglishNA/special_tokens_map.json b/EnglishNA/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/EnglishNA/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/EnglishNA/tokenizer.json b/EnglishNA/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..98076b74d9141290fbe947eb0a0efefcfc5bc2f3 --- /dev/null +++ b/EnglishNA/tokenizer.json @@ -0,0 +1,164 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "d̠ʒ": 4, + "ʌ": 5, + "s": 6, + "t": 7, + "l": 8, + "aɪ": 9, + "k": 10, + "j": 11, + "ʊ": 12, + "ɹ": 13, + "b": 14, + "æ": 15, + "h": 16, + "oʊ": 17, + "m": 18, + "iː": 19, + "ð": 20, + "ɛ": 21, + "z": 22, + "f": 23, + "eɪ": 24, + "w": 25, + "ɪ": 26, + "ɡ": 27, + 
"ɑ": 28, + "ə": 29, + "p": 30, + "uː": 31, + "i": 32, + "θ": 33, + "ŋ": 34, + "ɔ": 35, + "ɔɪ": 36, + "n": 37, + "d": 38, + "aʊ": 39, + "v": 40, + "ɜː": 41, + "t̠ʃ": 42, + "ʃ": 43, + "iə": 44, + "ʒ": 45, + "x": 46 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/EnglishNA/tokenizer_config.json b/EnglishNA/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/EnglishNA/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/EnglishNA/training_args.bin b/EnglishNA/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..92ad05d6704606a237f4aded10bbaa0e8d4b43c5 --- /dev/null +++ b/EnglishNA/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ee949648c57d5e1171f9887784b0ac9d35ee1311228155f45271dc332aaa7df +size 5368 diff --git a/EnglishNA/vocab.json b/EnglishNA/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..346aae7d3e04813f9ede8bd9a963ffa93b42f59b --- /dev/null +++ b/EnglishNA/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"d̠ʒ":4,"ʌ":5,"s":6,"t":7,"l":8,"aɪ":9,"k":10,"j":11,"ʊ":12,"ɹ":13,"b":14,"æ":15,"h":16,"oʊ":17,"m":18,"iː":19,"ð":20,"ɛ":21,"z":22,"f":23,"eɪ":24,"w":25,"ɪ":26,"ɡ":27,"ɑ":28,"ə":29,"p":30,"uː":31,"i":32,"θ":33,"ŋ":34,"ɔ":35,"ɔɪ":36,"n":37,"d":38,"aʊ":39,"v":40,"ɜː":41,"t̠ʃ":42,"ʃ":43,"iə":44,"ʒ":45,"x":46} \ No newline at end of file diff --git a/EnglishUK/config.json b/EnglishUK/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c5e9a90e5e72a69799610de90dbe89f5dc53295a --- /dev/null +++ b/EnglishUK/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 51 +} diff --git a/EnglishUK/generation_config.json b/EnglishUK/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- 
/dev/null +++ b/EnglishUK/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/EnglishUK/model.safetensors b/EnglishUK/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6a2c261edc53f88397d058c03bea59030f66feda --- /dev/null +++ b/EnglishUK/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e4c9057fadb4c7c4b6f32215fb526998cb95f5fd1011bd83a683f6f8a103cca +size 3335592 diff --git a/EnglishUK/special_tokens_map.json b/EnglishUK/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/EnglishUK/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/EnglishUK/tokenizer.json b/EnglishUK/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..cc19c38fcc9ecbc578817f1093baceee41e44cb0 --- /dev/null +++ b/EnglishUK/tokenizer.json @@ -0,0 +1,168 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "ð": 4, + "æ": 5, + "tʰ": 6, + "ɡ": 7, + "ʊ": 8, + "d": 9, + "ɑː": 10, + "l": 11, + "ɪ": 12, + "n": 13, + "eɪ": 14, + "t̠ʃ": 15, + "w": 16, + "ɒ": 17, + "ʌ": 18, + "z": 19, + "m": 20, + "iː": 21, + "aɪ": 22, + "h": 23, + "e": 24, + "kʰ": 25, + "s": 26, + "ə": 27, + "ɔː": 28, + "ɹ": 29, + "i": 30, + "əʊ": 31, + "uː": 32, + "j": 33, + "ɪə": 34, + "ɔɪ": 
35, + "v": 36, + "f": 37, + "ɜː": 38, + "b": 39, + "pʰ": 40, + "d̠ʒ": 41, + "ɐ": 42, + "eə": 43, + "ʃ": 44, + "θ": 45, + "ŋ": 46, + "aʊ": 47, + "ʊə": 48, + "n̩": 49, + "ʒ": 50 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/EnglishUK/tokenizer_config.json b/EnglishUK/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/EnglishUK/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/EnglishUK/training_args.bin b/EnglishUK/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f98954f71d6bb5f99e45821bb65671ccdcd8a04 --- /dev/null +++ b/EnglishUK/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:383bf2e423c7742c010ffe46113f10f008fc82558484de49464279c0dcf24a23 +size 5368 diff --git a/EnglishUK/vocab.json b/EnglishUK/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..fa1afb7415edd0a141f1a813765ac30449e0a552 --- /dev/null +++ b/EnglishUK/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"ð":4,"æ":5,"tʰ":6,"ɡ":7,"ʊ":8,"d":9,"ɑː":10,"l":11,"ɪ":12,"n":13,"eɪ":14,"t̠ʃ":15,"w":16,"ɒ":17,"ʌ":18,"z":19,"m":20,"iː":21,"aɪ":22,"h":23,"e":24,"kʰ":25,"s":26,"ə":27,"ɔː":28,"ɹ":29,"i":30,"əʊ":31,"uː":32,"j":33,"ɪə":34,"ɔɪ":35,"v":36,"f":37,"ɜː":38,"b":39,"pʰ":40,"d̠ʒ":41,"ɐ":42,"eə":43,"ʃ":44,"θ":45,"ŋ":46,"aʊ":47,"ʊə":48,"n̩":49,"ʒ":50} \ No newline at end of file diff --git a/Estonian/config.json b/Estonian/config.json new file mode 100644 index 0000000000000000000000000000000000000000..16a39b692835fd9add37526632d6855ff2d6f33d --- /dev/null +++ b/Estonian/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 68 +} diff --git a/Estonian/generation_config.json b/Estonian/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- 
/dev/null +++ b/Estonian/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/Estonian/model.safetensors b/Estonian/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a7e301b92451c163a0fa03318301d9d2155e801 --- /dev/null +++ b/Estonian/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afee2c8ad18b21e611292f658db3801ecafae03015168233fc9a039f2175e0c8 +size 3344296 diff --git a/Estonian/special_tokens_map.json b/Estonian/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/Estonian/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/Estonian/tokenizer.json b/Estonian/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..bacb89b3d32ff26618f7e63715ad993083909ff6 --- /dev/null +++ b/Estonian/tokenizer.json @@ -0,0 +1,185 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "n": 4, + "o": 5, + "t": 6, + "ʃ": 7, + "a": 8, + "uː": 9, + "m": 10, + "u": 11, + "tʲ": 12, + "i": 13, + "s": 14, + "eː": 15, + "d": 16, + "iː": 17, + "k": 18, + "ɡ": 19, + "ɑ": 20, + "ɤ": 21, + "ʊ": 22, + "sʲ": 23, + "j": 24, + "aː": 25, + "h": 26, + "v": 27, + "æi": 28, + "kː": 29, + "e": 30, + "ɪ": 31, + "tː": 32, + "r": 33, + "ɛ": 34, + "mː": 35, + "p": 36, 
+ "sː": 37, + "æ": 38, + "l": 39, + "pː": 40, + "yː": 41, + "æː": 42, + "b": 43, + "ɔ": 44, + "ɤː": 45, + "lː": 46, + "ø": 47, + "øː": 48, + "ŋ": 49, + "y": 50, + "oː": 51, + "rː": 52, + "ɲ": 53, + "nː": 54, + "w": 55, + "tʲː": 56, + "øɪ̯": 57, + "f": 58, + "dʲ": 59, + "sʲː": 60, + "t̠ʃ": 61, + "ʃː": 62, + "ʒ": 63, + "z": 64, + "fː": 65, + "dː": 66, + "yi": 67 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Estonian/tokenizer_config.json b/Estonian/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Estonian/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Estonian/training_args.bin b/Estonian/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..ce3b3a5ed6bf2f1a4fda251773a2a1c22a99b4b8 --- /dev/null +++ b/Estonian/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d97e5ae5878d9838daf01a6d58ddfa4f068f3912a2fe181a8f6bf2e3d1465a3 +size 5368 diff --git a/Estonian/vocab.json b/Estonian/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..f1616a438e1ce75a34684c1c7c797df4df2762e9 --- /dev/null +++ b/Estonian/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"n":4,"o":5,"t":6,"ʃ":7,"a":8,"uː":9,"m":10,"u":11,"tʲ":12,"i":13,"s":14,"eː":15,"d":16,"iː":17,"k":18,"ɡ":19,"ɑ":20,"ɤ":21,"ʊ":22,"sʲ":23,"j":24,"aː":25,"h":26,"v":27,"æi":28,"kː":29,"e":30,"ɪ":31,"tː":32,"r":33,"ɛ":34,"mː":35,"p":36,"sː":37,"æ":38,"l":39,"pː":40,"yː":41,"æː":42,"b":43,"ɔ":44,"ɤː":45,"lː":46,"ø":47,"øː":48,"ŋ":49,"y":50,"oː":51,"rː":52,"ɲ":53,"nː":54,"w":55,"tʲː":56,"øɪ̯":57,"f":58,"dʲ":59,"sʲː":60,"t̠ʃ":61,"ʃː":62,"ʒ":63,"z":64,"fː":65,"dː":66,"yi":67} \ No newline at end of file diff --git a/French/config.json b/French/config.json new file mode 100644 index 0000000000000000000000000000000000000000..719026e1b1488a4df4bfa34cd542831fb150db84 --- /dev/null +++ b/French/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + 
"torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 39 +} diff --git a/French/generation_config.json b/French/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/French/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/French/model.safetensors b/French/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e815047ff3b60f9e1ebe95d1a99398570c5aeb07 --- /dev/null +++ b/French/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6990ba97ea1ba903cc98ed8d71dcbf40cb7016357e45128a75f085d79837922a +size 3329448 diff --git a/French/special_tokens_map.json b/French/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/French/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/French/tokenizer.json b/French/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..8369a41339817f9ef945761844d45326b7f8500b --- /dev/null +++ b/French/tokenizer.json @@ -0,0 +1,156 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "m": 4, + "a": 5, + "ɑ̃": 6, + "d": 7, + "ɔ": 8, + "n": 9, + "b": 10, + "ʁ": 11, + "ə": 
12, + "ɡ": 13, + "ʒ": 14, + "i": 15, + "v": 16, + "t": 17, + "k": 18, + "o": 19, + "ɛ̃": 20, + "w": 21, + "y": 22, + "j": 23, + "e": 24, + "ɔ̃": 25, + "p": 26, + "ɛ": 27, + "f": 28, + "s": 29, + "z": 30, + "l": 31, + "u": 32, + "ʃ": 33, + "œ": 34, + "ø": 35, + "ɲ": 36, + "t̠ʃ": 37, + "d̠ʒ": 38 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/French/tokenizer_config.json b/French/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/French/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/French/training_args.bin b/French/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5839052ec6eeebd23b5e0efb4d57c7cd190caccb --- /dev/null +++ b/French/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19db843d5235caa94bee0fa15a7d2f0b4c32bb2fdb0e11a2352df8221d9016b9 +size 5368 diff --git a/French/vocab.json b/French/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..2471ecaab8cd646a1139ff621f701ce8867685a9 --- /dev/null +++ b/French/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"m":4,"a":5,"ɑ̃":6,"d":7,"ɔ":8,"n":9,"b":10,"ʁ":11,"ə":12,"ɡ":13,"ʒ":14,"i":15,"v":16,"t":17,"k":18,"o":19,"ɛ̃":20,"w":21,"y":22,"j":23,"e":24,"ɔ̃":25,"p":26,"ɛ":27,"f":28,"s":29,"z":30,"l":31,"u":32,"ʃ":33,"œ":34,"ø":35,"ɲ":36,"t̠ʃ":37,"d̠ʒ":38} \ No newline at end of file diff --git a/German/config.json b/German/config.json new file mode 100644 index 0000000000000000000000000000000000000000..85e21f1051a823fedc981754cf859a3a5f4fc427 --- /dev/null +++ b/German/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 45 +} diff --git a/German/generation_config.json b/German/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ 
b/German/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/German/model.safetensors b/German/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4f24387b8cdef625fabf805f7cc8d80b546640d1 --- /dev/null +++ b/German/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f7ded997beed5dbd7f23492537240c1e32a67a9e425f84b31ed70436c3fd0aa +size 3332520 diff --git a/German/special_tokens_map.json b/German/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/German/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/German/tokenizer.json b/German/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..bb84b037c35c509567e630f5f2d95fcc3a4e8acc --- /dev/null +++ b/German/tokenizer.json @@ -0,0 +1,162 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "aː": 4, + "oː": 5, + "a": 6, + "b": 7, + "x": 8, + "v": 9, + "øː": 10, + "n": 11, + "ɛː": 12, + "f": 13, + "l": 14, + "iː": 15, + "yː": 16, + "j": 17, + "uː": 18, + "h": 19, + "ʊ": 20, + "m": 21, + "ɔ": 22, + "ɪ": 23, + "eː": 24, + "ə": 25, + "d̺": 26, + "t̺ʰ": 27, + "ɛ": 28, + "ŋ": 29, + "ç": 30, + "œ": 31, + "kʰ": 32, + "ʀ": 33, + "ɡ": 34, + "pʰ": 35, + "ʏ": 36, + "s": 37, + "z": 38, + "ts": 
39, + "ʃ": 40, + "ɐ": 41, + "pf": 42, + "t̠ʃ": 43, + "d̠ʒ": 44 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/German/tokenizer_config.json b/German/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/German/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/German/training_args.bin b/German/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..94d3d73d6f1efbec284333893afa614957bb2ff7 --- /dev/null +++ b/German/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:008ba5e08b084dbc05c812b1e7e07e76ae16252dab079e1c24b493865397ef99 +size 5368 diff --git a/German/vocab.json b/German/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..9986a47024c76582fbfeb1af3b23d5c6084084f1 --- /dev/null +++ b/German/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"aː":4,"oː":5,"a":6,"b":7,"x":8,"v":9,"øː":10,"n":11,"ɛː":12,"f":13,"l":14,"iː":15,"yː":16,"j":17,"uː":18,"h":19,"ʊ":20,"m":21,"ɔ":22,"ɪ":23,"eː":24,"ə":25,"d̺":26,"t̺ʰ":27,"ɛ":28,"ŋ":29,"ç":30,"œ":31,"kʰ":32,"ʀ":33,"ɡ":34,"pʰ":35,"ʏ":36,"s":37,"z":38,"ts":39,"ʃ":40,"ɐ":41,"pf":42,"t̠ʃ":43,"d̠ʒ":44} \ No newline at end of file diff --git a/Indonesian/config.json b/Indonesian/config.json new file mode 100644 index 0000000000000000000000000000000000000000..02934c77ba0fb31105b3152d064fac039e8f71ed --- /dev/null +++ b/Indonesian/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 31 +} diff --git a/Indonesian/generation_config.json b/Indonesian/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/Indonesian/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} 
diff --git a/Indonesian/model.safetensors b/Indonesian/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9be489fe759361bb163330aaeb41b6e943a55eb --- /dev/null +++ b/Indonesian/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c17f1c079a5a700fcbec9e67ba0cc9dea72c6f6e327c7e8c7c675121f52a8052 +size 3325352 diff --git a/Indonesian/special_tokens_map.json b/Indonesian/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/Indonesian/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/Indonesian/tokenizer.json b/Indonesian/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f47cee8f3938210bf990747730d1470d052cec1a --- /dev/null +++ b/Indonesian/tokenizer.json @@ -0,0 +1,148 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "s": 4, + "i": 5, + "n": 6, + "m": 7, + "a": 8, + "j": 9, + "u": 10, + "k": 11, + "o": 12, + "h": 13, + "l": 14, + "t": 15, + "w": 16, + "d̠ʒ": 17, + "ŋ": 18, + "ə": 19, + "d": 20, + "p": 21, + "ɡ": 22, + "b": 23, + "r": 24, + "ɲ": 25, + "t̠ʃ": 26, + "f": 27, + "z": 28, + "ʃ": 29, + "x": 30 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Indonesian/tokenizer_config.json b/Indonesian/tokenizer_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Indonesian/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Indonesian/training_args.bin b/Indonesian/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..55fe5638c9dbb558039c029dcdf0a7f2a4d42919 --- /dev/null +++ b/Indonesian/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caea86d724da6be35d5e9c7c445c61d52878a266b62c04b2bbe327289ab86046 +size 5368 diff --git a/Indonesian/vocab.json b/Indonesian/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..8b94ac5a898bb0640a5ca1123876126507b5f5ca --- /dev/null +++ b/Indonesian/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"s":4,"i":5,"n":6,"m":7,"a":8,"j":9,"u":10,"k":11,"o":12,"h":13,"l":14,"t":15,"w":16,"d̠ʒ":17,"ŋ":18,"ə":19,"d":20,"p":21,"ɡ":22,"b":23,"r":24,"ɲ":25,"t̠ʃ":26,"f":27,"z":28,"ʃ":29,"x":30} \ No newline at end of file diff --git a/Italian/config.json b/Italian/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bb213df66cd9025783af6a73e44e96f0c57e5f35 --- /dev/null +++ b/Italian/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 57 +} diff --git a/Italian/generation_config.json b/Italian/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/Italian/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/Italian/model.safetensors b/Italian/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0c83998a7a7e9318472bbd22e20c306c441ae78a --- /dev/null +++ b/Italian/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ad24e994fea834a0fe4adcf51f44fae6242723d42206afe962ad62fd608bef3f +size 3338664 diff --git a/Italian/special_tokens_map.json b/Italian/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/Italian/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/Italian/tokenizer.json b/Italian/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..2840c8268845111e2326e21631ccef7b62599f94 --- /dev/null +++ b/Italian/tokenizer.json @@ -0,0 +1,174 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "ɛ": 4, + "kː": 5, + "o": 6, + "pː": 7, + "l": 8, + "ɐ": 9, + "n": 10, + "i": 11, + "m": 12, + "k": 13, + "s": 14, + "t": 15, + "ɔ": 16, + "z": 17, + "f": 18, + "v": 19, + "e": 20, + "d": 21, + "j": 22, + "t̠ʃ": 23, + "b": 24, + "w": 25, + "ɛː": 26, + "p": 27, + "r": 28, + "u": 29, + "ɡ": 30, + "ʎ": 31, + "d̠ʒ": 32, + "tː": 33, + "ɐː": 34, + "ts": 35, + "dː": 36, + "oː": 37, + "iː": 38, + "sː": 39, + "t̠ʃː": 40, + "ɾ": 41, + "eː": 42, + "dz": 43, + "bː": 44, + "d̠ʒː": 45, + "ɲ": 46, + "tsː": 47, + "ʃ": 48, + "a": 49, + "ɔː": 50, + "dzː": 51, + "ŋ": 52, + "h": 53, + "uː": 54, + "ɡː": 55, + "ʒ": 56 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Italian/tokenizer_config.json b/Italian/tokenizer_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Italian/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Italian/training_args.bin b/Italian/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bd52a8104474947186bc662eed88b827986ab05f --- /dev/null +++ b/Italian/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6b77087cbc46af77ee000072dc89a05da4c6ae6bb826a3b4fb86fa89f5807d0 +size 5368 diff --git a/Italian/vocab.json b/Italian/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..38956f9bd2a248f9d372e745325ea39cae92e8b5 --- /dev/null +++ b/Italian/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"ɛ":4,"kː":5,"o":6,"pː":7,"l":8,"ɐ":9,"n":10,"i":11,"m":12,"k":13,"s":14,"t":15,"ɔ":16,"z":17,"f":18,"v":19,"e":20,"d":21,"j":22,"t̠ʃ":23,"b":24,"w":25,"ɛː":26,"p":27,"r":28,"u":29,"ɡ":30,"ʎ":31,"d̠ʒ":32,"tː":33,"ɐː":34,"ts":35,"dː":36,"oː":37,"iː":38,"sː":39,"t̠ʃː":40,"ɾ":41,"eː":42,"dz":43,"bː":44,"d̠ʒː":45,"ɲ":46,"tsː":47,"ʃ":48,"a":49,"ɔː":50,"dzː":51,"ŋ":52,"h":53,"uː":54,"ɡː":55,"ʒ":56} \ No newline at end of file diff --git a/Japanese/config.json b/Japanese/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0dd28349a2f1565d9b446e9b293b749625be192e --- /dev/null +++ b/Japanese/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 40 +} diff --git a/Japanese/generation_config.json b/Japanese/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/Japanese/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/Japanese/model.safetensors b/Japanese/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..77e12fbd6069ab00fdde9f345d477e402ab37ebb --- /dev/null +++ b/Japanese/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b1c4f25cc349f566993ad0a319c30db3b3b3b12f993b8533bfdddec62f93c7 +size 3329960 diff --git a/Japanese/special_tokens_map.json b/Japanese/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/Japanese/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/Japanese/tokenizer.json b/Japanese/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..95858452a2c7bd2841da3e92ce16ef8b8f434396 --- /dev/null +++ b/Japanese/tokenizer.json @@ -0,0 +1,157 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "kʲ": 4, + "aː": 5, + "o": 6, + "ts": 7, + "ɯ": 8, + "k": 9, + "a": 10, + "i": 11, + "w": 12, + "d̠ʒ": 13, + "t": 14, + "e": 15, + "n": 16, + "ʃ": 17, + "d": 18, + "b": 19, + "s": 20, + "m": 21, + "h": 22, + "ɾ": 23, + "t̠ʃ": 24, + "ɯː": 25, + "p": 26, + "j": 27, + "ɡʲ": 28, + "ɸ": 29, + "ɡ": 30, + "oː": 31, + "ɲ": 32, + "z": 33, + "eː": 34, + "pʲ": 35, + "ɾʲ": 36, + "ç": 37, + "bʲ": 38, + "mʲ": 39 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Japanese/tokenizer_config.json b/Japanese/tokenizer_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Japanese/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Japanese/training_args.bin b/Japanese/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..b50532652b01b5c31d7f579170c61a2feb428a83 --- /dev/null +++ b/Japanese/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6daa3a9b50f0b73a7be15ed11377d3597044b00dd21b79d84a223e2a9aed98a +size 5368 diff --git a/Japanese/vocab.json b/Japanese/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..9b568f9d25f492a50cb1cb57b6006e226177fc38 --- /dev/null +++ b/Japanese/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"kʲ":4,"aː":5,"o":6,"ts":7,"ɯ":8,"k":9,"a":10,"i":11,"w":12,"d̠ʒ":13,"t":14,"e":15,"n":16,"ʃ":17,"d":18,"b":19,"s":20,"m":21,"h":22,"ɾ":23,"t̠ʃ":24,"ɯː":25,"p":26,"j":27,"ɡʲ":28,"ɸ":29,"ɡ":30,"oː":31,"ɲ":32,"z":33,"eː":34,"pʲ":35,"ɾʲ":36,"ç":37,"bʲ":38,"mʲ":39} \ No newline at end of file diff --git a/Korean/config.json b/Korean/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a69528a3980249b70fe7463004897be2a018d282 --- /dev/null +++ b/Korean/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 33 +} diff --git a/Korean/generation_config.json b/Korean/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/Korean/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/Korean/model.safetensors b/Korean/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1a873b31c0fd7cbfb54a2d7786c73a6cbc483ae5 --- /dev/null +++ b/Korean/model.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d6362d6cce27a805981a0243180b87e4a56cda38cc543a373f6f99bd66a16b9e +size 3326376 diff --git a/Korean/special_tokens_map.json b/Korean/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/Korean/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/Korean/tokenizer.json b/Korean/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4cd26b8cba5660e4bd31a22f64bde19546e0953c --- /dev/null +++ b/Korean/tokenizer.json @@ -0,0 +1,150 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "i": 4, + "ɾ": 5, + "ɯ": 6, + "m": 7, + "a": 8, + "u": 9, + "j": 10, + "ɤ̞": 11, + "ɡ": 12, + "ŋ": 13, + "h": 14, + "æ": 15, + "p": 16, + "o": 17, + "dʑ": 18, + "w": 19, + "n̪": 20, + "d": 21, + "e": 22, + "l": 23, + "t̠ʃ": 24, + "b": 25, + "s̪": 26, + "k": 27, + "t̪": 28, + "pʰ": 29, + "kʰ": 30, + "ɯi": 31, + "t̠ʃʰ": 32 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Korean/tokenizer_config.json b/Korean/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Korean/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Korean/training_args.bin b/Korean/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2d5ddedca0f840eab809344732333335e4df93a7 --- /dev/null +++ b/Korean/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a566bf446b904582642ded3988dea8456e1003ff69e24caa212d669737c60ac1 +size 5368 diff --git a/Korean/vocab.json b/Korean/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..fb2494d3b5f743d94db034cf9e76469ac53a564d --- /dev/null +++ b/Korean/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"i":4,"ɾ":5,"ɯ":6,"m":7,"a":8,"u":9,"j":10,"ɤ̞":11,"ɡ":12,"ŋ":13,"h":14,"æ":15,"p":16,"o":17,"dʑ":18,"w":19,"n̪":20,"d":21,"e":22,"l":23,"t̠ʃ":24,"b":25,"s̪":26,"k":27,"t̪":28,"pʰ":29,"kʰ":30,"ɯi":31,"t̠ʃʰ":32} \ No newline at end of file diff --git a/Mandarin/config.json b/Mandarin/config.json new file mode 100644 index 0000000000000000000000000000000000000000..5500fe75a808ed1e200723bc86c9b6c5e217293d --- /dev/null +++ b/Mandarin/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 115 +} diff --git a/Mandarin/generation_config.json b/Mandarin/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/Mandarin/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/Mandarin/model.safetensors b/Mandarin/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0882b9b619ae218be0aa6e06ed4e62c2d7d18423 --- /dev/null +++ b/Mandarin/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589e5db15ec746641aa8735c8725f363b0e712dd69c421c33115c04fa94c3e42 +size 3368360 diff --git a/Mandarin/special_tokens_map.json b/Mandarin/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/Mandarin/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + 
"bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/Mandarin/tokenizer.json b/Mandarin/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..bde39543251170a9ec815f9656e9ac15e296f0e1 --- /dev/null +++ b/Mandarin/tokenizer.json @@ -0,0 +1,232 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "WhitespaceSplit" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "a˧˥": 4, + "u˧˥": 5, + "a˥": 6, + "au": 7, + "n": 8, + "a˥˩": 9, + "ʃ̺": 10, + "ɻ̩˥˩": 11, + "ə˧˥": 12, + "m": 13, + "ɤ": 14, + "p": 15, + "j": 16, + "e˧˥": 17, + "kʰ": 18, + "k": 19, + "ɤ˥˩": 20, + "w": 21, + "o˥": 22, + "t̠ʃ̺ʰ": 23, + "ə˥": 24, + "ŋ": 25, + "t": 26, + "ʊ˥": 27, + "ɕ": 28, + "i": 29, + "a": 30, + "l": 31, + "au˧˩˧": 32, + "x": 33, + "u˧˩˧": 34, + "i˥": 35, + "ei˧˩˧": 36, + "pʰ": 37, + "i˧˥": 38, + "ai˧˥": 39, + "ou˧˩˧": 40, + "ɤ˧˥": 41, + "o˧˩˧": 42, + "tɕ": 43, + "au˥˩": 44, + "ts": 45, + "ə˧˩˧": 46, + "ɤ˥": 47, + "ei˧˥": 48, + "ʊ˧˥": 49, + "i˧˩˧": 50, + "t̠ʃ̺": 51, + "ɻ̩˧˩˧": 52, + "ei˥˩": 53, + "s": 54, + "u˥˩": 55, + "ɹ̪̩": 56, + "ai˥": 57, + "u˥": 58, + "tɕʰ": 59, + "a˧˩˧": 60, + "ai˥˩": 61, + "ɛ˥˩": 62, + "f": 63, + "i˥˩": 64, + "y˥˩": 65, + "au˧˥": 66, + "ɻ": 67, + "ou˥˩": 68, + "e˥": 69, + "tʰ": 70, + "ɹ̪̩˥˩": 71, + "ɛ˧˥": 72, + "au˥": 73, + "ou˧˥": 74, + "e˧˩˧": 75, + "ɛ˥": 76, + "ɻ̩˥": 77, + "ɥ": 78, + "ɹ̪̩˧˩˧": 79, + "ai˧˩˧": 80, + "ou˥": 81, + "o˥˩": 82, + "ɛ˧˩˧": 83, + "ʊ˧˩˧": 84, + "ɔ˥": 85, + "tsʰ": 86, + "ei": 87, + "ə˥˩": 88, + "o": 89, + "ʊ˥˩": 90, + "ou": 91, + "ɤ˧˩˧": 92, + "o˧˥": 93, + "ei˥": 94, + "e˥˩": 95, + "ɚ˧˩˧": 
96, + "y˥": 97, + "ɚ˥˩": 98, + "y˧˥": 99, + "ɻ̩": 100, + "y˧˩˧": 101, + "ɹ̪̩˥": 102, + "ɻ̩˧˥": 103, + "u": 104, + "ə": 105, + "ai": 106, + "ʊ": 107, + "e": 108, + "ɚ˧˥": 109, + "ɔ˥˩": 110, + "ɹ̪̩˧˥": 111, + "ɛ": 112, + "y": 113, + "m˧˥": 114 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Mandarin/tokenizer_config.json b/Mandarin/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Mandarin/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Mandarin/training_args.bin b/Mandarin/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a246201ad0763d43a9eea5502069631b8719acaf --- /dev/null +++ b/Mandarin/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b76ec2a6c0efd8088246682b91c33930b79e5fe565b25a7c8c38239e90272c +size 5368 diff --git a/Mandarin/vocab.json b/Mandarin/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..6bb491736dd51d7b5d7477fa47e059950d7fdb25 --- /dev/null +++ b/Mandarin/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"a˧˥":4,"u˧˥":5,"a˥":6,"au":7,"n":8,"a˥˩":9,"ʃ̺":10,"ɻ̩˥˩":11,"ə˧˥":12,"m":13,"ɤ":14,"p":15,"j":16,"e˧˥":17,"kʰ":18,"k":19,"ɤ˥˩":20,"w":21,"o˥":22,"t̠ʃ̺ʰ":23,"ə˥":24,"ŋ":25,"t":26,"ʊ˥":27,"ɕ":28,"i":29,"a":30,"l":31,"au˧˩˧":32,"x":33,"u˧˩˧":34,"i˥":35,"ei˧˩˧":36,"pʰ":37,"i˧˥":38,"ai˧˥":39,"ou˧˩˧":40,"ɤ˧˥":41,"o˧˩˧":42,"tɕ":43,"au˥˩":44,"ts":45,"ə˧˩˧":46,"ɤ˥":47,"ei˧˥":48,"ʊ˧˥":49,"i˧˩˧":50,"t̠ʃ̺":51,"ɻ̩˧˩˧":52,"ei˥˩":53,"s":54,"u˥˩":55,"ɹ̪̩":56,"ai˥":57,"u˥":58,"tɕʰ":59,"a˧˩˧":60,"ai˥˩":61,"ɛ˥˩":62,"f":63,"i˥˩":64,"y˥˩":65,"au˧˥":66,"ɻ":67,"ou˥˩":68,"e˥":69,"tʰ":70,"ɹ̪̩˥˩":71,"ɛ˧˥":72,"au˥":73,"ou˧˥":74,"e˧˩˧":75,"ɛ˥":76,"ɻ̩˥":77,"ɥ":78,"ɹ̪̩˧˩˧":79,"ai˧˩˧":80,"ou˥":81,"o˥˩":82,"ɛ˧˩˧":83,"ʊ˧˩˧":84,"ɔ˥":85,"tsʰ":86,"ei":87,"ə˥˩":88,"o":89,"ʊ˥˩":90,"ou":91,"ɤ˧˩˧":92,"o˧˥":93,"ei˥":94,"e˥˩":95,"ɚ˧˩˧":96,"y˥":97,"ɚ˥˩":98,"y˧˥":99,"ɻ̩":100,"y˧˩˧":101,"ɹ̪̩˥":102,"ɻ̩˧˥":103,"u":104,"ə":105,"ai":106,"ʊ":107,"e":108,"ɚ˧˥":109,"ɔ˥˩":110,"ɹ̪̩˧˥":111,"ɛ":112,"y":113,"m˧˥":114} \ No newline at end of file diff --git a/Polish/config.json b/Polish/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bc91a25ebc87a9faf03ac3b9d2967a5ade626d99 --- /dev/null +++ b/Polish/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": 
"gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 43 +} diff --git a/Polish/generation_config.json b/Polish/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/Polish/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/Polish/model.safetensors b/Polish/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..69cf12b20cac0cd740bf035da8dc76339e0bd348 --- /dev/null +++ b/Polish/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b20459771bb1fee87bb3b5804f7a95ad2a617414eb0a062b7daba7b517ad42b6 +size 3331496 diff --git a/Polish/special_tokens_map.json b/Polish/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/Polish/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/Polish/tokenizer.json b/Polish/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ae540657a817beb82468694a9cc16d71be0b355 --- /dev/null +++ b/Polish/tokenizer.json @@ -0,0 +1,160 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + 
"type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "e": 4, + "d̪": 5, + "l̪": 6, + "v": 7, + "o": 8, + "w": 9, + "a": 10, + "j": 11, + "b": 12, + "r": 13, + "ɲ": 14, + "i": 15, + "ɕ": 16, + "u": 17, + "x": 18, + "tɕ": 19, + "t̪": 20, + "k": 21, + "p": 22, + "ɨ": 23, + "dʑ": 24, + "z̪": 25, + "n̪": 26, + "f": 27, + "ʑ": 28, + "m": 29, + "z̻": 30, + "s̻": 31, + "t̻s̻": 32, + "t̪s̪": 33, + "ɡ": 34, + "s̪": 35, + "ŋ": 36, + "kʲ": 37, + "t": 38, + "ɡʲ": 39, + "ɣ": 40, + "ẽ": 41, + "d̻z̻": 42 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Polish/tokenizer_config.json b/Polish/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Polish/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Polish/training_args.bin b/Polish/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1dc0478dcf7a4bd1d9a3f83abde379287a58df8 --- /dev/null +++ b/Polish/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5534f70b378c3a6ee4670c511f22cf318da7496ec6736c2f59fb6c73a2387a51 +size 5368 diff --git a/Polish/vocab.json b/Polish/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..87beef69d550700a0998c1d79f275489c7e907e6 --- /dev/null +++ b/Polish/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"e":4,"d̪":5,"l̪":6,"v":7,"o":8,"w":9,"a":10,"j":11,"b":12,"r":13,"ɲ":14,"i":15,"ɕ":16,"u":17,"x":18,"tɕ":19,"t̪":20,"k":21,"p":22,"ɨ":23,"dʑ":24,"z̪":25,"n̪":26,"f":27,"ʑ":28,"m":29,"z̻":30,"s̻":31,"t̻s̻":32,"t̪s̪":33,"ɡ":34,"s̪":35,"ŋ":36,"kʲ":37,"t":38,"ɡʲ":39,"ɣ":40,"ẽ":41,"d̻z̻":42} \ No newline at end of file diff --git a/PortuguesePt/config.json b/PortuguesePt/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a39e6f85aaa626562f4bc56bedabe282df30e1cd --- /dev/null +++ b/PortuguesePt/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + 
"scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 52 +} diff --git a/PortuguesePt/generation_config.json b/PortuguesePt/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/PortuguesePt/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/PortuguesePt/model.safetensors b/PortuguesePt/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0e2a2b50c8c8ea24f93518a2bb1731a0187bffd2 --- /dev/null +++ b/PortuguesePt/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:012956c7ec488f456d963933319a8a4be4ede8de667ea6101358bc5f5bdbded7 +size 3336104 diff --git a/PortuguesePt/special_tokens_map.json b/PortuguesePt/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/PortuguesePt/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/PortuguesePt/tokenizer.json b/PortuguesePt/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..54a51047797a2f7be790018ae09fe210bf194437 --- /dev/null +++ b/PortuguesePt/tokenizer.json @@ -0,0 +1,169 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + 
"UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "ɔ": 4, + "l̪ˠ": 5, + "a": 6, + "p": 7, + "ɐ": 8, + "i": 9, + "n̪": 10, + "e": 11, + "ʃ": 12, + "f": 13, + "ɾ": 14, + "ɐ̃": 15, + "d̪": 16, + "m": 17, + "ʒ": 18, + "b": 19, + "ɯ": 20, + "ɛ": 21, + "ɐ̃i": 22, + "ʁ": 23, + "t̪": 24, + "s": 25, + "o": 26, + "ɐ̃u̜": 27, + "ũ": 28, + "ɡ": 29, + "u": 30, + "k": 31, + "z": 32, + "au̜": 33, + "ai": 34, + "eu̜": 35, + "ɐi": 36, + "ɲ": 37, + "ɛu̜": 38, + "ĩ": 39, + "ũi": 40, + "ɔi": 41, + "õ": 42, + "õi": 43, + "ẽ": 44, + "v": 45, + "oi": 46, + "ʎ": 47, + "iu̜": 48, + "ui": 49, + "ɛi": 50, + "ts": 51 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/PortuguesePt/tokenizer_config.json b/PortuguesePt/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/PortuguesePt/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/PortuguesePt/training_args.bin b/PortuguesePt/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4a713bfe676ac2d29f6fa119f29193475d97d4b --- /dev/null +++ b/PortuguesePt/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3ba98592f042350115cf216a8487d48f62a3490b33c6075c68524c95af0b772 +size 5368 diff --git a/PortuguesePt/vocab.json b/PortuguesePt/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..eed8073e2f1cb4ec378ba0dbe3cc8222008fc393 --- /dev/null +++ b/PortuguesePt/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"ɔ":4,"l̪ˠ":5,"a":6,"p":7,"ɐ":8,"i":9,"n̪":10,"e":11,"ʃ":12,"f":13,"ɾ":14,"ɐ̃":15,"d̪":16,"m":17,"ʒ":18,"b":19,"ɯ":20,"ɛ":21,"ɐ̃i":22,"ʁ":23,"t̪":24,"s":25,"o":26,"ɐ̃u̜":27,"ũ":28,"ɡ":29,"u":30,"k":31,"z":32,"au̜":33,"ai":34,"eu̜":35,"ɐi":36,"ɲ":37,"ɛu̜":38,"ĩ":39,"ũi":40,"ɔi":41,"õ":42,"õi":43,"ẽ":44,"v":45,"oi":46,"ʎ":47,"iu̜":48,"ui":49,"ɛi":50,"ts":51} \ No newline at end of file diff --git a/Serbian/config.json b/Serbian/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c6af98c16dd0b822e565f891674a63ea83f147ee --- /dev/null +++ b/Serbian/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + 
"reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 34 +} diff --git a/Serbian/generation_config.json b/Serbian/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/Serbian/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/Serbian/model.safetensors b/Serbian/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7b55513aaab6b07ccd4b932ddfeaf8345103bca2 --- /dev/null +++ b/Serbian/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aac66bb376ce336eb427227eb6cd3312182d4eee39d3ef842de186d46f1157f +size 3326888 diff --git a/Serbian/special_tokens_map.json b/Serbian/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/Serbian/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/Serbian/tokenizer.json b/Serbian/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..3534dcd8119733ce1c710c573f85d49def54ef3c --- /dev/null +++ b/Serbian/tokenizer.json @@ -0,0 +1,151 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": 
"UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "j": 4, + "e̞": 5, + "s̪̻": 6, + "t̪̻": 7, + "u": 8, + "l": 9, + "o̞": 10, + "ʒ̺": 11, + "i": 12, + "ʋ": 13, + "d̪̻": 14, + "ä": 15, + "m": 16, + "n": 17, + "r": 18, + "k": 19, + "t̪̻s̪̻": 20, + "p": 21, + "ʃ̺": 22, + "x": 23, + "b": 24, + "ɡ": 25, + "t̻ʃ̻": 26, + "f": 27, + "z̪̻": 28, + "ɲ": 29, + "ʎ": 30, + "d̻ʒ̻": 31, + "y": 32, + "w": 33 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Serbian/tokenizer_config.json b/Serbian/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Serbian/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Serbian/training_args.bin b/Serbian/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..566ea99b61832d7bd63378e8f1616a3d20e09076 --- /dev/null +++ b/Serbian/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:488c8febbfaa14ca01f17933b9d429ab6d5c40266f269b01a822ac6ebdfd8ac3 +size 5368 diff --git a/Serbian/vocab.json b/Serbian/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..147646ea2e4397f80112d624f240c9167cd81010 --- /dev/null +++ b/Serbian/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"j":4,"e̞":5,"s̪̻":6,"t̪̻":7,"u":8,"l":9,"o̞":10,"ʒ̺":11,"i":12,"ʋ":13,"d̪̻":14,"ä":15,"m":16,"n":17,"r":18,"k":19,"t̪̻s̪̻":20,"p":21,"ʃ̺":22,"x":23,"b":24,"ɡ":25,"t̻ʃ̻":26,"f":27,"z̪̻":28,"ɲ":29,"ʎ":30,"d̻ʒ̻":31,"y":32,"w":33} \ No newline at end of file diff --git a/Spanish/config.json b/Spanish/config.json new file mode 100644 index 0000000000000000000000000000000000000000..02934c77ba0fb31105b3152d064fac039e8f71ed --- /dev/null +++ b/Spanish/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": 
"4.44.2", + "use_cache": true, + "vocab_size": 31 +} diff --git a/Spanish/generation_config.json b/Spanish/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/Spanish/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git a/Spanish/model.safetensors b/Spanish/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..69d499cf6380fae6d5fed8eda184f6871ce61b83 --- /dev/null +++ b/Spanish/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f63964da519ca1830b2c997baf181b8d8e4dea738bf8b340fe9d663535628fad +size 3325352 diff --git a/Spanish/special_tokens_map.json b/Spanish/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/Spanish/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/Spanish/tokenizer.json b/Spanish/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d78f791ada70b614356e12438ea827961ac57982 --- /dev/null +++ b/Spanish/tokenizer.json @@ -0,0 +1,148 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "a": 4, + "i": 5, + "ɾ": 6, + "e̞": 7, + "n": 8, + "k": 9, + "ɲ": 10, + "o̞": 11, + "m": 12, + "s": 13, + "u": 14, + "p": 15, + 
"d": 16, + "l": 17, + "t": 18, + "β": 19, + "ɡ": 20, + "w": 21, + "ʝ": 22, + "f": 23, + "x": 24, + "j": 25, + "r": 26, + "t̠ʃ": 27, + "ʃ": 28, + "tl": 29, + "ts": 30 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Spanish/tokenizer_config.json b/Spanish/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Spanish/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Spanish/training_args.bin b/Spanish/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a70902103cf3a8e2e772e77a37dbe46375572398 --- /dev/null +++ b/Spanish/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad8d4adefb1dd558e2a83f66390c0fd56f737f80c8bcf7b156f378eb07128831 +size 5368 diff --git a/Spanish/vocab.json b/Spanish/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..e63939d763931eed0ff9e6d10ba29c58a4c91b81 --- /dev/null +++ b/Spanish/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"a":4,"i":5,"ɾ":6,"e̞":7,"n":8,"k":9,"ɲ":10,"o̞":11,"m":12,"s":13,"u":14,"p":15,"d":16,"l":17,"t":18,"β":19,"ɡ":20,"w":21,"ʝ":22,"f":23,"x":24,"j":25,"r":26,"t̠ʃ":27,"ʃ":28,"tl":29,"ts":30} \ No newline at end of file diff --git a/Swedish/config.json b/Swedish/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bc91a25ebc87a9faf03ac3b9d2967a5ade626d99 --- /dev/null +++ b/Swedish/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.3, + "bos_token_id": 3, + "embd_pdrop": 0.3, + "eos_token_id": 3, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_embd": 128, + "n_head": 4, + "n_inner": 512, + "n_layer": 4, + "n_positions": 256, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.3, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": true, + "vocab_size": 43 +} diff --git a/Swedish/generation_config.json b/Swedish/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da394dafe81774e005bdc3d91619d2cdf6b7d30e --- /dev/null +++ b/Swedish/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 3, + "eos_token_id": 3, + "transformers_version": "4.44.2" +} diff --git 
a/Swedish/model.safetensors b/Swedish/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ba7780df01d8934d47fa918dea86538e6c2134e3 --- /dev/null +++ b/Swedish/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43007a93e331ba4632a9909d9a4d27a57fb238b19c084d4defd91dc3dc08c2bd +size 3331496 diff --git a/Swedish/special_tokens_map.json b/Swedish/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd0a22b2fa08a7e8c8a0290d469c6c21da8f813 --- /dev/null +++ b/Swedish/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/Swedish/tokenizer.json b/Swedish/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..91dfc8d529da584e2e51e3f4aff51c09d488e872 --- /dev/null +++ b/Swedish/tokenizer.json @@ -0,0 +1,160 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "ɔ": 4, + "ʝ": 5, + "k": 6, + "l": 7, + "ɛ": 8, + "m": 9, + "d̪": 10, + "e": 11, + "ʉ̟": 12, + "f": 13, + "ɪ": 14, + "ŋ": 15, + "ɹ": 16, + "a": 17, + "n̪": 18, + "iː": 19, + "ɑː": 20, + "ɛː": 21, + "t̪": 22, + "s̪": 23, + "v": 24, + "oː": 25, + "uː": 26, + "eː": 27, + "ʊ": 28, + "p": 29, + "b": 30, + "h": 31, + "øː": 32, + "yː": 33, + "ʂ": 34, + "ɡ": 35, + "ɵ": 36, + "ʃ": 37, + "œ": 38, + "ɕ": 39, + "ʏ": 40, + "ɧ": 41, + "z": 42 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Swedish/tokenizer_config.json 
b/Swedish/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Swedish/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Swedish/training_args.bin b/Swedish/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..099a9e9425379ca6ee0b0ec72c62a30d9f86384f --- /dev/null +++ b/Swedish/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d57029162f8f3ff9d67a8c89c74a50709588ee2408e731f3f49f2a4afdbee4a +size 5368 diff --git a/Swedish/vocab.json b/Swedish/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..373a50086871da331904483c3445e41128b1d40f --- /dev/null +++ b/Swedish/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"ɔ":4,"ʝ":5,"k":6,"l":7,"ɛ":8,"m":9,"d̪":10,"e":11,"ʉ̟":12,"f":13,"ɪ":14,"ŋ":15,"ɹ":16,"a":17,"n̪":18,"iː":19,"ɑː":20,"ɛː":21,"t̪":22,"s̪":23,"v":24,"oː":25,"uː":26,"eː":27,"ʊ":28,"p":29,"b":30,"h":31,"øː":32,"yː":33,"ʂ":34,"ɡ":35,"ɵ":36,"ʃ":37,"œ":38,"ɕ":39,"ʏ":40,"ɧ":41,"z":42} \ No newline at end of file
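Note (not part of the patch): every language directory added above follows the same layout — a small GPT2LMHeadModel (config.json + model.safetensors, 4 layers, 128-dim), a WordLevel tokenizer over IPA phonemes (tokenizer.json / vocab.json) whose TemplateProcessing post-processor prepends UTT_BOUNDARY (id 3, used as both BOS and EOS), plus PAD (id 1) and UNK (id 0) specials. The sketch below shows one way to load and score a checkpoint, assuming a local clone of this repo and the Hugging Face transformers API (>= 4.44, matching the configs); the "Spanish" directory and the phoneme string are arbitrary illustrative choices, and loading via PreTrainedTokenizerFast is a workaround for the GPT2Tokenizer class name in tokenizer_config.json, since these repos ship a WordLevel tokenizer.json rather than a BPE vocab/merges pair.

    # Minimal sketch: load a per-language phoneme LM from this repo and
    # compute surprisal / sample a continuation. Paths and the example
    # phoneme sequence are illustrative, not prescribed by the patch.
    import torch
    from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

    # Load the WordLevel tokenizer directly from tokenizer.json and
    # re-declare the special tokens defined in special_tokens_map.json.
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file="Spanish/tokenizer.json",
        bos_token="UTT_BOUNDARY",  # id 3; also serves as EOS in these configs
        eos_token="UTT_BOUNDARY",
        pad_token="PAD",           # id 1
        unk_token="UNK",           # id 0
    )

    # config.json + model.safetensors in the language directory.
    model = GPT2LMHeadModel.from_pretrained("Spanish")
    model.eval()

    # Input is space-separated IPA phonemes; the tokenizer's
    # TemplateProcessing step prepends UTT_BOUNDARY automatically.
    enc = tokenizer("o̞ l a", return_tensors="pt")
    with torch.no_grad():
        out = model(**enc, labels=enc["input_ids"])
    print(f"mean surprisal: {out.loss.item():.3f} nats/token")

    # Sample a short continuation from the encoded prefix.
    generated = model.generate(
        enc["input_ids"],
        do_sample=True,
        max_new_tokens=20,
        pad_token_id=tokenizer.pad_token_id,
    )
    print(tokenizer.decode(generated[0]))

Because generation_config.json sets eos_token_id to 3, sampling stops early if the model emits another UTT_BOUNDARY; decoded output is a phoneme sequence with WORD_BOUNDARY tokens (id 2) marking word breaks where the training data supplied them.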