kainghour commited on
Commit
ec6963f
·
verified ·
1 Parent(s): 088686d

Upload 7 files

Browse files
README.md CHANGED
@@ -1,3 +1,21 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language: km
4
+ ---
5
+
6
+ # PrahokBART (big-sized model)
7
+
8
+ PrahokBART is a pre-trained sequence-to-sequence model trained from scratch for Khmer using carefully curated Khmer and English corpora. This model was trained considering the linguistic issues of Khmer by incorporating linguistic components such as word segmentation and normalization. This model can be finetuned to build natural language generation applications for Khmer such as English<->Khmer translation, summarization, headline generation, etc. This model is more efficient than mBART50. You can read more about PrahokBART in this [paper](https://aclanthology.org/2025.coling-main.87/).
9
+
10
+ Finetuning code is available on [GitHub](https://github.com/hour/prahokbart).
11
+
12
+ # Citation
13
+ ```bibtex
14
+ @inproceedings{kaing2025prahokbart,
15
+ title={PrahokBART: A Pre-trained Sequence-to-Sequence Model for Khmer Natural Language Generation},
16
+ author={Kaing, Hour and Dabre, Raj and Song, Haiyue and Tran, Van-Hien and Tanaka, Hideki and Utiyama, Masao},
17
+ booktitle={Proceedings of the 31st International Conference on Computational Linguistics},
18
+ pages={1309--1322},
19
+ year={2025}
20
+ }
21
+ ```
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<s>": 32000, "</s>": 32001, "<2km>": 32002, "<2en>": 32003}
config.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "activation_function": "gelu",
4
+ "adaptor_activation_function": "gelu",
5
+ "adaptor_dropout": 0.1,
6
+ "adaptor_hidden_size": 512,
7
+ "adaptor_init_std": 0.02,
8
+ "adaptor_scaling_factor": 1.0,
9
+ "adaptor_tuning": false,
10
+ "additional_source_wait_k": -1,
11
+ "alibi_encoding": false,
12
+ "architectures": [
13
+ "MBartForConditionalGeneration"
14
+ ],
15
+ "asymmetric_alibi_encoding": false,
16
+ "attention_dropout": 0.1,
17
+ "bos_token_id": 32000,
18
+ "bottleneck_mid_fusion_tokens": 4,
19
+ "classifier_dropout": 0.0,
20
+ "d_model": 1024,
21
+ "decoder_adaptor_tying_config": null,
22
+ "decoder_attention_heads": 16,
23
+ "decoder_ffn_dim": 4096,
24
+ "decoder_layerdrop": 0.0,
25
+ "decoder_layers": 6,
26
+ "decoder_tying_config": null,
27
+ "deep_adaptor_tuning": false,
28
+ "deep_adaptor_tuning_ffn_only": false,
29
+ "dropout": 0.1,
30
+ "embed_low_rank_dim": 0,
31
+ "encoder_adaptor_tying_config": null,
32
+ "encoder_attention_heads": 16,
33
+ "encoder_ffn_dim": 4096,
34
+ "encoder_layerdrop": 0.0,
35
+ "encoder_layers": 6,
36
+ "encoder_tying_config": null,
37
+ "eos_token_id": 32001,
38
+ "expert_ffn_size": 128,
39
+ "features_embed_dims": null,
40
+ "features_vocab_sizes": null,
41
+ "gradient_checkpointing": false,
42
+ "gradient_reversal_for_domain_classifier": false,
43
+ "hypercomplex": false,
44
+ "hypercomplex_n": 2,
45
+ "ia3_adaptors": false,
46
+ "init_std": 0.02,
47
+ "initialization_scheme": "static",
48
+ "is_encoder_decoder": true,
49
+ "layernorm_adaptor_input": false,
50
+ "layernorm_prompt_projection": false,
51
+ "lora_adaptor_rank": 2,
52
+ "lora_adaptors": false,
53
+ "max_position_embeddings": 1024,
54
+ "mid_fusion_layers": 3,
55
+ "model_type": "mbart",
56
+ "moe_adaptors": false,
57
+ "multi_source": false,
58
+ "multi_source_method": null,
59
+ "multilayer_softmaxing": null,
60
+ "no_embed_norm": false,
61
+ "no_positional_encoding_decoder": false,
62
+ "no_positional_encoding_encoder": false,
63
+ "no_projection_prompt": false,
64
+ "no_scale_attention_embedding": false,
65
+ "num_domains_for_domain_classifier": 1,
66
+ "num_experts": 8,
67
+ "num_hidden_layers": 6,
68
+ "num_moe_adaptor_experts": 4,
69
+ "num_prompts": 100,
70
+ "num_sparsify_blocks": 8,
71
+ "pad_token_id": 0,
72
+ "parallel_adaptors": false,
73
+ "positional_encodings": false,
74
+ "postnorm_decoder": false,
75
+ "postnorm_encoder": false,
76
+ "prompt_dropout": 0.1,
77
+ "prompt_init_std": 0.02,
78
+ "prompt_projection_hidden_size": 4096,
79
+ "prompt_tuning": false,
80
+ "recurrent_projections": 1,
81
+ "residual_connection_adaptor": false,
82
+ "residual_connection_prompt": false,
83
+ "rope_encoding": false,
84
+ "scale_embedding": false,
85
+ "softmax_bias_tuning": false,
86
+ "softmax_temperature": 1.0,
87
+ "sparsification_temperature": 3.0,
88
+ "sparsify_attention": false,
89
+ "sparsify_ffn": false,
90
+ "target_vocab_size": 0,
91
+ "temperature_calibration": false,
92
+ "tokenizer_class": "AlbertTokenizer",
93
+ "transformers_version": "4.3.2",
94
+ "unidirectional_encoder": false,
95
+ "use_cache": true,
96
+ "use_moe": false,
97
+ "use_tanh_activation_prompt": false,
98
+ "vocab_size": 32004,
99
+ "wait_k": -1
100
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49ea52f2a75af4c0ac05bb98b94e6c07d997f11c516344c9ac0ffaf8649dc071
3
+ size 845184813
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}, "additional_special_tokens": ["<s>", "</s>", "<2km>", "<2en>"]}
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b052ec9a665776835d69c50d2226e5e55574db8fbbce5563f217cbe18f91d41
3
+ size 783261
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": false, "remove_space": true, "keep_accents": true, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "use_fast": false, "strip_accents": false, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "models/tokenizers/albert-kmen32k"}