Upload 7 files
- README.md +21 -3
- added_tokens.json +1 -0
- config.json +100 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- spiece.model +3 -0
- tokenizer_config.json +1 -0
README.md
CHANGED
@@ -1,3 +1,21 @@
----
-license: mit
----
+---
+license: mit
+language: km
+---
+
+# PrahokBART (big-sized model)
+
+PrahokBART is a pre-trained sequence-to-sequence model trained from scratch for Khmer on carefully curated Khmer and English corpora. It addresses Khmer-specific linguistic issues by incorporating components such as word segmentation and text normalization into pre-training. The model can be fine-tuned to build natural language generation applications for Khmer, such as English<->Khmer translation, summarization, and headline generation, and it is more efficient than mBART50. You can read more about PrahokBART in this [paper](https://aclanthology.org/2025.coling-main.87/).
+
+Finetuning code is available on [GitHub](https://github.com/hour/prahokbart).
+
+# Citation
+```bibtex
+@inproceedings{kaing2025prahokbart,
+  title={PrahokBART: A Pre-trained Sequence-to-Sequence Model for Khmer Natural Language Generation},
+  author={Kaing, Hour and Dabre, Raj and Song, Haiyue and Tran, Van-Hien and Tanaka, Hideki and Utiyama, Masao},
+  booktitle={Proceedings of the 31st International Conference on Computational Linguistics},
+  pages={1309--1322},
+  year={2025}
+}
+```
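For reference, a minimal inference sketch against the files uploaded in this commit. The repository path is a placeholder, and the input format (source sentence followed by `</s>` and a target-language tag, with the decoder primed by the tag) is an assumption carried over from the closely related IndicBART convention; the GitHub repo above has the authoritative recipe:

```python
from transformers import AutoTokenizer, MBartForConditionalGeneration

repo = "path/to/prahokbart"  # placeholder: this repository's Hub id or a local checkout

# config.json sets tokenizer_class to AlbertTokenizer over spiece.model, so force the slow tokenizer
tokenizer = AutoTokenizer.from_pretrained(repo, do_lower_case=False, use_fast=False, keep_accents=True)
model = MBartForConditionalGeneration.from_pretrained(repo)

# Assumed format: "<source sentence> </s> <2xx>", using the language tags from added_tokens.json
inp = tokenizer("I am a student </s> <2en>", add_special_tokens=False, return_tensors="pt").input_ids

out = model.generate(
    inp,
    max_length=64,
    num_beams=4,
    early_stopping=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.convert_tokens_to_ids("</s>"),             # 32001 per config.json
    decoder_start_token_id=tokenizer.convert_tokens_to_ids("<2km>"),  # translate into Khmer
)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```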
added_tokens.json
ADDED
@@ -0,0 +1 @@
+{"<s>": 32000, "</s>": 32001, "<2km>": 32002, "<2en>": 32003}
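These four entries extend the 32,000-piece SentencePiece vocabulary (hence `"vocab_size": 32004` in config.json below); `<2km>` and `<2en>` are the target-language tags. A quick sanity check, assuming the commit's files are checked out in the current directory:

```python
from transformers import AutoTokenizer

# load the slow Albert tokenizer from the checked-out repo directory
tok = AutoTokenizer.from_pretrained(".", use_fast=False)
print(tok.convert_tokens_to_ids(["<s>", "</s>", "<2km>", "<2en>"]))
# expected: [32000, 32001, 32002, 32003], matching added_tokens.json
```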
config.json
ADDED
@@ -0,0 +1,100 @@
+{
+  "activation_dropout": 0.1,
+  "activation_function": "gelu",
+  "adaptor_activation_function": "gelu",
+  "adaptor_dropout": 0.1,
+  "adaptor_hidden_size": 512,
+  "adaptor_init_std": 0.02,
+  "adaptor_scaling_factor": 1.0,
+  "adaptor_tuning": false,
+  "additional_source_wait_k": -1,
+  "alibi_encoding": false,
+  "architectures": [
+    "MBartForConditionalGeneration"
+  ],
+  "asymmetric_alibi_encoding": false,
+  "attention_dropout": 0.1,
+  "bos_token_id": 32000,
+  "bottleneck_mid_fusion_tokens": 4,
+  "classifier_dropout": 0.0,
+  "d_model": 1024,
+  "decoder_adaptor_tying_config": null,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_tying_config": null,
+  "deep_adaptor_tuning": false,
+  "deep_adaptor_tuning_ffn_only": false,
+  "dropout": 0.1,
+  "embed_low_rank_dim": 0,
+  "encoder_adaptor_tying_config": null,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 6,
+  "encoder_tying_config": null,
+  "eos_token_id": 32001,
+  "expert_ffn_size": 128,
+  "features_embed_dims": null,
+  "features_vocab_sizes": null,
+  "gradient_checkpointing": false,
+  "gradient_reversal_for_domain_classifier": false,
+  "hypercomplex": false,
+  "hypercomplex_n": 2,
+  "ia3_adaptors": false,
+  "init_std": 0.02,
+  "initialization_scheme": "static",
+  "is_encoder_decoder": true,
+  "layernorm_adaptor_input": false,
+  "layernorm_prompt_projection": false,
+  "lora_adaptor_rank": 2,
+  "lora_adaptors": false,
+  "max_position_embeddings": 1024,
+  "mid_fusion_layers": 3,
+  "model_type": "mbart",
+  "moe_adaptors": false,
+  "multi_source": false,
+  "multi_source_method": null,
+  "multilayer_softmaxing": null,
+  "no_embed_norm": false,
+  "no_positional_encoding_decoder": false,
+  "no_positional_encoding_encoder": false,
+  "no_projection_prompt": false,
+  "no_scale_attention_embedding": false,
+  "num_domains_for_domain_classifier": 1,
+  "num_experts": 8,
+  "num_hidden_layers": 6,
+  "num_moe_adaptor_experts": 4,
+  "num_prompts": 100,
+  "num_sparsify_blocks": 8,
+  "pad_token_id": 0,
+  "parallel_adaptors": false,
+  "positional_encodings": false,
+  "postnorm_decoder": false,
+  "postnorm_encoder": false,
+  "prompt_dropout": 0.1,
+  "prompt_init_std": 0.02,
+  "prompt_projection_hidden_size": 4096,
+  "prompt_tuning": false,
+  "recurrent_projections": 1,
+  "residual_connection_adaptor": false,
+  "residual_connection_prompt": false,
+  "rope_encoding": false,
+  "scale_embedding": false,
+  "softmax_bias_tuning": false,
+  "softmax_temperature": 1.0,
+  "sparsification_temperature": 3.0,
+  "sparsify_attention": false,
+  "sparsify_ffn": false,
+  "target_vocab_size": 0,
+  "temperature_calibration": false,
+  "tokenizer_class": "AlbertTokenizer",
+  "transformers_version": "4.3.2",
+  "unidirectional_encoder": false,
+  "use_cache": true,
+  "use_moe": false,
+  "use_tanh_activation_prompt": false,
+  "vocab_size": 32004,
+  "wait_k": -1
+}
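Many of these keys (`adaptor_*`, `prompt_*`, `wait_k`, `multi_source`, ...) are not standard mBART options; they appear to come from the modified-transformers toolkit used for training (likely YANMTT, which the finetuning repo builds on). Stock transformers still loads the file as a plain `MBartConfig`, carrying the unknown keys along as extra attributes, so the architecture can be re-instantiated as a sketch, assuming the commit's files are in the current directory:

```python
from transformers import MBartConfig, MBartForConditionalGeneration

cfg = MBartConfig.from_pretrained(".")  # reads the config.json above
print(cfg.d_model, cfg.encoder_layers, cfg.decoder_layers)  # 1024 6 6

# randomly initialized model with the same architecture (not the pretrained weights)
model = MBartForConditionalGeneration(cfg)
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")
```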
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49ea52f2a75af4c0ac05bb98b94e6c07d997f11c516344c9ac0ffaf8649dc071
+size 845184813
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}, "additional_special_tokens": ["<s>", "</s>", "<2km>", "<2en>"]}
spiece.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b052ec9a665776835d69c50d2226e5e55574db8fbbce5563f217cbe18f91d41
+size 783261
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"do_lower_case": false, "remove_space": true, "keep_accents": true, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "use_fast": false, "strip_accents": false, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "models/tokenizers/albert-kmen32k"}