Upload 7 files
- README.md +21 -3
- added_tokens.json +1 -0
- config.json +100 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- spiece.model +3 -0
- tokenizer_config.json +1 -0
README.md
CHANGED
@@ -1,3 +1,21 @@
----
-license: mit
----
+---
+license: mit
+language: km
+---
+
+# PrahokBART (big-sized model)
+
+PrahokBART is a pre-trained sequence-to-sequence model trained from scratch for Khmer on carefully curated Khmer and English corpora. It addresses Khmer-specific linguistic issues by incorporating components such as word segmentation and text normalization into pre-training. The model can be fine-tuned to build natural language generation applications for Khmer, such as English<->Khmer translation, summarization, and headline generation, and it is more efficient than mBART50. You can read more about PrahokBART in this [paper](https://aclanthology.org/2025.coling-main.87/).
+
+Finetuning code is available on [GitHub](https://github.com/hour/prahokbart).
+
+# Citation
+```bibtex
+@inproceedings{kaing2025prahokbart,
+  title={PrahokBART: A Pre-trained Sequence-to-Sequence Model for Khmer Natural Language Generation},
+  author={Kaing, Hour and Dabre, Raj and Song, Haiyue and Tran, Van-Hien and Tanaka, Hideki and Utiyama, Masao},
+  booktitle={Proceedings of the 31st International Conference on Computational Linguistics},
+  pages={1309--1322},
+  year={2025}
+}
+```
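For reference, a minimal inference sketch against the files uploaded in this commit. The repository path is a placeholder, and the input format (source sentence followed by `</s>` and a target-language tag, with the decoder primed by the tag) is an assumption carried over from the closely related IndicBART convention; the GitHub repo above has the authoritative recipe:

```python
from transformers import AutoTokenizer, MBartForConditionalGeneration

repo = "path/to/prahokbart"  # placeholder: this repository's Hub id or a local checkout

# config.json sets tokenizer_class to AlbertTokenizer over spiece.model, so force the slow tokenizer
tokenizer = AutoTokenizer.from_pretrained(repo, do_lower_case=False, use_fast=False, keep_accents=True)
model = MBartForConditionalGeneration.from_pretrained(repo)

# Assumed format: "<source sentence> </s> <2xx>", using the language tags from added_tokens.json
inp = tokenizer("I am a student </s> <2en>", add_special_tokens=False, return_tensors="pt").input_ids

out = model.generate(
    inp,
    max_length=64,
    num_beams=4,
    early_stopping=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.convert_tokens_to_ids("</s>"),             # 32001 per config.json
    decoder_start_token_id=tokenizer.convert_tokens_to_ids("<2km>"),  # translate into Khmer
)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```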
added_tokens.json
ADDED
@@ -0,0 +1 @@
+{"<s>": 32000, "</s>": 32001, "<2km>": 32002, "<2en>": 32003}
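These four entries extend the 32,000-piece SentencePiece vocabulary (hence `"vocab_size": 32004` in config.json below); `<2km>` and `<2en>` are the target-language tags. A quick sanity check, assuming the commit's files are checked out in the current directory:

```python
from transformers import AutoTokenizer

# load the slow Albert tokenizer from the checked-out repo directory
tok = AutoTokenizer.from_pretrained(".", use_fast=False)
print(tok.convert_tokens_to_ids(["<s>", "</s>", "<2km>", "<2en>"]))
# expected: [32000, 32001, 32002, 32003], matching added_tokens.json
```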
config.json
ADDED
@@ -0,0 +1,100 @@
+{
+  "activation_dropout": 0.1,
+  "activation_function": "gelu",
+  "adaptor_activation_function": "gelu",
+  "adaptor_dropout": 0.1,
+  "adaptor_hidden_size": 512,
+  "adaptor_init_std": 0.02,
+  "adaptor_scaling_factor": 1.0,
+  "adaptor_tuning": false,
+  "additional_source_wait_k": -1,
+  "alibi_encoding": false,
+  "architectures": [
+    "MBartForConditionalGeneration"
+  ],
+  "asymmetric_alibi_encoding": false,
+  "attention_dropout": 0.1,
+  "bos_token_id": 32000,
+  "bottleneck_mid_fusion_tokens": 4,
+  "classifier_dropout": 0.0,
+  "d_model": 1024,
+  "decoder_adaptor_tying_config": null,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_tying_config": null,
+  "deep_adaptor_tuning": false,
+  "deep_adaptor_tuning_ffn_only": false,
+  "dropout": 0.1,
+  "embed_low_rank_dim": 0,
+  "encoder_adaptor_tying_config": null,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 6,
+  "encoder_tying_config": null,
+  "eos_token_id": 32001,
+  "expert_ffn_size": 128,
+  "features_embed_dims": null,
+  "features_vocab_sizes": null,
+  "gradient_checkpointing": false,
+  "gradient_reversal_for_domain_classifier": false,
+  "hypercomplex": false,
+  "hypercomplex_n": 2,
+  "ia3_adaptors": false,
+  "init_std": 0.02,
+  "initialization_scheme": "static",
+  "is_encoder_decoder": true,
+  "layernorm_adaptor_input": false,
+  "layernorm_prompt_projection": false,
+  "lora_adaptor_rank": 2,
+  "lora_adaptors": false,
+  "max_position_embeddings": 1024,
+  "mid_fusion_layers": 3,
+  "model_type": "mbart",
+  "moe_adaptors": false,
+  "multi_source": false,
+  "multi_source_method": null,
+  "multilayer_softmaxing": null,
+  "no_embed_norm": false,
+  "no_positional_encoding_decoder": false,
+  "no_positional_encoding_encoder": false,
+  "no_projection_prompt": false,
+  "no_scale_attention_embedding": false,
+  "num_domains_for_domain_classifier": 1,
+  "num_experts": 8,
+  "num_hidden_layers": 6,
+  "num_moe_adaptor_experts": 4,
+  "num_prompts": 100,
+  "num_sparsify_blocks": 8,
+  "pad_token_id": 0,
+  "parallel_adaptors": false,
+  "positional_encodings": false,
+  "postnorm_decoder": false,
+  "postnorm_encoder": false,
+  "prompt_dropout": 0.1,
+  "prompt_init_std": 0.02,
+  "prompt_projection_hidden_size": 4096,
+  "prompt_tuning": false,
+  "recurrent_projections": 1,
+  "residual_connection_adaptor": false,
+  "residual_connection_prompt": false,
+  "rope_encoding": false,
+  "scale_embedding": false,
+  "softmax_bias_tuning": false,
+  "softmax_temperature": 1.0,
+  "sparsification_temperature": 3.0,
+  "sparsify_attention": false,
+  "sparsify_ffn": false,
+  "target_vocab_size": 0,
+  "temperature_calibration": false,
+  "tokenizer_class": "AlbertTokenizer",
+  "transformers_version": "4.3.2",
+  "unidirectional_encoder": false,
+  "use_cache": true,
+  "use_moe": false,
+  "use_tanh_activation_prompt": false,
+  "vocab_size": 32004,
+  "wait_k": -1
+}
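Many of these keys (`adaptor_*`, `prompt_*`, `wait_k`, `multi_source`, ...) are not standard mBART options; they appear to come from the modified-transformers toolkit used for training (likely YANMTT, which the finetuning repo builds on). Stock transformers still loads the file as a plain `MBartConfig`, carrying the unknown keys along as extra attributes, so the architecture can be re-instantiated as a sketch, assuming the commit's files are in the current directory:

```python
from transformers import MBartConfig, MBartForConditionalGeneration

cfg = MBartConfig.from_pretrained(".")  # reads the config.json above
print(cfg.d_model, cfg.encoder_layers, cfg.decoder_layers)  # 1024 6 6

# randomly initialized model with the same architecture (not the pretrained weights)
model = MBartForConditionalGeneration(cfg)
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")
```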
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49ea52f2a75af4c0ac05bb98b94e6c07d997f11c516344c9ac0ffaf8649dc071
+size 845184813
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}, "additional_special_tokens": ["<s>", "</s>", "<2km>", "<2en>"]}
spiece.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b052ec9a665776835d69c50d2226e5e55574db8fbbce5563f217cbe18f91d41
+size 783261
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"do_lower_case": false, "remove_space": true, "keep_accents": true, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "use_fast": false, "strip_accents": false, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "models/tokenizers/albert-kmen32k"}