Upload 5 files

Browse files

Files changed (5) hide show

MONOPHONIC_CHARS.txt +0 -0
POLYPHONIC_CHARS.txt +0 -0
README.md +49 -1
config.py +39 -0
g2pw.onnx +3 -0

MONOPHONIC_CHARS.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

POLYPHONIC_CHARS.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

README.md CHANGED Viewed

@@ -1,3 +1,51 @@
 ---
-license: cc-by-4.0
 ---

 ---
+language:
+- yue
+pretty_name: "Cantonese (yue) G2PW model - bert base"
+tags:
+- g2p
+license: "cc-by-4.0"
+task_categories:
+- text2text-generation
+datasets:
+- Naozumi0512/g2p-Cantonese-aggregate
 ---
+# g2pW-canto-20241201-bert-base
+This is a **G2P (Grapheme-to-Phoneme)** model trained on the [Naozumi0512/g2p-Cantonese-aggregate](https://huggingface.co/datasets/Naozumi0512/g2p-Cantonese-aggregate) dataset and evaluated on the [yue-g2p-benchmark](https://github.com/hon9kon9ize/yue-g2p-benchmark).
+## Model Overview
+The model uses **[hon9kon9ize/bert-base-cantonese](https://huggingface.co/hon9kon9ize/bert-base-cantonese)**. For more details, see https://github.com/Naozumi520/g2pW-Cantonese.
+
+---
+## Dataset
+The model was trained on the [Naozumi0512/g2p-Cantonese-aggregate](https://huggingface.co/datasets/Naozumi0512/g2p-Cantonese-aggregate) dataset, which includes:
+- **68,500 Cantonese words/phrases** with corresponding phonetic transcriptions.
+- Data formatted to align with the **CPP (Chinese Polyphones with Pinyin)** structure.
+- Sources include:
+  - Rime Cantonese Input Schema (`jyut6ping3.words.dict.yaml`)
+  - 粵典 Words.hk
+  - CantoDict
+---
+## Evaluation
+The model was evaluated on the [yue-g2p-benchmark](https://github.com/hon9kon9ize/yue-g2p-benchmark):
+| Metric                  | Score  |
+|-------------------------|--------|
+| **Accuracy**            | 0.6873 |
+| **Levenshtein Distance**| 0.1789 |
+| **Phoneme Error Rate**  | 0.2083 |
+---
+## Inference
+For inference, see https://github.com/Naozumi520/g2pW-Cantonese.

config.py ADDED Viewed

	@@ -0,0 +1,39 @@

+root = './rimeExtract_dataset/'  # base directory prepended to the data file paths below
+manual_seed = 1313  # presumably the random seed for reproducible runs; confirm in the training script
+model_source = './bert-base-cantonese'  # local path; presumably a copy of the bert-base-cantonese checkpoint -- verify
+polyphonic_chars_path = root + 'POLYPHONIC_CHARS.txt'
+window_size = 32  # NOTE(review): unit (characters vs. tokens) is not visible here; confirm in the consumer
+num_workers = 2  # presumably the data-loading worker count; confirm
+use_mask = True
+use_conditional = True  # presumably switches on the options in param_conditional; confirm
+param_conditional = {
+    'bias': True,
+    'char-linear': True,
+    'pos-linear': False,  # the only conditional option turned off in this run
+    'char+pos-second': True,
+}
+# for training
+exp_name = '20241206_BERT_B_DescWS-Sec-cLin-B_POS_hkcancor_w03'  # free-form run label; 'w03' presumably mirrors param_pos['weight'] = 0.3
+train_sent_path = root + 'train.sent'
+train_lb_path = root + 'train.lb'
+valid_sent_path = root + 'dev.sent'
+valid_lb_path = root + 'dev.lb'
+test_sent_path = root + 'test.sent'
+test_lb_path = root + 'test.lb'
+batch_size = 128
+lr = 5e-5
+val_interval = 200  # presumably validation runs every 200 iterations; confirm
+num_iter = 13000
+use_pos = True  # presumably enables the POS settings in param_pos; confirm
+param_pos = {
+    'weight': 0.3,  # presumably the weight of the POS loss term; confirm
+    'pos_joint_training': True,
+    # 'train_pos_path': root + 'train.pos',
+    # 'valid_pos_path': root + 'dev.pos',
+    # 'test_pos_path': root + 'test.pos',
+    'train_pos_path': root + 'train_hkcancor.pos',  # the *_hkcancor.pos files are the active ones; the plain .pos paths are commented out above
+    'valid_pos_path': root + 'dev_hkcancor.pos',
+    'test_pos_path': root + 'test_hkcancor.pos',
+}

g2pw.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d04732ba7697b617e17e8ffc0895cb22c5db5f96f12b481b438eeee5d53f9d7
+size 1203023863