JonusNattapong committed
Commit 55f2143 (verified)
Parent: 27c05b7

Upload folder using huggingface_hub

Files changed (5):
1. README.md +38 -0
2. metadata.json +485 -0
3. tokenizer.json +0 -0
4. usage_examples.json +5 -0
5. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,38 @@
# Advanced Thai Tokenizer V3

## Overview
An advanced Thai tokenizer with improved handling of Thai text, mixed Thai-English content, and modern vocabulary.

## Performance
- Overall Accuracy: 24/24 test cases (100.0%)
- Vocabulary Size: 35,590 tokens
- Average Compression: 3.45 chars/token

## Key Features
- ✅ No Thai character corruption
- ✅ Handles mixed Thai-English content
- ✅ Modern vocabulary (internet and technology terms)
- ✅ Efficient compression
- ✅ Clean decoding without artifacts

## Quick Start
```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")
text = "สวัสดีครับ วันนี้อากาศดีมาก"
encoding = tokenizer.encode(text)

# Best decoding method: join raw tokens, skipping special tokens (anything wrapped in <...>)
decoded = "".join(
    token for token in encoding.tokens
    if not (token.startswith("<") and token.endswith(">"))
)
```
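
The Average Compression figure above is simply characters per token. As a quick check, here is a minimal sketch that reuses `tokenizer`, `text`, and `encoding` from the snippet above (the exact ratio varies with the input):

```python
# Characters per token for this input; the 3.45 figure is an average over the test set
ratio = len(text) / len(encoding.tokens)
print(f"Tokens:    {encoding.tokens}")
print(f"Token IDs: {encoding.ids}")
print(f"Compression: {ratio:.2f} chars/token")
```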

## Files
- `tokenizer.json` - Main tokenizer file
- `vocab.json` - Vocabulary mapping
- `metadata.json` - Performance and configuration details
- `usage_examples.json` - Code examples
- `README.md` - This file

Created: July 2025
metadata.json ADDED
@@ -0,0 +1,485 @@
{
  "model_info": {
    "version": "2.0",
    "model_type": "unigram",
    "vocab_size": 35590,
    "creation_date": "2025-07-02",
    "language": "thai",
    "description": "Advanced Thai tokenizer with improved handling of Thai text, mixed content, and modern vocabulary"
  },
  "performance": {
    "test_results": {
      "overall": { "passed": 24, "total": 24 },
      "categories": {
        "basic_thai": {
          "passed": 4,
          "total": 4,
          "details": [
            { "input": "สวัสดี", "tokens": ["สวัสด", "ี"], "token_count": 2, "decoded": "สวัสดี", "success": true },
            { "input": "ขอบคุณ", "tokens": ["ขอบ", "คุณ"], "token_count": 2, "decoded": "ขอบคุณ", "success": true },
            { "input": "ครับ", "tokens": ["ครับ"], "token_count": 1, "decoded": "ครับ", "success": true },
            { "input": "ค่ะ", "tokens": ["ค่ะ"], "token_count": 1, "decoded": "ค่ะ", "success": true }
          ]
        },
        "thai_with_spaces": {
          "passed": 3,
          "total": 3,
          "details": [
            { "input": "กิน ข้าว อร่อย", "tokens": ["กิน", " ", "ข้าว", " ", "อ", "ร่อย"], "token_count": 6, "decoded": "กิน ข้าว อร่อย", "success": true },
            { "input": "วันนี้ อากาศ ดี", "tokens": ["วัน", "นี้", " ", "อากาศ", " ", "ด", "ี"], "token_count": 7, "decoded": "วันนี้ อากาศ ดี", "success": true },
            { "input": "ผม ชื่อ จอห์น", "tokens": ["ผ", "ม", " ", "ชื่อ", " ", "จอห์น"], "token_count": 6, "decoded": "ผม ชื่อ จอห์น", "success": true }
          ]
        },
        "mixed_content": {
          "passed": 3,
          "total": 3,
          "details": [
            { "input": "123 สวัสดี abc", "tokens": ["1", "2", "3", " ", "สวัสด", "ี", " ", "abc"], "token_count": 8, "decoded": "123 สวัสดี abc", "success": true },
            { "input": "Hello ครับ", "tokens": ["Hello", " ", "ครับ"], "token_count": 3, "decoded": "Hello ครับ", "success": true },
            { "input": "COVID-19 ระบาด", "tokens": ["COVID", "-", "1", "9", " ", "ระบาด"], "token_count": 6, "decoded": "COVID-19 ระบาด", "success": true }
          ]
        },
        "formal_thai": {
          "passed": 2,
          "total": 2,
          "details": [
            { "input": "พระบาทสมเด็จพระเจ้าอยู่หัว", "tokens": ["พระบาทสมเด็จพระ", "เจ้าอยู่หัว"], "token_count": 2, "decoded": "พระบาทสมเด็จพระเจ้าอยู่หัว", "success": true },
            { "input": "การประชุมสำคัญ", "tokens": ["การประชุม", "สำคัญ"], "token_count": 2, "decoded": "การประชุมสำคัญ", "success": true }
          ]
        },
        "casual_thai": {
          "passed": 3,
          "total": 3,
          "details": [
            { "input": "อร่อยจัง", "tokens": ["อ", "ร่อย", "จัง"], "token_count": 3, "decoded": "อร่อยจัง", "success": true },
            { "input": "แพงมาก", "tokens": ["แพง", "มาก"], "token_count": 2, "decoded": "แพงมาก", "success": true },
            { "input": "ถูกมาก", "tokens": ["ถูก", "มาก"], "token_count": 2, "decoded": "ถูกมาก", "success": true }
          ]
        },
        "complex_thai": {
          "passed": 3,
          "total": 3,
          "details": [
            { "input": "กรุงเทพมหานคร", "tokens": ["กรุงเทพ", "มหา", "นคร"], "token_count": 3, "decoded": "กรุงเทพมหานคร", "success": true },
            { "input": "ราชมงคลธัญบุรี", "tokens": ["ราช", "มงคล", "ธัญ", "บุรี"], "token_count": 4, "decoded": "ราชมงคลธัญบุรี", "success": true },
            { "input": "จุฬาลงกรณ์มหาวิทยาลัย", "tokens": ["จุฬาล", "ง", "กรณ์", "มหาวิทยาลัย"], "token_count": 4, "decoded": "จุฬาลงกรณ์มหาวิทยาลัย", "success": true }
          ]
        },
        "numbers_dates": {
          "passed": 3,
          "total": 3,
          "details": [
            { "input": "1 มกราคม 2567", "tokens": ["1", " ", "มกรา", "ค", "ม", " ", "2", "567"], "token_count": 8, "decoded": "1 มกราคม 2567", "success": true },
            { "input": "เวลา 14:30 น.", "tokens": ["เวลา", " ", "1", "4", ":", "30", " ", "น", "."], "token_count": 9, "decoded": "เวลา 14:30 น.", "success": true },
            { "input": "ราคา 1,234 บาท", "tokens": ["ราคา", " ", "1", ",", "2", "34", " ", "บาท"], "token_count": 8, "decoded": "ราคา 1,234 บาท", "success": true }
          ]
        },
        "technology": {
          "passed": 3,
          "total": 3,
          "details": [
            { "input": "อินเทอร์เน็ต", "tokens": ["อินเทอร์เน็ต"], "token_count": 1, "decoded": "อินเทอร์เน็ต", "success": true },
            { "input": "โทรศัพท์มือถือ", "tokens": ["โทรศัพท์", "มือถือ"], "token_count": 2, "decoded": "โทรศัพท์มือถือ", "success": true },
            { "input": "แอปพลิเคชัน", "tokens": ["แอปพลิเคชั", "น"], "token_count": 2, "decoded": "แอปพลิเคชัน", "success": true }
          ]
        }
      }
    },
    "efficiency": {
      "compression_ratios": [3.0, 2.75, 2.6470588235294117, 6.7, 2.1666666666666665],
      "avg_tokens_per_char": 0.30726256983240224,
      "vocab_coverage": 0.0010958134307389716,
      "details": [
        { "sentence": "สวัสดี", "char_count": 6, "token_count": 2, "compression_ratio": 3.0, "tokens": ["สวัสด", "ี"] },
        { "sentence": "สวัสดีครับ ผมชื่อจอห์น", "char_count": 22, "token_count": 8, "compression_ratio": 2.75, "tokens": ["สวัสด", "ี", "ครับ", " ", "ผ", "ม", "ชื่อ", "จอห์น"] },
        { "sentence": "วันนี้อากาศดีมาก ผมจึงไปเดินเล่นที่สวนสาธารณะ", "char_count": 45, "token_count": 17, "compression_ratio": 2.6470588235294117, "tokens": ["วัน", "นี้", "อากาศ", "ด", "ี", "มาก", " ", "ผ", "ม", "จึง", "ไป", "เดิน", "เล่น", "ที่", "สวน", "สาธารณ", "ะ"] },
        { "sentence": "พระบาทสมเด็จพระเจ้าอยู่หัวทรงพระกรุณาโปรดเกล้าฯ ให้จัดงานพระราชพิธี", "char_count": 67, "token_count": 10, "compression_ratio": 6.7, "tokens": ["พระบาทสมเด็จพระ", "เจ้าอยู่หัว", "ทรง", "พระกรุณา", "โปรดเกล้า", "ฯ ", "ให้", "จัด", "งาน", "พระราชพิธี"] },
        { "sentence": "555 อร่อยมากกก กินข้าวยัง? #อาหารไทย 🇹🇭", "char_count": 39, "token_count": 18, "compression_ratio": 2.1666666666666665, "tokens": ["555", " ", "อ", "ร่อย", "มาก", "ก", "ก", " ", "กิน", "ข้าว", "ยัง", "?", " ", "#", "อาหาร", "ไทย", " ", "🇹🇭"] }
      ]
    },
    "overall_accuracy": "24/24"
  },
  "features": [
    "No normalization (preserves Thai characters)",
    "Smart punctuation handling",
    "Mixed Thai-English support",
    "Modern vocabulary coverage",
    "Efficient compression",
    "Direct decoding without artifacts"
  ],
  "usage_notes": {
    "best_decoding": "manual concatenation of non-special tokens",
    "recommended_for": ["Thai NLP", "LLM training", "Text processing", "Social media analysis"],
    "avoid": ["Text normalization", "Byte-level fallback", "Aggressive post-processing"]
  }
}
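
Each `compression_ratio` in the efficiency block is simply `char_count / token_count`. As a sanity check, a minimal sketch (it assumes a local copy of `metadata.json` in the working directory):

```python
import json

with open("metadata.json", encoding="utf-8") as f:
    meta = json.load(f)

# Recompute each ratio from the stored counts and compare with the recorded value
for d in meta["performance"]["efficiency"]["details"]:
    ratio = d["char_count"] / d["token_count"]
    assert abs(ratio - d["compression_ratio"]) < 1e-9
    print(f"{d['char_count']:3d} chars / {d['token_count']:2d} tokens = {ratio:.2f}")
```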
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
usage_examples.json ADDED
@@ -0,0 +1,5 @@
{
  "basic_usage": "\nfrom tokenizers import Tokenizer\n\n# Load tokenizer\ntokenizer = Tokenizer.from_file(\"AdvancedThaiTokenizerV2/tokenizer.json\")\n\n# Encode Thai text\ntext = \"สวัสดีครับ วันนี้อากาศดีมาก\"\nencoding = tokenizer.encode(text)\n\n# Best decoding method for Thai\ndecoded = \"\"\nfor token in encoding.tokens:\n    if not (token.startswith('<') and token.endswith('>')):\n        decoded += token\n\nprint(f\"Original: {text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n",
  "batch_processing": "\n# Process multiple Thai sentences\nsentences = [\n    \"กินข้าวยัง\",\n    \"ไปไหนมา\",\n    \"สบายดีไหม\"\n]\n\nfor sentence in sentences:\n    encoding = tokenizer.encode(sentence)\n    # Use manual decoding for best results\n    decoded = \"\".join(token for token in encoding.tokens\n                      if not (token.startswith('<') and token.endswith('>')))\n    print(f\"{sentence} -> {decoded}\")\n",
  "mixed_content": "\n# Handle Thai-English mixed content\nmixed_text = \"Hello สวัสดี COVID-19 ระบาด\"\nencoding = tokenizer.encode(mixed_text)\n\n# Manual decoding preserves mixed content\ndecoded = \"\".join(token for token in encoding.tokens\n                  if not (token.startswith('<') and token.endswith('>')))\n\nprint(f\"Mixed: {mixed_text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n"
}
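
Because `usage_examples.json` stores each example as a plain string of Python source, you can list the snippets straight from the file. A minimal sketch (assumes the file is in the working directory):

```python
import json

with open("usage_examples.json", encoding="utf-8") as f:
    examples = json.load(f)

# Print each named snippet so it can be copied into a script or REPL
for name, code in examples.items():
    print(f"### {name}")
    print(code.strip(), end="\n\n")
```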
vocab.json ADDED
The diff for this file is too large to render. See raw diff