JonusNattapong committed on
Commit
7784060
·
verified ·
1 Parent(s): 1630c96

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. usage_examples.json +1 -1
usage_examples.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "basic_usage": "\nfrom tokenizers import Tokenizer\n\n# Load tokenizer\ntokenizer = Tokenizer.from_file(\"AdvancedThaiTokenizerV2/tokenizer.json\")\n\n# Encode Thai text\ntext = \"สวัสดีครับ วันนี้อากาศดีมาก\"\nencoding = tokenizer.encode(text)\n\n# Best decoding method for Thai\ndecoded = \"\"\nfor token in encoding.tokens:\n if not (token.startswith('<') and token.endswith('>')):\n decoded += token\n\nprint(f\"Original: {text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n",
3
  "batch_processing": "\n# Process multiple Thai sentences\nsentences = [\n \"กินข้าวยัง\",\n \"ไปไหนมา\", \n \"สบายดีไหม\"\n]\n\nfor sentence in sentences:\n encoding = tokenizer.encode(sentence)\n # Use manual decoding for best results\n decoded = \"\".join(token for token in encoding.tokens \n if not (token.startswith('<') and token.endswith('>')))\n print(f\"{sentence} -> {decoded}\")\n",
4
  "mixed_content": "\n# Handle Thai-English mixed content\nmixed_text = \"Hello สวัสดี COVID-19 ระบาด\"\nencoding = tokenizer.encode(mixed_text)\n\n# Manual decoding preserves mixed content\ndecoded = \"\".join(token for token in encoding.tokens \n if not (token.startswith('<') and token.endswith('>')))\n\nprint(f\"Mixed: {mixed_text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n"
5
  }
 
1
  {
2
+ "basic_usage": "\nfrom tokenizers import Tokenizer\n\n# Load tokenizer\ntokenizer = Tokenizer.from_file(\"AdvancedThaiTokenizerV3/tokenizer.json\")\n\n# Encode Thai text\ntext = \"สวัสดีครับ วันนี้อากาศดีมาก\"\nencoding = tokenizer.encode(text)\n\n# Best decoding method for Thai\ndecoded = \"\"\nfor token in encoding.tokens:\n if not (token.startswith('<') and token.endswith('>')):\n decoded += token\n\nprint(f\"Original: {text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n",
3
  "batch_processing": "\n# Process multiple Thai sentences\nsentences = [\n \"กินข้าวยัง\",\n \"ไปไหนมา\", \n \"สบายดีไหม\"\n]\n\nfor sentence in sentences:\n encoding = tokenizer.encode(sentence)\n # Use manual decoding for best results\n decoded = \"\".join(token for token in encoding.tokens \n if not (token.startswith('<') and token.endswith('>')))\n print(f\"{sentence} -> {decoded}\")\n",
4
  "mixed_content": "\n# Handle Thai-English mixed content\nmixed_text = \"Hello สวัสดี COVID-19 ระบาด\"\nencoding = tokenizer.encode(mixed_text)\n\n# Manual decoding preserves mixed content\ndecoded = \"\".join(token for token in encoding.tokens \n if not (token.startswith('<') and token.endswith('>')))\n\nprint(f\"Mixed: {mixed_text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n"
5
  }