JonusNattapong committed on
Commit
7784060
·
verified ·
1 Parent(s): 1630c96

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. usage_examples.json +1 -1
usage_examples.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "basic_usage": "\nfrom tokenizers import Tokenizer\n\n# Load tokenizer\ntokenizer = Tokenizer.from_file(\"AdvancedThaiTokenizerV2/tokenizer.json\")\n\n# Encode Thai text\ntext = \"สวัสดีครับ วันนี้อากาศดีมาก\"\nencoding = tokenizer.encode(text)\n\n# Best decoding method for Thai\ndecoded = \"\"\nfor token in encoding.tokens:\n if not (token.startswith('<') and token.endswith('>')):\n decoded += token\n\nprint(f\"Original: {text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n",
3
  "batch_processing": "\n# Process multiple Thai sentences\nsentences = [\n \"กินข้าวยัง\",\n \"ไปไหนมา\", \n \"สบายดีไหม\"\n]\n\nfor sentence in sentences:\n encoding = tokenizer.encode(sentence)\n # Use manual decoding for best results\n decoded = \"\".join(token for token in encoding.tokens \n if not (token.startswith('<') and token.endswith('>')))\n print(f\"{sentence} -> {decoded}\")\n",
4
  "mixed_content": "\n# Handle Thai-English mixed content\nmixed_text = \"Hello สวัสดี COVID-19 ระบาด\"\nencoding = tokenizer.encode(mixed_text)\n\n# Manual decoding preserves mixed content\ndecoded = \"\".join(token for token in encoding.tokens \n if not (token.startswith('<') and token.endswith('>')))\n\nprint(f\"Mixed: {mixed_text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n"
5
  }
 
1
  {
2
+ "basic_usage": "\nfrom tokenizers import Tokenizer\n\n# Load tokenizer\ntokenizer = Tokenizer.from_file(\"AdvancedThaiTokenizerV3/tokenizer.json\")\n\n# Encode Thai text\ntext = \"สวัสดีครับ วันนี้อากาศดีมาก\"\nencoding = tokenizer.encode(text)\n\n# Best decoding method for Thai\ndecoded = \"\"\nfor token in encoding.tokens:\n if not (token.startswith('<') and token.endswith('>')):\n decoded += token\n\nprint(f\"Original: {text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n",
3
  "batch_processing": "\n# Process multiple Thai sentences\nsentences = [\n \"กินข้าวยัง\",\n \"ไปไหนมา\", \n \"สบายดีไหม\"\n]\n\nfor sentence in sentences:\n encoding = tokenizer.encode(sentence)\n # Use manual decoding for best results\n decoded = \"\".join(token for token in encoding.tokens \n if not (token.startswith('<') and token.endswith('>')))\n print(f\"{sentence} -> {decoded}\")\n",
4
  "mixed_content": "\n# Handle Thai-English mixed content\nmixed_text = \"Hello สวัสดี COVID-19 ระบาด\"\nencoding = tokenizer.encode(mixed_text)\n\n# Manual decoding preserves mixed content\ndecoded = \"\".join(token for token in encoding.tokens \n if not (token.startswith('<') and token.endswith('>')))\n\nprint(f\"Mixed: {mixed_text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n"
5
  }