Upload folder using huggingface_hub
Browse files- usage_examples.json +1 -1
usage_examples.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"basic_usage": "\nfrom tokenizers import Tokenizer\n\n# Load tokenizer\ntokenizer = Tokenizer.from_file(\"
|
3 |
"batch_processing": "\n# Process multiple Thai sentences\nsentences = [\n \"กินข้าวยัง\",\n \"ไปไหนมา\", \n \"สบายดีไหม\"\n]\n\nfor sentence in sentences:\n encoding = tokenizer.encode(sentence)\n # Use manual decoding for best results\n decoded = \"\".join(token for token in encoding.tokens \n if not (token.startswith('<') and token.endswith('>')))\n print(f\"{sentence} -> {decoded}\")\n",
|
4 |
"mixed_content": "\n# Handle Thai-English mixed content\nmixed_text = \"Hello สวัสดี COVID-19 ระบาด\"\nencoding = tokenizer.encode(mixed_text)\n\n# Manual decoding preserves mixed content\ndecoded = \"\".join(token for token in encoding.tokens \n if not (token.startswith('<') and token.endswith('>')))\n\nprint(f\"Mixed: {mixed_text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n"
|
5 |
}
|
|
|
1 |
{
|
2 |
+
"basic_usage": "\nfrom tokenizers import Tokenizer\n\n# Load tokenizer\ntokenizer = Tokenizer.from_file(\"AdvancedThaiTokenizerV3/tokenizer.json\")\n\n# Encode Thai text\ntext = \"สวัสดีครับ วันนี้อากาศดีมาก\"\nencoding = tokenizer.encode(text)\n\n# Best decoding method for Thai\ndecoded = \"\"\nfor token in encoding.tokens:\n if not (token.startswith('<') and token.endswith('>')):\n decoded += token\n\nprint(f\"Original: {text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n",
|
3 |
"batch_processing": "\n# Process multiple Thai sentences\nsentences = [\n \"กินข้าวยัง\",\n \"ไปไหนมา\", \n \"สบายดีไหม\"\n]\n\nfor sentence in sentences:\n encoding = tokenizer.encode(sentence)\n # Use manual decoding for best results\n decoded = \"\".join(token for token in encoding.tokens \n if not (token.startswith('<') and token.endswith('>')))\n print(f\"{sentence} -> {decoded}\")\n",
|
4 |
"mixed_content": "\n# Handle Thai-English mixed content\nmixed_text = \"Hello สวัสดี COVID-19 ระบาด\"\nencoding = tokenizer.encode(mixed_text)\n\n# Manual decoding preserves mixed content\ndecoded = \"\".join(token for token in encoding.tokens \n if not (token.startswith('<') and token.endswith('>')))\n\nprint(f\"Mixed: {mixed_text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n"
|
5 |
}
|