JonusNattapong committed
Commit 55f2143 (verified)
Parent: 27c05b7

Upload folder using huggingface_hub

Files changed (5):
1. README.md +38 -0
2. metadata.json +485 -0
3. tokenizer.json +0 -0
4. usage_examples.json +5 -0
5. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,38 @@
# Advanced Thai Tokenizer V3

## Overview
An advanced Thai tokenizer with improved handling of Thai text, mixed Thai-English content, and modern vocabulary.

## Performance
- Overall Accuracy: 24/24 test cases (100.0%)
- Vocabulary Size: 35,590 tokens
- Average Compression: 3.45 chars/token

## Key Features
- ✅ No Thai character corruption
- ✅ Handles mixed Thai-English content
- ✅ Modern vocabulary (internet and technology terms)
- ✅ Efficient compression
- ✅ Clean decoding without artifacts

## Quick Start
```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")
text = "สวัสดีครับ วันนี้อากาศดีมาก"
encoding = tokenizer.encode(text)

# Best decoding method: join raw tokens, skipping special tokens (anything wrapped in <...>)
decoded = "".join(
    token for token in encoding.tokens
    if not (token.startswith("<") and token.endswith(">"))
)
```
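
The Average Compression figure above is simply characters per token. As a quick check, here is a minimal sketch that reuses `tokenizer`, `text`, and `encoding` from the snippet above (the exact ratio varies with the input):

```python
# Characters per token for this input; the 3.45 figure is an average over the test set
ratio = len(text) / len(encoding.tokens)
print(f"Tokens:    {encoding.tokens}")
print(f"Token IDs: {encoding.ids}")
print(f"Compression: {ratio:.2f} chars/token")
```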

## Files
- `tokenizer.json` - Main tokenizer file
- `vocab.json` - Vocabulary mapping
- `metadata.json` - Performance and configuration details
- `usage_examples.json` - Code examples
- `README.md` - This file

Created: July 2025
metadata.json ADDED
@@ -0,0 +1,485 @@
{
  "model_info": {
    "version": "2.0",
    "model_type": "unigram",
    "vocab_size": 35590,
    "creation_date": "2025-07-02",
    "language": "thai",
    "description": "Advanced Thai tokenizer with improved handling of Thai text, mixed content, and modern vocabulary"
  },
  "performance": {
    "test_results": {
      "overall": { "passed": 24, "total": 24 },
      "categories": {
        "basic_thai": {
          "passed": 4,
          "total": 4,
          "details": [
            { "input": "สวัสดี", "tokens": ["สวัสด", "ี"], "token_count": 2, "decoded": "สวัสดี", "success": true },
            { "input": "ขอบคุณ", "tokens": ["ขอบ", "คุณ"], "token_count": 2, "decoded": "ขอบคุณ", "success": true },
            { "input": "ครับ", "tokens": ["ครับ"], "token_count": 1, "decoded": "ครับ", "success": true },
            { "input": "ค่ะ", "tokens": ["ค่ะ"], "token_count": 1, "decoded": "ค่ะ", "success": true }
          ]
        },
        "thai_with_spaces": {
          "passed": 3,
          "total": 3,
          "details": [
            { "input": "กิน ข้าว อร่อย", "tokens": ["กิน", " ", "ข้าว", " ", "อ", "ร่อย"], "token_count": 6, "decoded": "กิน ข้าว อร่อย", "success": true },
            { "input": "วันนี้ อากาศ ดี", "tokens": ["วัน", "นี้", " ", "อากาศ", " ", "ด", "ี"], "token_count": 7, "decoded": "วันนี้ อากาศ ดี", "success": true },
            { "input": "ผม ชื่อ จอห์น", "tokens": ["ผ", "ม", " ", "ชื่อ", " ", "จอห์น"], "token_count": 6, "decoded": "ผม ชื่อ จอห์น", "success": true }
          ]
        },
        "mixed_content": {
          "passed": 3,
          "total": 3,
          "details": [
            { "input": "123 สวัสดี abc", "tokens": ["1", "2", "3", " ", "สวัสด", "ี", " ", "abc"], "token_count": 8, "decoded": "123 สวัสดี abc", "success": true },
            { "input": "Hello ครับ", "tokens": ["Hello", " ", "ครับ"], "token_count": 3, "decoded": "Hello ครับ", "success": true },
            { "input": "COVID-19 ระบาด", "tokens": ["COVID", "-", "1", "9", " ", "ระบาด"], "token_count": 6, "decoded": "COVID-19 ระบาด", "success": true }
          ]
        },
        "formal_thai": {
          "passed": 2,
          "total": 2,
          "details": [
            { "input": "พระบาทสมเด็จพระเจ้าอยู่หัว", "tokens": ["พระบาทสมเด็จพระ", "เจ้าอยู่หัว"], "token_count": 2, "decoded": "พระบาทสมเด็จพระเจ้าอยู่หัว", "success": true },
            { "input": "การประชุมสำคัญ", "tokens": ["การประชุม", "สำคัญ"], "token_count": 2, "decoded": "การประชุมสำคัญ", "success": true }
          ]
        },
        "casual_thai": {
          "passed": 3,
          "total": 3,
          "details": [
            { "input": "อร่อยจัง", "tokens": ["อ", "ร่อย", "จัง"], "token_count": 3, "decoded": "อร่อยจัง", "success": true },
            { "input": "แพงมาก", "tokens": ["แพง", "มาก"], "token_count": 2, "decoded": "แพงมาก", "success": true },
            { "input": "ถูกมาก", "tokens": ["ถูก", "มาก"], "token_count": 2, "decoded": "ถูกมาก", "success": true }
          ]
        },
        "complex_thai": {
          "passed": 3,
          "total": 3,
          "details": [
            { "input": "กรุงเทพมหานคร", "tokens": ["กรุงเทพ", "มหา", "นคร"], "token_count": 3, "decoded": "กรุงเทพมหานคร", "success": true },
            { "input": "ราชมงคลธัญบุรี", "tokens": ["ราช", "มงคล", "ธัญ", "บุรี"], "token_count": 4, "decoded": "ราชมงคลธัญบุรี", "success": true },
            { "input": "จุฬาลงกรณ์มหาวิทยาลัย", "tokens": ["จุฬาล", "ง", "กรณ์", "มหาวิทยาลัย"], "token_count": 4, "decoded": "จุฬาลงกรณ์มหาวิทยาลัย", "success": true }
          ]
        },
        "numbers_dates": {
          "passed": 3,
          "total": 3,
          "details": [
            { "input": "1 มกราคม 2567", "tokens": ["1", " ", "มกรา", "ค", "ม", " ", "2", "567"], "token_count": 8, "decoded": "1 มกราคม 2567", "success": true },
            { "input": "เวลา 14:30 น.", "tokens": ["เวลา", " ", "1", "4", ":", "30", " ", "น", "."], "token_count": 9, "decoded": "เวลา 14:30 น.", "success": true },
            { "input": "ราคา 1,234 บาท", "tokens": ["ราคา", " ", "1", ",", "2", "34", " ", "บาท"], "token_count": 8, "decoded": "ราคา 1,234 บาท", "success": true }
          ]
        },
        "technology": {
          "passed": 3,
          "total": 3,
          "details": [
            { "input": "อินเทอร์เน็ต", "tokens": ["อินเทอร์เน็ต"], "token_count": 1, "decoded": "อินเทอร์เน็ต", "success": true },
            { "input": "โทรศัพท์มือถือ", "tokens": ["โทรศัพท์", "มือถือ"], "token_count": 2, "decoded": "โทรศัพท์มือถือ", "success": true },
            { "input": "แอปพลิเคชัน", "tokens": ["แอปพลิเคชั", "น"], "token_count": 2, "decoded": "แอปพลิเคชัน", "success": true }
          ]
        }
      }
    },
    "efficiency": {
      "compression_ratios": [3.0, 2.75, 2.6470588235294117, 6.7, 2.1666666666666665],
      "avg_tokens_per_char": 0.30726256983240224,
      "vocab_coverage": 0.0010958134307389716,
      "details": [
        { "sentence": "สวัสดี", "char_count": 6, "token_count": 2, "compression_ratio": 3.0, "tokens": ["สวัสด", "ี"] },
        { "sentence": "สวัสดีครับ ผมชื่อจอห์น", "char_count": 22, "token_count": 8, "compression_ratio": 2.75, "tokens": ["สวัสด", "ี", "ครับ", " ", "ผ", "ม", "ชื่อ", "จอห์น"] },
        { "sentence": "วันนี้อากาศดีมาก ผมจึงไปเดินเล่นที่สวนสาธารณะ", "char_count": 45, "token_count": 17, "compression_ratio": 2.6470588235294117, "tokens": ["วัน", "นี้", "อากาศ", "ด", "ี", "มาก", " ", "ผ", "ม", "จึง", "ไป", "เดิน", "เล่น", "ที่", "สวน", "สาธารณ", "ะ"] },
        { "sentence": "พระบาทสมเด็จพระเจ้าอยู่หัวทรงพระกรุณาโปรดเกล้าฯ ให้จัดงานพระราชพิธี", "char_count": 67, "token_count": 10, "compression_ratio": 6.7, "tokens": ["พระบาทสมเด็จพระ", "เจ้าอยู่หัว", "ทรง", "พระกรุณา", "โปรดเกล้า", "ฯ ", "ให้", "จัด", "งาน", "พระราชพิธี"] },
        { "sentence": "555 อร่อยมากกก กินข้าวยัง? #อาหารไทย 🇹🇭", "char_count": 39, "token_count": 18, "compression_ratio": 2.1666666666666665, "tokens": ["555", " ", "อ", "ร่อย", "มาก", "ก", "ก", " ", "กิน", "ข้าว", "ยัง", "?", " ", "#", "อาหาร", "ไทย", " ", "🇹🇭"] }
      ]
    },
    "overall_accuracy": "24/24"
  },
  "features": [
    "No normalization (preserves Thai characters)",
    "Smart punctuation handling",
    "Mixed Thai-English support",
    "Modern vocabulary coverage",
    "Efficient compression",
    "Direct decoding without artifacts"
  ],
  "usage_notes": {
    "best_decoding": "manual concatenation of non-special tokens",
    "recommended_for": ["Thai NLP", "LLM training", "Text processing", "Social media analysis"],
    "avoid": ["Text normalization", "Byte-level fallback", "Aggressive post-processing"]
  }
}
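
Each `compression_ratio` in the efficiency block is simply `char_count / token_count`. As a sanity check, a minimal sketch (it assumes a local copy of `metadata.json` in the working directory):

```python
import json

with open("metadata.json", encoding="utf-8") as f:
    meta = json.load(f)

# Recompute each ratio from the stored counts and compare with the recorded value
for d in meta["performance"]["efficiency"]["details"]:
    ratio = d["char_count"] / d["token_count"]
    assert abs(ratio - d["compression_ratio"]) < 1e-9
    print(f"{d['char_count']:3d} chars / {d['token_count']:2d} tokens = {ratio:.2f}")
```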
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
usage_examples.json ADDED
@@ -0,0 +1,5 @@
{
  "basic_usage": "\nfrom tokenizers import Tokenizer\n\n# Load tokenizer\ntokenizer = Tokenizer.from_file(\"AdvancedThaiTokenizerV2/tokenizer.json\")\n\n# Encode Thai text\ntext = \"สวัสดีครับ วันนี้อากาศดีมาก\"\nencoding = tokenizer.encode(text)\n\n# Best decoding method for Thai\ndecoded = \"\"\nfor token in encoding.tokens:\n    if not (token.startswith('<') and token.endswith('>')):\n        decoded += token\n\nprint(f\"Original: {text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n",
  "batch_processing": "\n# Process multiple Thai sentences\nsentences = [\n    \"กินข้าวยัง\",\n    \"ไปไหนมา\",\n    \"สบายดีไหม\"\n]\n\nfor sentence in sentences:\n    encoding = tokenizer.encode(sentence)\n    # Use manual decoding for best results\n    decoded = \"\".join(token for token in encoding.tokens\n                      if not (token.startswith('<') and token.endswith('>')))\n    print(f\"{sentence} -> {decoded}\")\n",
  "mixed_content": "\n# Handle Thai-English mixed content\nmixed_text = \"Hello สวัสดี COVID-19 ระบาด\"\nencoding = tokenizer.encode(mixed_text)\n\n# Manual decoding preserves mixed content\ndecoded = \"\".join(token for token in encoding.tokens\n                  if not (token.startswith('<') and token.endswith('>')))\n\nprint(f\"Mixed: {mixed_text}\")\nprint(f\"Tokens: {encoding.tokens}\")\nprint(f\"Decoded: {decoded}\")\n"
}
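
Because `usage_examples.json` stores each example as a plain string of Python source, you can list the snippets straight from the file. A minimal sketch (assumes the file is in the working directory):

```python
import json

with open("usage_examples.json", encoding="utf-8") as f:
    examples = json.load(f)

# Print each named snippet so it can be copied into a script or REPL
for name, code in examples.items():
    print(f"### {name}")
    print(code.strip(), end="\n\n")
```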
vocab.json ADDED
The diff for this file is too large to render. See raw diff