keenanpepper committed
Commit 1724e2d · verified · 1 Parent(s): db43988

Upload folder using huggingface_hub

Files changed (4):
  1. README.md +21 -3
  2. special_tokens_map.json +6 -0
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +6 -2
README.md CHANGED
@@ -36,18 +36,36 @@ Trained on bilingual Japanese-English story data with masked loss on Japanese pr
  ## Usage

  ```python
- from transformers import LlamaForCausalLM, LlamaTokenizer
+ from transformers import LlamaForCausalLM, AutoTokenizer

  model = LlamaForCausalLM.from_pretrained("one-way-polyglot-8m-tied")
- tokenizer = LlamaTokenizer.from_pretrained("one-way-polyglot-8m-tied")
+ tokenizer = AutoTokenizer.from_pretrained("one-way-polyglot-8m-tied")

- # Japanese input, English output
+ # Japanese input → English output (primary use case)
  prompt = "昔々、赤い傘を持った少女がいました。"
  inputs = tokenizer(prompt, return_tensors="pt")
  outputs = model.generate(**inputs, max_new_tokens=50, temperature=0.7)
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+
+ # Mixed-language name transliteration
+ prompt = "太郎は公園で花子と遊んでいました。After playing, Taro told Hanako that"
+ inputs = tokenizer(prompt, return_tensors="pt")
+ outputs = model.generate(**inputs, max_new_tokens=30, temperature=0.7)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+
+ # English text (works perfectly with case folding)
+ prompt = "Hello World" # Automatically normalized to lowercase
+ inputs = tokenizer(prompt, return_tensors="pt")
+ outputs = model.generate(**inputs, max_new_tokens=30, temperature=0.7)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  ```

+ ### Tokenizer Features
+ - **✅ Case Folding**: "Hello", "hello", and "HELLO" produce identical tokenization
+ - **✅ Japanese Support**: Full Japanese text support with proper normalization
+ - **✅ No UNK Tokens**: Proper handling of uppercase/lowercase English text
+ - **✅ SentencePiece Compatibility**: Built using proper Unigram model with normalization
+
  ## Model Variants

  This is part of a series exploring one-way polyglot capabilities:
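
The case-folding and no-UNK claims in the new "Tokenizer Features" list can be sanity-checked with a short snippet. This is only a sketch, assuming the updated tokenizer loads from the same "one-way-polyglot-8m-tied" path used in the README example:

```python
from transformers import AutoTokenizer

# Sketch: check that differently-cased English inputs tokenize identically
# and contain no [UNK] ids. Assumes "one-way-polyglot-8m-tied" is available
# locally or on the Hub, as in the README usage example.
tokenizer = AutoTokenizer.from_pretrained("one-way-polyglot-8m-tied")

variants = ["hello world", "Hello World", "HELLO WORLD"]
encodings = [tokenizer(v)["input_ids"] for v in variants]

# Case folding: all three encodings should be identical.
assert all(ids == encodings[0] for ids in encodings)

# No UNK tokens: the unknown-token id should not appear.
assert tokenizer.unk_token_id not in encodings[0]

print(encodings[0])
```
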
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "bos_token": "[BOS]",
+ "eos_token": "[EOS]",
+ "unk_token": "[UNK]",
+ "pad_token": "[PAD]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,12 +1,16 @@
  {
- "tokenizer_class": "LlamaTokenizer",
+ "tokenizer_class": "PreTrainedTokenizerFast",
  "vocab_size": 16384,
  "model_max_length": 512,
  "pad_token": "[PAD]",
  "bos_token": "[BOS]",
  "eos_token": "[EOS]",
  "unk_token": "[UNK]",
- "add_bos_token": true,
+ "pad_token_id": 3,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "unk_token_id": 0,
+ "add_bos_token": false,
  "add_eos_token": false,
  "clean_up_tokenization_spaces": false
  }
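
With "add_bos_token" now false and explicit special-token ids declared, the fast tokenizer is not expected to insert [BOS]/[EOS] markers during encoding. A minimal sketch of how this could be checked (same "one-way-polyglot-8m-tied" repo-id assumption as above):

```python
from transformers import AutoTokenizer

# Sketch: inspect the special-token ids declared in tokenizer_config.json
# ([UNK]=0, [BOS]=1, [EOS]=2, [PAD]=3) and the effect of add_bos_token=false.
tokenizer = AutoTokenizer.from_pretrained("one-way-polyglot-8m-tied")

print(tokenizer.unk_token_id, tokenizer.bos_token_id,
      tokenizer.eos_token_id, tokenizer.pad_token_id)  # expected: 0 1 2 3

ids = tokenizer("hello world")["input_ids"]
print(ids)  # should contain no [BOS]/[EOS] ids if no markers are added

# If a sequence-start marker is wanted, prepend it manually.
ids_with_bos = [tokenizer.bos_token_id] + ids
print(tokenizer.decode(ids_with_bos))
```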