loua19 committed on
Commit
af68eea
·
1 Parent(s): d7346e7

adjust tokenizer config

Browse files
Files changed (2) hide show
  1. tokenization_aria.py +13 -15
  2. tokenizer_config.json +2 -2
tokenization_aria.py CHANGED
@@ -32,16 +32,14 @@ class AriaTokenizer(PreTrainedTokenizer):
32
 
33
  def __init__(
34
  self,
35
- add_bos_token=True,
36
  add_eos_token=True,
37
- add_dim_token=True,
38
  clean_up_tokenization_spaces=False,
39
  use_default_system_prompt=False,
40
  **kwargs,
41
  ):
42
  self._tokenizer = AbsTokenizer()
43
 
44
- self.add_bos_token = add_bos_token
45
  self.add_eos_token = add_eos_token
46
  self.add_dim_token = add_dim_token
47
  self.use_default_system_prompt = use_default_system_prompt
@@ -77,35 +75,35 @@ class AriaTokenizer(PreTrainedTokenizer):
77
  def tokenize(
78
  self,
79
  midi_dict: MidiDict,
80
- add_dim_tok: Optional[bool] = None,
81
- add_eos_tok: Optional[bool] = None,
82
  **kwargs,
83
  ) -> List[Token]:
84
  return self._tokenizer.tokenize(
85
  midi_dict=midi_dict,
86
  add_dim_tok=(
87
- add_dim_tok if add_dim_tok is not None else self.add_dim_token
 
 
88
  ),
89
  add_eos_tok=(
90
- add_eos_tok if add_eos_tok is not None else self.add_eos_token
 
 
91
  ),
92
  )
93
 
94
  def _tokenize(
95
  self,
96
  midi_dict: MidiDict,
97
- add_dim_tok: Optional[bool] = None,
98
- add_eos_tok: Optional[bool] = None,
99
  **kwargs,
100
  ) -> List[Token]:
101
  return self._tokenizer.tokenize(
102
  midi_dict=midi_dict,
103
- add_dim_tok=(
104
- add_dim_tok if add_dim_tok is not None else self.add_dim_token
105
- ),
106
- add_eos_tok=(
107
- add_eos_tok if add_eos_tok is not None else self.add_eos_token
108
- ),
109
  )
110
 
111
  def __call__(
 
32
 
33
  def __init__(
34
  self,
 
35
  add_eos_token=True,
36
+ add_dim_token=False,
37
  clean_up_tokenization_spaces=False,
38
  use_default_system_prompt=False,
39
  **kwargs,
40
  ):
41
  self._tokenizer = AbsTokenizer()
42
 
 
43
  self.add_eos_token = add_eos_token
44
  self.add_dim_token = add_dim_token
45
  self.use_default_system_prompt = use_default_system_prompt
 
75
  def tokenize(
76
  self,
77
  midi_dict: MidiDict,
78
+ add_dim_token: Optional[bool] = None,
79
+ add_eos_token: Optional[bool] = None,
80
  **kwargs,
81
  ) -> List[Token]:
82
  return self._tokenizer.tokenize(
83
  midi_dict=midi_dict,
84
  add_dim_tok=(
85
+ add_dim_token
86
+ if add_dim_token is not None
87
+ else self.add_dim_token
88
  ),
89
  add_eos_tok=(
90
+ add_eos_token
91
+ if add_eos_token is not None
92
+ else self.add_eos_token
93
  ),
94
  )
95
 
96
  def _tokenize(
97
  self,
98
  midi_dict: MidiDict,
99
+ add_dim_token: Optional[bool] = None,
100
+ add_eos_token: Optional[bool] = None,
101
  **kwargs,
102
  ) -> List[Token]:
103
  return self._tokenizer.tokenize(
104
  midi_dict=midi_dict,
105
+ add_dim_tok=add_dim_token,
106
+ add_eos_tok=add_eos_token,
 
 
 
 
107
  )
108
 
109
  def __call__(
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "add_bos_token": false,
3
- "add_eos_token": false,
4
  "auto_map": {
5
  "AutoTokenizer": [
6
  "tokenization_aria.AriaTokenizer",
 
1
  {
2
+ "add_eos_token": true,
3
+ "add_dim_token": false,
4
  "auto_map": {
5
  "AutoTokenizer": [
6
  "tokenization_aria.AriaTokenizer",