adjust tokenizer config
Browse files- tokenization_aria.py +13 -15
- tokenizer_config.json +2 -2
tokenization_aria.py
CHANGED
@@ -32,16 +32,14 @@ class AriaTokenizer(PreTrainedTokenizer):
|
|
32 |
|
33 |
def __init__(
|
34 |
self,
|
35 |
-
add_bos_token=True,
|
36 |
add_eos_token=True,
|
37 |
-
add_dim_token=True,
|
38 |
clean_up_tokenization_spaces=False,
|
39 |
use_default_system_prompt=False,
|
40 |
**kwargs,
|
41 |
):
|
42 |
self._tokenizer = AbsTokenizer()
|
43 |
|
44 |
-
self.add_bos_token = add_bos_token
|
45 |
self.add_eos_token = add_eos_token
|
46 |
self.add_dim_token = add_dim_token
|
47 |
self.use_default_system_prompt = use_default_system_prompt
|
@@ -77,35 +75,35 @@ class AriaTokenizer(PreTrainedTokenizer):
|
|
77 |
def tokenize(
|
78 |
self,
|
79 |
midi_dict: MidiDict,
|
80 |
-
add_dim_tok: Optional[bool] = None,
|
81 |
-
add_eos_tok: Optional[bool] = None,
|
82 |
**kwargs,
|
83 |
) -> List[Token]:
|
84 |
return self._tokenizer.tokenize(
|
85 |
midi_dict=midi_dict,
|
86 |
add_dim_tok=(
|
87 |
-
add_dim_tok if add_dim_tok is not None else self.add_dim_token
|
88 |
),
|
89 |
add_eos_tok=(
|
90 |
-
add_eos_tok if add_eos_tok is not None else self.add_eos_token
|
91 |
),
|
92 |
)
|
93 |
|
94 |
def _tokenize(
|
95 |
self,
|
96 |
midi_dict: MidiDict,
|
97 |
-
add_dim_tok: Optional[bool] = None,
|
98 |
-
add_eos_tok: Optional[bool] = None,
|
99 |
**kwargs,
|
100 |
) -> List[Token]:
|
101 |
return self._tokenizer.tokenize(
|
102 |
midi_dict=midi_dict,
|
103 |
-
add_dim_tok=(
|
104 |
-
add_dim_tok if add_dim_tok is not None else self.add_dim_token
|
105 |
-
),
|
106 |
-
add_eos_tok=(
|
107 |
-
add_eos_tok if add_eos_tok is not None else self.add_eos_token
|
108 |
-
),
|
109 |
)
|
110 |
|
111 |
def __call__(
|
|
|
32 |
|
33 |
def __init__(
|
34 |
self,
|
|
|
35 |
add_eos_token=True,
|
36 |
+
add_dim_token=False,
|
37 |
clean_up_tokenization_spaces=False,
|
38 |
use_default_system_prompt=False,
|
39 |
**kwargs,
|
40 |
):
|
41 |
self._tokenizer = AbsTokenizer()
|
42 |
|
|
|
43 |
self.add_eos_token = add_eos_token
|
44 |
self.add_dim_token = add_dim_token
|
45 |
self.use_default_system_prompt = use_default_system_prompt
|
|
|
75 |
def tokenize(
|
76 |
self,
|
77 |
midi_dict: MidiDict,
|
78 |
+
add_dim_token: Optional[bool] = None,
|
79 |
+
add_eos_token: Optional[bool] = None,
|
80 |
**kwargs,
|
81 |
) -> List[Token]:
|
82 |
return self._tokenizer.tokenize(
|
83 |
midi_dict=midi_dict,
|
84 |
add_dim_tok=(
|
85 |
+
add_dim_token
|
86 |
+
if add_dim_token is not None
|
87 |
+
else self.add_dim_token
|
88 |
),
|
89 |
add_eos_tok=(
|
90 |
+
add_eos_token
|
91 |
+
if add_eos_token is not None
|
92 |
+
else self.add_eos_token
|
93 |
),
|
94 |
)
|
95 |
|
96 |
def _tokenize(
|
97 |
self,
|
98 |
midi_dict: MidiDict,
|
99 |
+
add_dim_token: Optional[bool] = None,
|
100 |
+
add_eos_token: Optional[bool] = None,
|
101 |
**kwargs,
|
102 |
) -> List[Token]:
|
103 |
return self._tokenizer.tokenize(
|
104 |
midi_dict=midi_dict,
|
105 |
+
add_dim_tok=add_dim_token,
|
106 |
+
add_eos_tok=add_eos_token,
|
|
|
|
|
|
|
|
|
107 |
)
|
108 |
|
109 |
def __call__(
|
tokenizer_config.json
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
{
|
2 |
-
"add_bos_token": true,
|
3 |
-
"add_eos_token": true,
|
4 |
"auto_map": {
|
5 |
"AutoTokenizer": [
|
6 |
"tokenization_aria.AriaTokenizer",
|
|
|
1 |
{
|
2 |
+
"add_eos_token": true,
|
3 |
+
"add_dim_token": false,
|
4 |
"auto_map": {
|
5 |
"AutoTokenizer": [
|
6 |
"tokenization_aria.AriaTokenizer",
|