Tom Aarsen
committed on
Commit
·
f9c9b72
1
Parent(s):
6e99faa
Reflect that JW300 was removed
Browse files
train.py
CHANGED
|
@@ -66,13 +66,6 @@ def main():
|
|
| 66 |
global_voices_eval_dataset: Dataset = global_voices_dataset_dict["test"]
|
| 67 |
print("Loaded global voices dataset.")
|
| 68 |
|
| 69 |
-
print("Loading jw300 dataset...")
|
| 70 |
-
jw300_dataset = load_dataset("sentence-transformers/parallel-sentences-jw300", "all", split="train")
|
| 71 |
-
jw300_dataset_dict = jw300_dataset.train_test_split(test_size=10_000, seed=12)
|
| 72 |
-
jw300_train_dataset: Dataset = jw300_dataset_dict["train"]
|
| 73 |
-
jw300_eval_dataset: Dataset = jw300_dataset_dict["test"]
|
| 74 |
-
print("Loaded jw300 dataset.")
|
| 75 |
-
|
| 76 |
print("Loading muse dataset...")
|
| 77 |
muse_dataset = load_dataset("sentence-transformers/parallel-sentences-muse", split="train")
|
| 78 |
muse_dataset_dict = muse_dataset.train_test_split(test_size=10_000, seed=12)
|
|
@@ -168,7 +161,6 @@ def main():
|
|
| 168 |
"talks": talks_train_dataset,
|
| 169 |
"europarl": europarl_train_dataset,
|
| 170 |
"global_voices": global_voices_train_dataset,
|
| 171 |
-
"jw300": jw300_train_dataset,
|
| 172 |
"muse": muse_train_dataset,
|
| 173 |
"wikimatrix": wikimatrix_train_dataset,
|
| 174 |
"opensubtitles": opensubtitles_train_dataset,
|
|
@@ -189,7 +181,6 @@ def main():
|
|
| 189 |
"talks": talks_eval_dataset,
|
| 190 |
"europarl": europarl_eval_dataset,
|
| 191 |
"global_voices": global_voices_eval_dataset,
|
| 192 |
-
"jw300": jw300_eval_dataset,
|
| 193 |
"muse": muse_eval_dataset,
|
| 194 |
"wikimatrix": wikimatrix_eval_dataset,
|
| 195 |
"opensubtitles": opensubtitles_eval_dataset,
|
|
|
|
| 66 |
global_voices_eval_dataset: Dataset = global_voices_dataset_dict["test"]
|
| 67 |
print("Loaded global voices dataset.")
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
print("Loading muse dataset...")
|
| 70 |
muse_dataset = load_dataset("sentence-transformers/parallel-sentences-muse", split="train")
|
| 71 |
muse_dataset_dict = muse_dataset.train_test_split(test_size=10_000, seed=12)
|
|
|
|
| 161 |
"talks": talks_train_dataset,
|
| 162 |
"europarl": europarl_train_dataset,
|
| 163 |
"global_voices": global_voices_train_dataset,
|
|
|
|
| 164 |
"muse": muse_train_dataset,
|
| 165 |
"wikimatrix": wikimatrix_train_dataset,
|
| 166 |
"opensubtitles": opensubtitles_train_dataset,
|
|
|
|
| 181 |
"talks": talks_eval_dataset,
|
| 182 |
"europarl": europarl_eval_dataset,
|
| 183 |
"global_voices": global_voices_eval_dataset,
|
|
|
|
| 184 |
"muse": muse_eval_dataset,
|
| 185 |
"wikimatrix": wikimatrix_eval_dataset,
|
| 186 |
"opensubtitles": opensubtitles_eval_dataset,
|