Training in progress, step 60
Browse files
- .ipynb_checkpoints/eval-checkpoint.py +20 -2
- eval.py +20 -2
- pytorch_model.bin +1 -1
.ipynb_checkpoints/eval-checkpoint.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
import argparse
|
| 3 |
import re
|
|
|
|
| 4 |
from typing import Dict
|
| 5 |
|
| 6 |
import torch
|
|
@@ -50,7 +51,15 @@ def log_results(result: Dataset, args: Dict[str, str]):
|
|
| 50 |
def normalize_text(text: str) -> str:
|
| 51 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
| 52 |
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
text = re.sub(chars_to_ignore_regex, "", text.lower())
|
| 56 |
|
|
@@ -59,7 +68,16 @@ def normalize_text(text: str) -> str:
|
|
| 59 |
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
| 60 |
|
| 61 |
for t in token_sequences_to_ignore:
|
| 62 |
-
text = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
return text
|
| 65 |
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
import argparse
|
| 3 |
import re
|
| 4 |
+
import string
|
| 5 |
from typing import Dict
|
| 6 |
|
| 7 |
import torch
|
|
|
|
| 51 |
def normalize_text(text: str) -> str:
|
| 52 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
| 53 |
|
| 54 |
+
chars_to_ignore = [
|
| 55 |
+
",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
|
| 56 |
+
"؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
|
| 57 |
+
"{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
|
| 58 |
+
"、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
|
| 59 |
+
"『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"
|
| 60 |
+
] # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
| 61 |
+
|
| 62 |
+
chars_to_ignore_regex = f'[{"".join(chars_to_ignore)}]'
|
| 63 |
|
| 64 |
text = re.sub(chars_to_ignore_regex, "", text.lower())
|
| 65 |
|
|
|
|
| 68 |
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
| 69 |
|
| 70 |
for t in token_sequences_to_ignore:
|
| 71 |
+
text = "".join(text.split(t))
|
| 72 |
+
|
| 73 |
+
# convert 'D' and 'd' to '啲' if there is a 'D' in the sentence
|
| 74 |
+
# hacky stuff: won't work when 'D' or 'd' co-occurs with normal English words
|
| 75 |
+
# won't work on multiple 'D'
|
| 76 |
+
if "d" in text:
|
| 77 |
+
if len([c for c in text if c in string.ascii_lowercase]) == 1:
|
| 78 |
+
text = text.replace("d", "啲")
|
| 79 |
+
|
| 80 |
+
text += ' '
|
| 81 |
|
| 82 |
return text
|
| 83 |
|
eval.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
import argparse
|
| 3 |
import re
|
|
|
|
| 4 |
from typing import Dict
|
| 5 |
|
| 6 |
import torch
|
|
@@ -50,7 +51,15 @@ def log_results(result: Dataset, args: Dict[str, str]):
|
|
| 50 |
def normalize_text(text: str) -> str:
|
| 51 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
| 52 |
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
text = re.sub(chars_to_ignore_regex, "", text.lower())
|
| 56 |
|
|
@@ -59,7 +68,16 @@ def normalize_text(text: str) -> str:
|
|
| 59 |
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
| 60 |
|
| 61 |
for t in token_sequences_to_ignore:
|
| 62 |
-
text = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
return text
|
| 65 |
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
import argparse
|
| 3 |
import re
|
| 4 |
+
import string
|
| 5 |
from typing import Dict
|
| 6 |
|
| 7 |
import torch
|
|
|
|
| 51 |
def normalize_text(text: str) -> str:
|
| 52 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
| 53 |
|
| 54 |
+
chars_to_ignore = [
|
| 55 |
+
",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
|
| 56 |
+
"؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
|
| 57 |
+
"{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
|
| 58 |
+
"、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
|
| 59 |
+
"『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"
|
| 60 |
+
] # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
| 61 |
+
|
| 62 |
+
chars_to_ignore_regex = f'[{"".join(chars_to_ignore)}]'
|
| 63 |
|
| 64 |
text = re.sub(chars_to_ignore_regex, "", text.lower())
|
| 65 |
|
|
|
|
| 68 |
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
| 69 |
|
| 70 |
for t in token_sequences_to_ignore:
|
| 71 |
+
text = "".join(text.split(t))
|
| 72 |
+
|
| 73 |
+
# convert 'D' and 'd' to '啲' if there is a 'D' in the sentence
|
| 74 |
+
# hacky stuff: won't work when 'D' or 'd' co-occurs with normal English words
|
| 75 |
+
# won't work on multiple 'D'
|
| 76 |
+
if "d" in text:
|
| 77 |
+
if len([c for c in text if c in string.ascii_lowercase]) == 1:
|
| 78 |
+
text = text.replace("d", "啲")
|
| 79 |
+
|
| 80 |
+
text += ' '
|
| 81 |
|
| 82 |
return text
|
| 83 |
|
pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1278024433
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2cf9654583b75dea424b875769f5c205aadeff4ef6f019a7717d32a2d023c8d6
|
| 3 |
size 1278024433
|