Derur committed
Commit 23d6cc4 · verified · 1 Parent(s): 7a67169

Upload 35 files

Files changed (36)
  1. .gitattributes +5 -0
  2. punctuation/vosk-recasepunc-de-0.21.7z +3 -0
  3. punctuation/vosk-recasepunc-de-0.21/README +7 -0
  4. punctuation/vosk-recasepunc-de-0.21/checkpoint +3 -0
  5. punctuation/vosk-recasepunc-de-0.21/de-test.txt +6 -0
  6. punctuation/vosk-recasepunc-de-0.21/de-test.txt.orig +6 -0
  7. punctuation/vosk-recasepunc-de-0.21/example.py +23 -0
  8. punctuation/vosk-recasepunc-de-0.21/recasepunc.py +742 -0
  9. punctuation/vosk-recasepunc-en-0.22.7z +3 -0
  10. punctuation/vosk-recasepunc-en-0.22/README +7 -0
  11. punctuation/vosk-recasepunc-en-0.22/checkpoint +3 -0
  12. punctuation/vosk-recasepunc-en-0.22/example.py +26 -0
  13. punctuation/vosk-recasepunc-en-0.22/recasepunc.py +742 -0
  14. punctuation/vosk-recasepunc-en-0.22/vosk-adapted.txt +17 -0
  15. punctuation/vosk-recasepunc-en-0.22/vosk-adapted.txt.punc +1 -0
  16. punctuation/vosk-recasepunc-ru-0.22.7z +3 -0
  17. punctuation/vosk-recasepunc-ru-0.22/README +7 -0
  18. punctuation/vosk-recasepunc-ru-0.22/checkpoint +3 -0
  19. punctuation/vosk-recasepunc-ru-0.22/example.py +23 -0
  20. punctuation/vosk-recasepunc-ru-0.22/recasepunc.py +743 -0
  21. punctuation/vosk-recasepunc-ru-0.22/ru-test.txt +17 -0
  22. punctuation/vosk-recasepunc-ru-0.22/ru-test.txt.orig +17 -0
  23. speaker_indentification/vosk-model-spk-0.4.7z +3 -0
  24. speaker_indentification/vosk-model-spk-0.4/README.txt +119 -0
  25. speaker_indentification/vosk-model-spk-0.4/final.ext.raw +3 -0
  26. speaker_indentification/vosk-model-spk-0.4/mean.vec +1 -0
  27. speaker_indentification/vosk-model-spk-0.4/mfcc.conf +5 -0
  28. speaker_indentification/vosk-model-spk-0.4/transform.mat +0 -0
  29. tts/vosk-model-tts-ru-0.9-multi.7z +3 -0
  30. tts/vosk-model-tts-ru-0.9-multi/README.md +22 -0
  31. tts/vosk-model-tts-ru-0.9-multi/bert/README.md +39 -0
  32. tts/vosk-model-tts-ru-0.9-multi/bert/model.onnx +3 -0
  33. tts/vosk-model-tts-ru-0.9-multi/bert/vocab.txt +0 -0
  34. tts/vosk-model-tts-ru-0.9-multi/config.json +85 -0
  35. tts/vosk-model-tts-ru-0.9-multi/dictionary +3 -0
  36. tts/vosk-model-tts-ru-0.9-multi/model.onnx +3 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ punctuation/vosk-recasepunc-de-0.21/checkpoint filter=lfs diff=lfs merge=lfs -text
+ punctuation/vosk-recasepunc-en-0.22/checkpoint filter=lfs diff=lfs merge=lfs -text
+ punctuation/vosk-recasepunc-ru-0.22/checkpoint filter=lfs diff=lfs merge=lfs -text
+ speaker_indentification/vosk-model-spk-0.4/final.ext.raw filter=lfs diff=lfs merge=lfs -text
+ tts/vosk-model-tts-ru-0.9-multi/dictionary filter=lfs diff=lfs merge=lfs -text
punctuation/vosk-recasepunc-de-0.21.7z ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42e06dab56196498cde6e89c5e6d4b97cab72942827153b06180f6a156bebc0b
+ size 1153855092
punctuation/vosk-recasepunc-de-0.21/README ADDED
@@ -0,0 +1,7 @@
+ 1. Install pytorch and transformers:
+
+ pip3 install transformers
+
+ 2. Run python3 example.py de-test.txt
+
+ 3. Compare with de-test.txt.orig
punctuation/vosk-recasepunc-de-0.21/checkpoint ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90c70f58d865013bf1245d3c7fae229d6029ee0b16470c055f81a63e668de685
+ size 1315574525
punctuation/vosk-recasepunc-de-0.21/de-test.txt ADDED
@@ -0,0 +1,6 @@
+ nachdem sein vater schon 1707 starb als reinhart erst elf jahre alt war
+ wurde er von hauslehrern in seega erzogen hierauf kam er 1708 in die
+ stadtschule nach frankenhausen und war dort von der dritten bis zur
+ ersten klasse der bekannte schulmann magister hoffmann stand der schule
+ als rektor vor unter dem er publice prodiret hatte also öffentlich
+ aufgetreten war um eine rede zu halten
punctuation/vosk-recasepunc-de-0.21/de-test.txt.orig ADDED
@@ -0,0 +1,6 @@
+ Nachdem sein Vater schon 1707 starb, als Reinhart erst elf Jahre alt war,
+ wurde er von Hauslehrern in Seega erzogen. Hierauf kam er 1708 in die
+ Stadtschule nach Frankenhausen und war dort von der dritten bis zur
+ ersten Klasse. Der bekannte Schulmann Magister Hoffmann stand der Schule
+ als Rektor vor, unter dem er publice prodiret hatte, also öffentlich
+ aufgetreten war, um eine Rede zu halten.
punctuation/vosk-recasepunc-de-0.21/example.py ADDED
@@ -0,0 +1,23 @@
+ import sys
+ import time
+ from transformers import logging
+ from recasepunc import CasePuncPredictor
+ from recasepunc import WordpieceTokenizer
+ from recasepunc import Config
+
+ logging.set_verbosity_error()
+
+ predictor = CasePuncPredictor('checkpoint', lang="de")
+
+ text = " ".join(open(sys.argv[1]).readlines())
+ tokens = list(enumerate(predictor.tokenize(text)))
+
+ results = ""
+ for token, case_label, punc_label in predictor.predict(tokens, lambda x: x[1]):
+     prediction = predictor.map_punc_label(predictor.map_case_label(token[1], case_label), punc_label)
+     if token[1][0] != '#':
+         results = results + ' ' + prediction
+     else:
+         results = results + prediction
+
+ print (results.strip())
punctuation/vosk-recasepunc-de-0.21/recasepunc.py ADDED
@@ -0,0 +1,742 @@
1
+ import sys
2
+ import collections
3
+ import os
4
+ import regex as re
5
+ #from mosestokenizer import *
6
+ from tqdm import tqdm
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import torch.optim as optim
11
+ import random
12
+ import unicodedata
13
+ import numpy as np
14
+ import argparse
15
+ from torch.utils.data import TensorDataset, DataLoader
16
+
17
+ from transformers import AutoModel, AutoTokenizer, BertTokenizer
18
+
19
+ default_config = argparse.Namespace(
20
+ seed=871253,
21
+ lang='de',
22
+ #flavor='flaubert/flaubert_base_uncased',
23
+ flavor=None,
24
+ max_length=256,
25
+ batch_size=16,
26
+ updates=24000,
27
+ period=1000,
28
+ lr=1e-5,
29
+ dab_rate=0.1,
30
+ device='cuda',
31
+ debug=False
32
+ )
33
+
34
+ default_flavors = {
35
+ 'fr': 'flaubert/flaubert_base_uncased',
36
+ 'en': 'bert-base-uncased',
37
+ 'zh': 'ckiplab/bert-base-chinese',
38
+ 'tr': 'dbmdz/bert-base-turkish-uncased',
39
+ 'de': 'dbmdz/bert-base-german-uncased',
40
+ 'pt': 'neuralmind/bert-base-portuguese-cased'
41
+ }
42
+
43
+ class Config(argparse.Namespace):
44
+ def __init__(self, **kwargs):
45
+ for key, value in default_config.__dict__.items():
46
+ setattr(self, key, value)
47
+ for key, value in kwargs.items():
48
+ setattr(self, key, value)
49
+
50
+ assert self.lang in ['fr', 'en', 'zh', 'tr', 'pt', 'de']
51
+
52
+ if 'lang' in kwargs and ('flavor' not in kwargs or kwargs['flavor'] is None):
53
+ self.flavor = default_flavors[self.lang]
54
+
55
+ #print(self.lang, self.flavor)
56
+
57
+
58
+ def init_random(seed):
59
+ # make sure everything is deterministic
60
+ os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
61
+ #torch.use_deterministic_algorithms(True)
62
+ torch.manual_seed(seed)
63
+ torch.cuda.manual_seed_all(seed)
64
+ random.seed(seed)
65
+ np.random.seed(seed)
66
+
67
+ # NOTE: it is assumed in the implementation that y[:,0] is the punctuation label, and y[:,1] is the case label!
68
+
69
+ punctuation = {
70
+ 'O': 0,
71
+ 'COMMA': 1,
72
+ 'PERIOD': 2,
73
+ 'QUESTION': 3,
74
+ 'EXCLAMATION': 4,
75
+ }
76
+
77
+ punctuation_syms = ['', ',', '.', ' ?', ' !']
78
+
79
+ case = {
80
+ 'LOWER': 0,
81
+ 'UPPER': 1,
82
+ 'CAPITALIZE': 2,
83
+ 'OTHER': 3,
84
+ }
85
+
86
+
87
+ class Model(nn.Module):
88
+ def __init__(self, flavor, device):
89
+ super().__init__()
90
+ self.bert = AutoModel.from_pretrained(flavor)
91
+ # need a proper way of determining representation size
92
+ size = self.bert.dim if hasattr(self.bert, 'dim') else self.bert.config.pooler_fc_size if hasattr(self.bert.config, 'pooler_fc_size') else self.bert.config.emb_dim if hasattr(self.bert.config, 'emb_dim') else self.bert.config.hidden_size
93
+ self.punc = nn.Linear(size, 5)
94
+ self.case = nn.Linear(size, 4)
95
+ self.dropout = nn.Dropout(0.3)
96
+ self.to(device)
97
+
98
+ def forward(self, x):
99
+ output = self.bert(x)
100
+ representations = self.dropout(F.gelu(output['last_hidden_state']))
101
+ punc = self.punc(representations)
102
+ case = self.case(representations)
103
+ return punc, case
104
+
105
+
106
+ # randomly create sequences that align to punctuation boundaries
107
+ def drop_at_boundaries(rate, x, y, cls_token_id, sep_token_id, pad_token_id):
108
+ for i, dropped in enumerate(torch.rand((len(x),)) < rate):
109
+ if dropped:
110
+ # select all indices that are sentence endings
111
+ indices = (y[i,:,0] > 1).nonzero(as_tuple=True)[0]
112
+ if len(indices) < 2:
113
+ continue
114
+ start = indices[0] + 1
115
+ end = indices[random.randint(1, len(indices) - 1)] + 1
116
+ length = end - start
117
+ if length + 2 > len(x[i]):
118
+ continue
119
+ x[i, 0] = cls_token_id
120
+ x[i, 1: length + 1] = x[i, start: end].clone()
121
+ x[i, length + 1] = sep_token_id
122
+ x[i, length + 2:] = pad_token_id
123
+ y[i, 0] = 0
124
+ y[i, 1: length + 1] = y[i, start: end].clone()
125
+ y[i, length + 1:] = 0
126
+
127
+
128
+ def compute_performance(config, model, loader):
129
+ device = config.device
130
+ criterion = nn.CrossEntropyLoss()
131
+ model.eval()
132
+ total_loss = all_correct1 = all_correct2 = num_loss = num_perf = 0
133
+ num_ref = collections.defaultdict(float)
134
+ num_hyp = collections.defaultdict(float)
135
+ num_correct = collections.defaultdict(float)
136
+ for x, y in loader:
137
+ x = x.long().to(device)
138
+ y = y.long().to(device)
139
+ y1 = y[:,:,0]
140
+ y2 = y[:,:,1]
141
+ with torch.no_grad():
142
+ y_scores1, y_scores2 = model(x.to(device))
143
+ loss1 = criterion(y_scores1.view(y1.size(0) * y1.size(1), -1), y1.view(y1.size(0) * y1.size(1)))
144
+ loss2 = criterion(y_scores2.view(y2.size(0) * y2.size(1), -1), y2.view(y2.size(0) * y2.size(1)))
145
+ loss = loss1 + loss2
146
+ y_pred1 = torch.max(y_scores1, 2)[1]
147
+ y_pred2 = torch.max(y_scores2, 2)[1]
148
+ for label in range(1, 5):
149
+ ref = (y1 == label)
150
+ hyp = (y_pred1 == label)
151
+ correct = (ref * hyp == 1)
152
+ num_ref[label] += ref.sum()
153
+ num_hyp[label] += hyp.sum()
154
+ num_correct[label] += correct.sum()
155
+ num_ref[0] += ref.sum()
156
+ num_hyp[0] += hyp.sum()
157
+ num_correct[0] += correct.sum()
158
+ all_correct1 += (y_pred1 == y1).sum()
159
+ all_correct2 += (y_pred2 == y2).sum()
160
+ total_loss += loss.item()
161
+ num_loss += len(y)
162
+ num_perf += len(y) * config.max_length
163
+ recall = {}
164
+ precision = {}
165
+ fscore = {}
166
+ for label in range(0, 5):
167
+ recall[label] = num_correct[label] / num_ref[label] if num_ref[label] > 0 else 0
168
+ precision[label] = num_correct[label] / num_hyp[label] if num_hyp[label] > 0 else 0
169
+ fscore[label] = (2 * recall[label] * precision[label] / (recall[label] + precision[label])).item() if recall[label] + precision[label] > 0 else 0
170
+ return total_loss / num_loss, all_correct2.item() / num_perf, all_correct1.item() / num_perf, fscore
171
+
172
+
173
+ def fit(config, model, checkpoint_path, train_loader, valid_loader, iterations, valid_period=200, lr=1e-5):
174
+ device = config.device
175
+ criterion = nn.CrossEntropyLoss()
176
+ optimizer = optim.Adam(filter(lambda param: param.requires_grad, model.parameters()), lr=lr)
177
+ iteration = 0
178
+ while True:
179
+ model.train()
180
+ total_loss = num = 0
181
+ for x, y in tqdm(train_loader):
182
+ x = x.long().to(device)
183
+ y = y.long().to(device)
184
+ drop_at_boundaries(config.dab_rate, x, y, config.cls_token_id, config.sep_token_id, config.pad_token_id)
185
+ y1 = y[:,:,0]
186
+ y2 = y[:,:,1]
187
+ optimizer.zero_grad()
188
+ y_scores1, y_scores2 = model(x)
189
+ loss1 = criterion(y_scores1.view(y1.size(0) * y1.size(1), -1), y1.view(y1.size(0) * y1.size(1)))
190
+ loss2 = criterion(y_scores2.view(y2.size(0) * y2.size(1), -1), y2.view(y2.size(0) * y2.size(1)))
191
+ loss = loss1 + loss2
192
+ loss.backward()
193
+ optimizer.step()
194
+ total_loss += loss.item()
195
+ num += len(y)
196
+ if iteration % valid_period == valid_period - 1:
197
+ train_loss = total_loss / num
198
+ valid_loss, valid_accuracy_case, valid_accuracy_punc, valid_fscore = compute_performance(config, model, valid_loader)
199
+ torch.save({
200
+ 'iteration': iteration + 1,
201
+ 'model_state_dict': model.state_dict(),
202
+ 'optimizer_state_dict': optimizer.state_dict(),
203
+ 'train_loss': train_loss,
204
+ 'valid_loss': valid_loss,
205
+ 'valid_accuracy_case': valid_accuracy_case,
206
+ 'valid_accuracy_punc': valid_accuracy_punc,
207
+ 'valid_fscore': valid_fscore,
208
+ 'config': config.__dict__,
209
+ }, '%s.%d' % (checkpoint_path, iteration + 1))
210
+ print(iteration + 1, train_loss, valid_loss, valid_accuracy_case, valid_accuracy_punc, valid_fscore)
211
+ total_loss = num = 0
212
+
213
+ iteration += 1
214
+ if iteration > iterations:
215
+ return
216
+
217
+ sys.stderr.flush()
218
+ sys.stdout.flush()
219
+
220
+
221
+ def batchify(max_length, x, y):
222
+ print (x.shape)
223
+ print (y.shape)
224
+ x = x[:(len(x) // max_length) * max_length].reshape(-1, max_length)
225
+ y = y[:(len(y) // max_length) * max_length, :].reshape(-1, max_length, 2)
226
+ return x, y
227
+
228
+
229
+ def train(config, train_x_fn, train_y_fn, valid_x_fn, valid_y_fn, checkpoint_path):
230
+ X_train, Y_train = batchify(config.max_length, torch.load(train_x_fn), torch.load(train_y_fn))
231
+ X_valid, Y_valid = batchify(config.max_length, torch.load(valid_x_fn), torch.load(valid_y_fn))
232
+
233
+ train_set = TensorDataset(X_train, Y_train)
234
+ valid_set = TensorDataset(X_valid, Y_valid)
235
+
236
+ train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
237
+ valid_loader = DataLoader(valid_set, batch_size=config.batch_size)
238
+
239
+ model = Model(config.flavor, config.device)
240
+
241
+ fit(config, model, checkpoint_path, train_loader, valid_loader, config.updates, config.period, config.lr)
242
+
243
+
244
+ def run_eval(config, test_x_fn, test_y_fn, checkpoint_path):
245
+ X_test, Y_test = batchify(config.max_length, torch.load(test_x_fn), torch.load(test_y_fn))
246
+ test_set = TensorDataset(X_test, Y_test)
247
+ test_loader = DataLoader(test_set, batch_size=config.batch_size)
248
+
249
+ loaded = torch.load(checkpoint_path, map_location=config.device)
250
+ if 'config' in loaded:
251
+ config = Config(**loaded['config'])
252
+ init(config)
253
+
254
+ model = Model(config.flavor, config.device)
255
+ model.load_state_dict(loaded['model_state_dict'])
256
+
257
+ print(*compute_performance(config, model, test_loader))
258
+
259
+
260
+ def recase(token, label):
261
+ if label == case['LOWER']:
262
+ return token.lower()
263
+ elif label == case['CAPITALIZE']:
264
+ return token.lower().capitalize()
265
+ elif label == case['UPPER']:
266
+ return token.upper()
267
+ else:
268
+ return token
269
+
270
+
271
+ class CasePuncPredictor:
272
+ def __init__(self, checkpoint_path, lang=default_config.lang, flavor=default_config.flavor, device=default_config.device):
273
+ loaded = torch.load(checkpoint_path, map_location=device if torch.cuda.is_available() else 'cpu')
274
+ if 'config' in loaded:
275
+ self.config = Config(**loaded['config'])
276
+ else:
277
+ self.config = Config(lang=lang, flavor=flavor, device=device)
278
+ init(self.config)
279
+
280
+ self.model = Model(self.config.flavor, self.config.device)
281
+ self.model.load_state_dict(loaded['model_state_dict'])
282
+ self.model.eval()
283
+ self.model.to(self.config.device)
284
+
285
+ self.rev_case = {b: a for a, b in case.items()}
286
+ self.rev_punc = {b: a for a, b in punctuation.items()}
287
+
288
+ def tokenize(self, text):
289
+ return [self.config.cls_token] + self.config.tokenizer.tokenize(text) + [self.config.sep_token]
290
+
291
+ def predict(self, tokens, getter=lambda x: x):
292
+ max_length = self.config.max_length
293
+ device = self.config.device
294
+ if type(tokens) == str:
295
+ tokens = self.tokenize(tokens)
296
+ previous_label = punctuation['PERIOD']
297
+ for start in range(0, len(tokens), max_length):
298
+ instance = tokens[start: start + max_length]
299
+ if type(getter(instance[0])) == str:
300
+ ids = self.config.tokenizer.convert_tokens_to_ids(getter(token) for token in instance)
301
+ else:
302
+ ids = [getter(token) for token in instance]
303
+ if len(ids) < max_length:
304
+ ids += [0] * (max_length - len(ids))
305
+ x = torch.tensor([ids]).long().to(device)
306
+ y_scores1, y_scores2 = self.model(x)
307
+ y_pred1 = torch.max(y_scores1, 2)[1]
308
+ y_pred2 = torch.max(y_scores2, 2)[1]
309
+ for i, id, token, punc_label, case_label in zip(range(len(instance)), ids, instance, y_pred1[0].tolist()[:len(instance)], y_pred2[0].tolist()[:len(instance)]):
310
+ if id == self.config.cls_token_id or id == self.config.sep_token_id:
311
+ continue
312
+ if previous_label != None and previous_label > 1:
313
+ if case_label in [case['LOWER'], case['OTHER']]: # LOWER, OTHER
314
+ case_label = case['CAPITALIZE']
315
+ if i + start == len(tokens) - 2 and punc_label == punctuation['O']:
316
+ punc_label = punctuation['PERIOD']
317
+ yield (token, self.rev_case[case_label], self.rev_punc[punc_label])
318
+ previous_label = punc_label
319
+
320
+ def map_case_label(self, token, case_label):
321
+ if token.endswith('</w>'):
322
+ token = token[:-4]
323
+ if token.startswith('##'):
324
+ token = token[2:]
325
+ return recase(token, case[case_label])
326
+
327
+ def map_punc_label(self, token, punc_label):
328
+ if token.endswith('</w>'):
329
+ token = token[:-4]
330
+ if token.startswith('##'):
331
+ token = token[2:]
332
+ return token + punctuation_syms[punctuation[punc_label]]
333
+
334
+
335
+
336
+ def generate_predictions(config, checkpoint_path):
337
+ loaded = torch.load(checkpoint_path, map_location=config.device if torch.cuda.is_available() else 'cpu')
338
+ if 'config' in loaded:
339
+ config = Config(**loaded['config'])
340
+ init(config)
341
+
342
+ model = Model(config.flavor, config.device)
343
+ model.load_state_dict(loaded['model_state_dict'])
344
+
345
+ rev_case = {b: a for a, b in case.items()}
346
+ rev_punc = {b: a for a, b in punctuation.items()}
347
+
348
+ for line in sys.stdin:
349
+ # also drop punctuation that we may generate
350
+ line = ''.join([c for c in line if c not in mapped_punctuation])
351
+ if config.debug:
352
+ print(line)
353
+ tokens = [config.cls_token] + config.tokenizer.tokenize(line) + [config.sep_token]
354
+ if config.debug:
355
+ print(tokens)
356
+ previous_label = punctuation['PERIOD']
357
+ first_time = True
358
+ was_word = False
359
+ for start in range(0, len(tokens), config.max_length):
360
+ instance = tokens[start: start + config.max_length]
361
+ ids = config.tokenizer.convert_tokens_to_ids(instance)
362
+ #print(len(ids), file=sys.stderr)
363
+ if len(ids) < config.max_length:
364
+ ids += [config.pad_token_id] * (config.max_length - len(ids))
365
+ x = torch.tensor([ids]).long().to(config.device)
366
+ y_scores1, y_scores2 = model(x)
367
+ y_pred1 = torch.max(y_scores1, 2)[1]
368
+ y_pred2 = torch.max(y_scores2, 2)[1]
369
+ for id, token, punc_label, case_label in zip(ids, instance, y_pred1[0].tolist()[:len(instance)], y_pred2[0].tolist()[:len(instance)]):
370
+ if config.debug:
371
+ print(id, token, punc_label, case_label, file=sys.stderr)
372
+ if id == config.cls_token_id or id == config.sep_token_id:
373
+ continue
374
+ if previous_label != None and previous_label > 1:
375
+ if case_label in [case['LOWER'], case['OTHER']]:
376
+ case_label = case['CAPITALIZE']
377
+ previous_label = punc_label
378
+ # different strategy due to sub-lexical token encoding in Flaubert
379
+ if config.lang == 'fr':
380
+ if token.endswith('</w>'):
381
+ cased_token = recase(token[:-4], case_label)
382
+ if was_word:
383
+ print(' ', end='')
384
+ print(cased_token + punctuation_syms[punc_label], end='')
385
+ was_word = True
386
+ else:
387
+ cased_token = recase(token, case_label)
388
+ if was_word:
389
+ print(' ', end='')
390
+ print(cased_token, end='')
391
+ was_word = False
392
+ else:
393
+ if token.startswith('##'):
394
+ cased_token = recase(token[2:], case_label)
395
+ print(cased_token, end='')
396
+ else:
397
+ cased_token = recase(token, case_label)
398
+ if not first_time:
399
+ print(' ', end='')
400
+ first_time = False
401
+ print(cased_token + punctuation_syms[punc_label], end='')
402
+ if previous_label == 0:
403
+ print('.', end='')
404
+ print()
405
+
406
+
407
+ def label_for_case(token):
408
+ token = re.sub('[^\p{Han}\p{Ll}\p{Lu}]', '', token)
409
+ if token == token.lower():
410
+ return 'LOWER'
411
+ elif token == token.lower().capitalize():
412
+ return 'CAPITALIZE'
413
+ elif token == token.upper():
414
+ return 'UPPER'
415
+ else:
416
+ return 'OTHER'
417
+
418
+
419
+ def make_tensors(config, input_fn, output_x_fn, output_y_fn):
420
+ # count file lines without loading them
421
+ size = 0
422
+ with open(input_fn) as fp:
423
+ for line in fp:
424
+ size += 1
425
+
426
+ with open(input_fn) as fp:
427
+ X = torch.IntTensor(size)
428
+ Y = torch.ByteTensor(size, 2)
429
+
430
+ offset = 0
431
+ for n, line in enumerate(fp):
432
+ word, case_label, punc_label = line.strip().split('\t')
433
+ id = config.tokenizer.convert_tokens_to_ids(word)
434
+ if config.debug:
435
+ assert word.lower() == tokenizer.convert_ids_to_tokens(id)
436
+ X[offset] = id
437
+ Y[offset, 0] = punctuation[punc_label]
438
+ Y[offset, 1] = case[case_label]
439
+ offset += 1
440
+
441
+ torch.save(X, output_x_fn)
442
+ torch.save(Y, output_y_fn)
443
+
444
+
445
+ mapped_punctuation = {
446
+ '.': 'PERIOD',
447
+ '...': 'PERIOD',
448
+ ',': 'COMMA',
449
+ ';': 'COMMA',
450
+ ':': 'COMMA',
451
+ '(': 'COMMA',
452
+ ')': 'COMMA',
453
+ '?': 'QUESTION',
454
+ '!': 'EXCLAMATION',
455
+ ',': 'COMMA',
456
+ '!': 'EXCLAMATION',
457
+ '?': 'QUESTION',
458
+ ';': 'COMMA',
459
+ ':': 'COMMA',
460
+ '(': 'COMMA',
461
+ '(': 'COMMA',
462
+ ')': 'COMMA',
463
+ '[': 'COMMA',
464
+ ']': 'COMMA',
465
+ '【': 'COMMA',
466
+ '】': 'COMMA',
467
+ '└': 'COMMA',
468
+ '└ ': 'COMMA',
469
+ '_': 'O',
470
+ '。': 'PERIOD',
471
+ '、': 'COMMA', # enumeration comma
472
+ '、': 'COMMA',
473
+ '…': 'PERIOD',
474
+ '—': 'COMMA',
475
+ '「': 'COMMA',
476
+ '」': 'COMMA',
477
+ '.': 'PERIOD',
478
+ '《': 'O',
479
+ '》': 'O',
480
+ ',': 'COMMA',
481
+ '“': 'O',
482
+ '”': 'O',
483
+ '"': 'O',
484
+ '-': 'O',
485
+ '-': 'O',
486
+ '〉': 'COMMA',
487
+ '〈': 'COMMA',
488
+ '↑': 'O',
489
+ '〔': 'COMMA',
490
+ '〕': 'COMMA',
491
+ }
492
+
493
+ def preprocess_text(config, max_token_count=-1):
494
+ global num_tokens_output
495
+ max_token_count = int(max_token_count)
496
+ num_tokens_output = 0
497
+ def process_segment(text, punctuation):
498
+ global num_tokens_output
499
+ text = text.replace('\t', ' ')
500
+ tokens = config.tokenizer.tokenize(text)
501
+ for i, token in enumerate(tokens):
502
+ case_label = label_for_case(token)
503
+ if i == len(tokens) - 1:
504
+ print(token.lower(), case_label, punctuation, sep='\t')
505
+ else:
506
+ print(token.lower(), case_label, 'O', sep='\t')
507
+ num_tokens_output += 1
508
+ # a bit too ugly, but alternative is to throw an exception
509
+ if max_token_count > 0 and num_tokens_output >= max_token_count:
510
+ sys.exit(0)
511
+
512
+ for line in sys.stdin:
513
+ line = line.strip()
514
+ if line != '':
515
+ line = unicodedata.normalize("NFC", line)
516
+ if config.debug:
517
+ print(line)
518
+ start = 0
519
+ for i, char in enumerate(line):
520
+ if char in mapped_punctuation:
521
+ if i > start and line[start: i].strip() != '':
522
+ process_segment(line[start: i], mapped_punctuation[char])
523
+ start = i + 1
524
+ if start < len(line):
525
+ process_segment(line[start:], 'PERIOD')
526
+
527
+
528
+ def preprocess_text_old_fr(config):
529
+ assert config.lang == 'fr'
530
+ splitsents = MosesSentenceSplitter(lang)
531
+ tokenize = MosesTokenizer(lang, extra=['-no-escape'])
532
+ normalize = MosesPunctuationNormalizer(lang)
533
+
534
+ for line in sys.stdin:
535
+ if line.strip() != '':
536
+ for sentence in splitsents([normalize(line)]):
537
+ tokens = tokenize(sentence)
538
+ previous_token = None
539
+ for token in tokens:
540
+ if token in mapped_punctuation:
541
+ if previous_token != None:
542
+ print(previous_token, mapped_punctuation[token], sep='\t')
543
+ previous_token = None
544
+ elif not re.search('[\p{Han}\p{Ll}\p{Lu}\d]', token): # remove non-alphanumeric tokens
545
+ continue
546
+ else:
547
+ if previous_token != None:
548
+ print(previous_token, 'O', sep='\t')
549
+ previous_token = token
550
+ if previous_token != None:
551
+ print(previous_token, 'PERIOD', sep='\t')
552
+
553
+
554
+ # modification of the wordpiece tokenizer to keep case information even if vocab is lower cased
555
+ # forked from https://github.com/huggingface/transformers/blob/master/src/transformers/models/bert/tokenization_bert.py
556
+
557
+ class WordpieceTokenizer(object):
558
+ """Runs WordPiece tokenization."""
559
+
560
+ def __init__(self, vocab, unk_token, max_input_chars_per_word=100, keep_case=True):
561
+ self.vocab = vocab
562
+ self.unk_token = unk_token
563
+ self.max_input_chars_per_word = max_input_chars_per_word
564
+ self.keep_case = keep_case
565
+
566
+ def tokenize(self, text):
567
+ """
568
+ Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
569
+ tokenization using the given vocabulary.
570
+ For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
571
+ Args:
572
+ text: A single token or whitespace separated tokens. This should have
573
+ already been passed through `BasicTokenizer`.
574
+ Returns:
575
+ A list of wordpiece tokens.
576
+ """
577
+
578
+ output_tokens = []
579
+ for token in text.strip().split():
580
+ chars = list(token)
581
+ if len(chars) > self.max_input_chars_per_word:
582
+ output_tokens.append(self.unk_token)
583
+ continue
584
+
585
+ is_bad = False
586
+ start = 0
587
+ sub_tokens = []
588
+ while start < len(chars):
589
+ end = len(chars)
590
+ cur_substr = None
591
+ while start < end:
592
+ substr = "".join(chars[start:end])
593
+ if start > 0:
594
+ substr = "##" + substr
595
+ # optionaly lowercase substring before checking for inclusion in vocab
596
+ if (self.keep_case and substr.lower() in self.vocab) or (substr in self.vocab):
597
+ cur_substr = substr
598
+ break
599
+ end -= 1
600
+ if cur_substr is None:
601
+ is_bad = True
602
+ break
603
+ sub_tokens.append(cur_substr)
604
+ start = end
605
+
606
+ if is_bad:
607
+ output_tokens.append(self.unk_token)
608
+ else:
609
+ output_tokens.extend(sub_tokens)
610
+ return output_tokens
611
+
612
+
613
+ # modification of XLM bpe tokenizer for keeping case information when vocab is lowercase
614
+ # forked from https://github.com/huggingface/transformers/blob/cd56f3fe7eae4a53a9880e3f5e8f91877a78271c/src/transformers/models/xlm/tokenization_xlm.py
615
+ def bpe(self, token):
616
+ def to_lower(pair):
617
+ #print(' ',pair)
618
+ return (pair[0].lower(), pair[1].lower())
619
+
620
+ from transformers.models.xlm.tokenization_xlm import get_pairs
621
+
622
+ word = tuple(token[:-1]) + (token[-1] + "</w>",)
623
+ if token in self.cache:
624
+ return self.cache[token]
625
+ pairs = get_pairs(word)
626
+
627
+ if not pairs:
628
+ return token + "</w>"
629
+
630
+ while True:
631
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(to_lower(pair), float("inf")))
632
+ #print(bigram)
633
+ if to_lower(bigram) not in self.bpe_ranks:
634
+ break
635
+ first, second = bigram
636
+ new_word = []
637
+ i = 0
638
+ while i < len(word):
639
+ try:
640
+ j = word.index(first, i)
641
+ except ValueError:
642
+ new_word.extend(word[i:])
643
+ break
644
+ else:
645
+ new_word.extend(word[i:j])
646
+ i = j
647
+
648
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
649
+ new_word.append(first + second)
650
+ i += 2
651
+ else:
652
+ new_word.append(word[i])
653
+ i += 1
654
+ new_word = tuple(new_word)
655
+ word = new_word
656
+ if len(word) == 1:
657
+ break
658
+ else:
659
+ pairs = get_pairs(word)
660
+ word = " ".join(word)
661
+ if word == "\n </w>":
662
+ word = "\n</w>"
663
+ self.cache[token] = word
664
+ return word
665
+
666
+
667
+
668
+ def init(config):
669
+ init_random(config.seed)
670
+
671
+ if config.lang == 'fr':
672
+ config.tokenizer = tokenizer = AutoTokenizer.from_pretrained(config.flavor, do_lower_case=False)
673
+
674
+ from transformers.models.xlm.tokenization_xlm import XLMTokenizer
675
+ assert isinstance(tokenizer, XLMTokenizer)
676
+
677
+ # monkey patch XLM tokenizer
678
+ import types
679
+ tokenizer.bpe = types.MethodType(bpe, tokenizer)
680
+ else:
681
+ # warning: needs to be BertTokenizer for monkey patching to work
682
+ config.tokenizer = tokenizer = BertTokenizer.from_pretrained(config.flavor, do_lower_case=False)
683
+
684
+ # warning: monkey patch tokenizer to keep case information
685
+ #from recasing_tokenizer import WordpieceTokenizer
686
+ config.tokenizer.wordpiece_tokenizer = WordpieceTokenizer(vocab=tokenizer.vocab, unk_token=tokenizer.unk_token)
687
+
688
+ if config.lang == 'fr':
689
+ config.pad_token_id = tokenizer.pad_token_id
690
+ config.cls_token_id = tokenizer.bos_token_id
691
+ config.cls_token = tokenizer.bos_token
692
+ config.sep_token_id = tokenizer.sep_token_id
693
+ config.sep_token = tokenizer.sep_token
694
+ else:
695
+ config.pad_token_id = tokenizer.pad_token_id
696
+ config.cls_token_id = tokenizer.cls_token_id
697
+ config.cls_token = tokenizer.cls_token
698
+ config.sep_token_id = tokenizer.sep_token_id
699
+ config.sep_token = tokenizer.sep_token
700
+
701
+ if not torch.cuda.is_available() and config.device == 'cuda':
702
+ print('WARNING: reverting to cpu as cuda is not available', file=sys.stderr)
703
+ config.device = torch.device(config.device if torch.cuda.is_available() else 'cpu')
704
+
705
+
706
+ def main(config, action, args):
707
+ init(config)
708
+
709
+ if action == 'train':
710
+ train(config, *args)
711
+ elif action == 'eval':
712
+ run_eval(config, *args)
713
+ elif action == 'predict':
714
+ generate_predictions(config, *args)
715
+ elif action == 'tensorize':
716
+ make_tensors(config, *args)
717
+ elif action == 'preprocess':
718
+ preprocess_text(config, *args)
719
+ else:
720
+ print('invalid action "%s"' % action)
721
+ sys.exit(1)
722
+
723
+ if __name__ == '__main__':
724
+ parser = argparse.ArgumentParser()
725
+ parser.add_argument("action", help="train|eval|predict|tensorize|preprocess", type=str)
726
+ parser.add_argument("action_args", help="arguments for selected action", type=str, nargs='*')
727
+ parser.add_argument("--seed", help="random seed", default=default_config.seed, type=int)
728
+ parser.add_argument("--lang", help="language (fr, en, zh)", default=default_config.lang, type=str)
729
+ parser.add_argument("--flavor", help="bert flavor in transformers model zoo", default=default_config.flavor, type=str)
730
+ parser.add_argument("--max-length", help="maximum input length", default=default_config.max_length, type=int)
731
+ parser.add_argument("--batch-size", help="size of batches", default=default_config.batch_size, type=int)
732
+ parser.add_argument("--device", help="computation device (cuda, cpu)", default=default_config.device, type=str)
733
+ parser.add_argument("--debug", help="whether to output more debug info", default=default_config.debug, type=bool)
734
+ parser.add_argument("--updates", help="number of training updates to perform", default=default_config.updates, type=bool)
735
+ parser.add_argument("--period", help="validation period in updates", default=default_config.period, type=bool)
736
+ parser.add_argument("--lr", help="learning rate", default=default_config.lr, type=bool)
737
+ parser.add_argument("--dab-rate", help="drop at boundaries rate", default=default_config.dab_rate, type=bool)
738
+ config = Config(**parser.parse_args().__dict__)
739
+
740
+ main(config, config.action, config.action_args)
741
+
742
+
punctuation/vosk-recasepunc-en-0.22.7z ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d754b827d2b3f85fe56cfb7a5262dc5658a9257ff7f9404e57595328882b8777
+ size 1148483511
punctuation/vosk-recasepunc-en-0.22/README ADDED
@@ -0,0 +1,7 @@
+ 1. Install pytorch and transformers:
+
+ pip3 install transformers
+
+ 2. Run python3 example.py de-test.txt
+
+ 3. Compare with de-test.txt.orig
punctuation/vosk-recasepunc-en-0.22/checkpoint ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9782ccd13a130feffb13609834778421ebd39e26910d25ddcf2185a0eea75935
+ size 1310193349
punctuation/vosk-recasepunc-en-0.22/example.py ADDED
@@ -0,0 +1,26 @@
+ import sys
+ import time
+ from transformers import logging
+ from recasepunc import CasePuncPredictor
+ from recasepunc import WordpieceTokenizer
+ from recasepunc import Config
+
+ logging.set_verbosity_error()
+
+ predictor = CasePuncPredictor('checkpoint', lang="en")
+
+ text = " ".join(open(sys.argv[1]).readlines())
+ tokens = list(enumerate(predictor.tokenize(text)))
+
+ results = ""
+ for token, case_label, punc_label in predictor.predict(tokens, lambda x: x[1]):
+     prediction = predictor.map_punc_label(predictor.map_case_label(token[1], case_label), punc_label)
+
+     if token[1][0] == '\'' or (len(results) > 0 and results[-1] == '\''):
+         results = results + prediction
+     elif token[1][0] != '#':
+         results = results + ' ' + prediction
+     else:
+         results = results + prediction
+
+ print (results.strip())
punctuation/vosk-recasepunc-en-0.22/recasepunc.py ADDED
@@ -0,0 +1,742 @@
1
+ import sys
2
+ import collections
3
+ import os
4
+ import regex as re
5
+ #from mosestokenizer import *
6
+ from tqdm import tqdm
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import torch.optim as optim
11
+ import random
12
+ import unicodedata
13
+ import numpy as np
14
+ import argparse
15
+ from torch.utils.data import TensorDataset, DataLoader
16
+
17
+ from transformers import AutoModel, AutoTokenizer, BertTokenizer
18
+
19
+ default_config = argparse.Namespace(
20
+ seed=871253,
21
+ lang='en',
22
+ #flavor='flaubert/flaubert_base_uncased',
23
+ flavor=None,
24
+ max_length=256,
25
+ batch_size=16,
26
+ updates=24000,
27
+ period=1000,
28
+ lr=1e-5,
29
+ dab_rate=0.1,
30
+ device='cuda',
31
+ debug=False
32
+ )
33
+
34
+ default_flavors = {
35
+ 'fr': 'flaubert/flaubert_base_uncased',
36
+ 'en': 'bert-base-uncased',
37
+ 'zh': 'ckiplab/bert-base-chinese',
38
+ 'tr': 'dbmdz/bert-base-turkish-uncased',
39
+ 'de': 'dbmdz/bert-base-german-uncased',
40
+ 'pt': 'neuralmind/bert-base-portuguese-cased'
41
+ }
42
+
43
+ class Config(argparse.Namespace):
44
+ def __init__(self, **kwargs):
45
+ for key, value in default_config.__dict__.items():
46
+ setattr(self, key, value)
47
+ for key, value in kwargs.items():
48
+ setattr(self, key, value)
49
+
50
+ assert self.lang in ['fr', 'en', 'zh', 'tr', 'pt', 'de']
51
+
52
+ if 'lang' in kwargs and ('flavor' not in kwargs or kwargs['flavor'] is None):
53
+ self.flavor = default_flavors[self.lang]
54
+
55
+ #print(self.lang, self.flavor)
56
+
57
+
58
+ def init_random(seed):
59
+ # make sure everything is deterministic
60
+ os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
61
+ #torch.use_deterministic_algorithms(True)
62
+ torch.manual_seed(seed)
63
+ torch.cuda.manual_seed_all(seed)
64
+ random.seed(seed)
65
+ np.random.seed(seed)
66
+
67
+ # NOTE: it is assumed in the implementation that y[:,0] is the punctuation label, and y[:,1] is the case label!
68
+
69
+ punctuation = {
70
+ 'O': 0,
71
+ 'COMMA': 1,
72
+ 'PERIOD': 2,
73
+ 'QUESTION': 3,
74
+ 'EXCLAMATION': 4,
75
+ }
76
+
77
+ punctuation_syms = ['', ',', '.', ' ?', ' !']
78
+
79
+ case = {
80
+ 'LOWER': 0,
81
+ 'UPPER': 1,
82
+ 'CAPITALIZE': 2,
83
+ 'OTHER': 3,
84
+ }
85
+
86
+
87
+ class Model(nn.Module):
88
+ def __init__(self, flavor, device):
89
+ super().__init__()
90
+ self.bert = AutoModel.from_pretrained(flavor)
91
+ # need a proper way of determining representation size
92
+ size = self.bert.dim if hasattr(self.bert, 'dim') else self.bert.config.pooler_fc_size if hasattr(self.bert.config, 'pooler_fc_size') else self.bert.config.emb_dim if hasattr(self.bert.config, 'emb_dim') else self.bert.config.hidden_size
93
+ self.punc = nn.Linear(size, 5)
94
+ self.case = nn.Linear(size, 4)
95
+ self.dropout = nn.Dropout(0.3)
96
+ self.to(device)
97
+
98
+ def forward(self, x):
99
+ output = self.bert(x)
100
+ representations = self.dropout(F.gelu(output['last_hidden_state']))
101
+ punc = self.punc(representations)
102
+ case = self.case(representations)
103
+ return punc, case
104
+
105
+
106
+ # randomly create sequences that align to punctuation boundaries
107
+ def drop_at_boundaries(rate, x, y, cls_token_id, sep_token_id, pad_token_id):
108
+ for i, dropped in enumerate(torch.rand((len(x),)) < rate):
109
+ if dropped:
110
+ # select all indices that are sentence endings
111
+ indices = (y[i,:,0] > 1).nonzero(as_tuple=True)[0]
112
+ if len(indices) < 2:
113
+ continue
114
+ start = indices[0] + 1
115
+ end = indices[random.randint(1, len(indices) - 1)] + 1
116
+ length = end - start
117
+ if length + 2 > len(x[i]):
118
+ continue
119
+ x[i, 0] = cls_token_id
120
+ x[i, 1: length + 1] = x[i, start: end].clone()
121
+ x[i, length + 1] = sep_token_id
122
+ x[i, length + 2:] = pad_token_id
123
+ y[i, 0] = 0
124
+ y[i, 1: length + 1] = y[i, start: end].clone()
125
+ y[i, length + 1:] = 0
126
+
127
+
128
+ def compute_performance(config, model, loader):
129
+ device = config.device
130
+ criterion = nn.CrossEntropyLoss()
131
+ model.eval()
132
+ total_loss = all_correct1 = all_correct2 = num_loss = num_perf = 0
133
+ num_ref = collections.defaultdict(float)
134
+ num_hyp = collections.defaultdict(float)
135
+ num_correct = collections.defaultdict(float)
136
+ for x, y in loader:
137
+ x = x.long().to(device)
138
+ y = y.long().to(device)
139
+ y1 = y[:,:,0]
140
+ y2 = y[:,:,1]
141
+ with torch.no_grad():
142
+ y_scores1, y_scores2 = model(x.to(device))
143
+ loss1 = criterion(y_scores1.view(y1.size(0) * y1.size(1), -1), y1.view(y1.size(0) * y1.size(1)))
144
+ loss2 = criterion(y_scores2.view(y2.size(0) * y2.size(1), -1), y2.view(y2.size(0) * y2.size(1)))
145
+ loss = loss1 + loss2
146
+ y_pred1 = torch.max(y_scores1, 2)[1]
147
+ y_pred2 = torch.max(y_scores2, 2)[1]
148
+ for label in range(1, 5):
149
+ ref = (y1 == label)
150
+ hyp = (y_pred1 == label)
151
+ correct = (ref * hyp == 1)
152
+ num_ref[label] += ref.sum()
153
+ num_hyp[label] += hyp.sum()
154
+ num_correct[label] += correct.sum()
155
+ num_ref[0] += ref.sum()
156
+ num_hyp[0] += hyp.sum()
157
+ num_correct[0] += correct.sum()
158
+ all_correct1 += (y_pred1 == y1).sum()
159
+ all_correct2 += (y_pred2 == y2).sum()
160
+ total_loss += loss.item()
161
+ num_loss += len(y)
162
+ num_perf += len(y) * config.max_length
163
+ recall = {}
164
+ precision = {}
165
+ fscore = {}
166
+ for label in range(0, 5):
167
+ recall[label] = num_correct[label] / num_ref[label] if num_ref[label] > 0 else 0
168
+ precision[label] = num_correct[label] / num_hyp[label] if num_hyp[label] > 0 else 0
169
+ fscore[label] = (2 * recall[label] * precision[label] / (recall[label] + precision[label])).item() if recall[label] + precision[label] > 0 else 0
170
+ return total_loss / num_loss, all_correct2.item() / num_perf, all_correct1.item() / num_perf, fscore
171
+
172
+
173
+ def fit(config, model, checkpoint_path, train_loader, valid_loader, iterations, valid_period=200, lr=1e-5):
174
+ device = config.device
175
+ criterion = nn.CrossEntropyLoss()
176
+ optimizer = optim.Adam(filter(lambda param: param.requires_grad, model.parameters()), lr=lr)
177
+ iteration = 0
178
+ while True:
179
+ model.train()
180
+ total_loss = num = 0
181
+ for x, y in tqdm(train_loader):
182
+ x = x.long().to(device)
183
+ y = y.long().to(device)
184
+ drop_at_boundaries(config.dab_rate, x, y, config.cls_token_id, config.sep_token_id, config.pad_token_id)
185
+ y1 = y[:,:,0]
186
+ y2 = y[:,:,1]
187
+ optimizer.zero_grad()
188
+ y_scores1, y_scores2 = model(x)
189
+ loss1 = criterion(y_scores1.view(y1.size(0) * y1.size(1), -1), y1.view(y1.size(0) * y1.size(1)))
190
+ loss2 = criterion(y_scores2.view(y2.size(0) * y2.size(1), -1), y2.view(y2.size(0) * y2.size(1)))
191
+ loss = loss1 + loss2
192
+ loss.backward()
193
+ optimizer.step()
194
+ total_loss += loss.item()
195
+ num += len(y)
196
+ if iteration % valid_period == valid_period - 1:
197
+ train_loss = total_loss / num
198
+ valid_loss, valid_accuracy_case, valid_accuracy_punc, valid_fscore = compute_performance(config, model, valid_loader)
199
+ torch.save({
200
+ 'iteration': iteration + 1,
201
+ 'model_state_dict': model.state_dict(),
202
+ 'optimizer_state_dict': optimizer.state_dict(),
203
+ 'train_loss': train_loss,
204
+ 'valid_loss': valid_loss,
205
+ 'valid_accuracy_case': valid_accuracy_case,
206
+ 'valid_accuracy_punc': valid_accuracy_punc,
207
+ 'valid_fscore': valid_fscore,
208
+ 'config': config.__dict__,
209
+ }, '%s.%d' % (checkpoint_path, iteration + 1))
210
+ print(iteration + 1, train_loss, valid_loss, valid_accuracy_case, valid_accuracy_punc, valid_fscore)
211
+ total_loss = num = 0
212
+
213
+ iteration += 1
214
+ if iteration > iterations:
215
+ return
216
+
217
+ sys.stderr.flush()
218
+ sys.stdout.flush()
219
+
220
+
221
+ def batchify(max_length, x, y):
222
+ print (x.shape)
223
+ print (y.shape)
224
+ x = x[:(len(x) // max_length) * max_length].reshape(-1, max_length)
225
+ y = y[:(len(y) // max_length) * max_length, :].reshape(-1, max_length, 2)
226
+ return x, y
227
+
228
+
229
+ def train(config, train_x_fn, train_y_fn, valid_x_fn, valid_y_fn, checkpoint_path):
230
+ X_train, Y_train = batchify(config.max_length, torch.load(train_x_fn), torch.load(train_y_fn))
231
+ X_valid, Y_valid = batchify(config.max_length, torch.load(valid_x_fn), torch.load(valid_y_fn))
232
+
233
+ train_set = TensorDataset(X_train, Y_train)
234
+ valid_set = TensorDataset(X_valid, Y_valid)
235
+
236
+ train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
237
+ valid_loader = DataLoader(valid_set, batch_size=config.batch_size)
238
+
239
+ model = Model(config.flavor, config.device)
240
+
241
+ fit(config, model, checkpoint_path, train_loader, valid_loader, config.updates, config.period, config.lr)
242
+
243
+
244
+ def run_eval(config, test_x_fn, test_y_fn, checkpoint_path):
245
+ X_test, Y_test = batchify(config.max_length, torch.load(test_x_fn), torch.load(test_y_fn))
246
+ test_set = TensorDataset(X_test, Y_test)
247
+ test_loader = DataLoader(test_set, batch_size=config.batch_size)
248
+
249
+ loaded = torch.load(checkpoint_path, map_location=config.device)
250
+ if 'config' in loaded:
251
+ config = Config(**loaded['config'])
252
+ init(config)
253
+
254
+ model = Model(config.flavor, config.device)
255
+ model.load_state_dict(loaded['model_state_dict'])
256
+
257
+ print(*compute_performance(config, model, test_loader))
258
+
259
+
260
+ def recase(token, label):
261
+ if label == case['LOWER']:
262
+ return token.lower()
263
+ elif label == case['CAPITALIZE']:
264
+ return token.lower().capitalize()
265
+ elif label == case['UPPER']:
266
+ return token.upper()
267
+ else:
268
+ return token
269
+
270
+
271
+ class CasePuncPredictor:
272
+ def __init__(self, checkpoint_path, lang=default_config.lang, flavor=default_config.flavor, device=default_config.device):
273
+ loaded = torch.load(checkpoint_path, map_location=device if torch.cuda.is_available() else 'cpu')
274
+ if 'config' in loaded:
275
+ self.config = Config(**loaded['config'])
276
+ else:
277
+ self.config = Config(lang=lang, flavor=flavor, device=device)
278
+ init(self.config)
279
+
280
+ self.model = Model(self.config.flavor, self.config.device)
281
+ self.model.load_state_dict(loaded['model_state_dict'])
282
+ self.model.eval()
283
+ self.model.to(self.config.device)
284
+
285
+ self.rev_case = {b: a for a, b in case.items()}
286
+ self.rev_punc = {b: a for a, b in punctuation.items()}
287
+
288
+ def tokenize(self, text):
289
+ return [self.config.cls_token] + self.config.tokenizer.tokenize(text) + [self.config.sep_token]
290
+
291
+ def predict(self, tokens, getter=lambda x: x):
292
+ max_length = self.config.max_length
293
+ device = self.config.device
294
+ if type(tokens) == str:
295
+ tokens = self.tokenize(tokens)
296
+ previous_label = punctuation['PERIOD']
297
+ for start in range(0, len(tokens), max_length):
298
+ instance = tokens[start: start + max_length]
299
+ if type(getter(instance[0])) == str:
300
+ ids = self.config.tokenizer.convert_tokens_to_ids(getter(token) for token in instance)
301
+ else:
302
+ ids = [getter(token) for token in instance]
303
+ if len(ids) < max_length:
304
+ ids += [0] * (max_length - len(ids))
305
+ x = torch.tensor([ids]).long().to(device)
306
+ y_scores1, y_scores2 = self.model(x)
307
+ y_pred1 = torch.max(y_scores1, 2)[1]
308
+ y_pred2 = torch.max(y_scores2, 2)[1]
309
+ for i, id, token, punc_label, case_label in zip(range(len(instance)), ids, instance, y_pred1[0].tolist()[:len(instance)], y_pred2[0].tolist()[:len(instance)]):
310
+ if id == self.config.cls_token_id or id == self.config.sep_token_id:
311
+ continue
312
+ if previous_label != None and previous_label > 1:
313
+ if case_label in [case['LOWER'], case['OTHER']]: # LOWER, OTHER
314
+ case_label = case['CAPITALIZE']
315
+ if i + start == len(tokens) - 2 and punc_label == punctuation['O']:
316
+ punc_label = punctuation['PERIOD']
317
+ yield (token, self.rev_case[case_label], self.rev_punc[punc_label])
318
+ previous_label = punc_label
319
+
320
+ def map_case_label(self, token, case_label):
321
+ if token.endswith('</w>'):
322
+ token = token[:-4]
323
+ if token.startswith('##'):
324
+ token = token[2:]
325
+ return recase(token, case[case_label])
326
+
327
+ def map_punc_label(self, token, punc_label):
328
+ if token.endswith('</w>'):
329
+ token = token[:-4]
330
+ if token.startswith('##'):
331
+ token = token[2:]
332
+ return token + punctuation_syms[punctuation[punc_label]]
333
+
334
+
335
+
336
+ def generate_predictions(config, checkpoint_path):
337
+ loaded = torch.load(checkpoint_path, map_location=config.device if torch.cuda.is_available() else 'cpu')
338
+ if 'config' in loaded:
339
+ config = Config(**loaded['config'])
340
+ init(config)
341
+
342
+ model = Model(config.flavor, config.device)
343
+ model.load_state_dict(loaded['model_state_dict'])
344
+
345
+ rev_case = {b: a for a, b in case.items()}
346
+ rev_punc = {b: a for a, b in punctuation.items()}
347
+
348
+ for line in sys.stdin:
349
+ # also drop punctuation that we may generate
350
+ line = ''.join([c for c in line if c not in mapped_punctuation])
351
+ if config.debug:
352
+ print(line)
353
+ tokens = [config.cls_token] + config.tokenizer.tokenize(line) + [config.sep_token]
354
+ if config.debug:
355
+ print(tokens)
356
+ previous_label = punctuation['PERIOD']
357
+ first_time = True
358
+ was_word = False
359
+ for start in range(0, len(tokens), config.max_length):
360
+ instance = tokens[start: start + config.max_length]
361
+ ids = config.tokenizer.convert_tokens_to_ids(instance)
362
+ #print(len(ids), file=sys.stderr)
363
+ if len(ids) < config.max_length:
364
+ ids += [config.pad_token_id] * (config.max_length - len(ids))
365
+ x = torch.tensor([ids]).long().to(config.device)
366
+ y_scores1, y_scores2 = model(x)
367
+ y_pred1 = torch.max(y_scores1, 2)[1]
368
+ y_pred2 = torch.max(y_scores2, 2)[1]
369
+ for id, token, punc_label, case_label in zip(ids, instance, y_pred1[0].tolist()[:len(instance)], y_pred2[0].tolist()[:len(instance)]):
370
+ if config.debug:
371
+ print(id, token, punc_label, case_label, file=sys.stderr)
372
+ if id == config.cls_token_id or id == config.sep_token_id:
373
+ continue
374
+ if previous_label != None and previous_label > 1:
375
+ if case_label in [case['LOWER'], case['OTHER']]:
376
+ case_label = case['CAPITALIZE']
377
+ previous_label = punc_label
378
+ # different strategy due to sub-lexical token encoding in Flaubert
379
+ if config.lang == 'fr':
380
+ if token.endswith('</w>'):
381
+ cased_token = recase(token[:-4], case_label)
382
+ if was_word:
383
+ print(' ', end='')
384
+ print(cased_token + punctuation_syms[punc_label], end='')
385
+ was_word = True
386
+ else:
387
+ cased_token = recase(token, case_label)
388
+ if was_word:
389
+ print(' ', end='')
390
+ print(cased_token, end='')
391
+ was_word = False
392
+ else:
393
+ if token.startswith('##'):
394
+ cased_token = recase(token[2:], case_label)
395
+ print(cased_token, end='')
396
+ else:
397
+ cased_token = recase(token, case_label)
398
+ if not first_time:
399
+ print(' ', end='')
400
+ first_time = False
401
+ print(cased_token + punctuation_syms[punc_label], end='')
402
+ if previous_label == 0:
403
+ print('.', end='')
404
+ print()
405
+
406
+
407
+ def label_for_case(token):
408
+ token = re.sub('[^\p{Han}\p{Ll}\p{Lu}]', '', token)
409
+ if token == token.lower():
410
+ return 'LOWER'
411
+ elif token == token.lower().capitalize():
412
+ return 'CAPITALIZE'
413
+ elif token == token.upper():
414
+ return 'UPPER'
415
+ else:
416
+ return 'OTHER'
417
+
418
+
419
+ def make_tensors(config, input_fn, output_x_fn, output_y_fn):
420
+ # count file lines without loading them
421
+ size = 0
422
+ with open(input_fn) as fp:
423
+ for line in fp:
424
+ size += 1
425
+
426
+ with open(input_fn) as fp:
427
+ X = torch.IntTensor(size)
428
+ Y = torch.ByteTensor(size, 2)
429
+
430
+ offset = 0
431
+ for n, line in enumerate(fp):
432
+ word, case_label, punc_label = line.strip().split('\t')
433
+ id = config.tokenizer.convert_tokens_to_ids(word)
434
+ if config.debug:
435
+ assert word.lower() == tokenizer.convert_ids_to_tokens(id)
436
+ X[offset] = id
437
+ Y[offset, 0] = punctuation[punc_label]
438
+ Y[offset, 1] = case[case_label]
439
+ offset += 1
440
+
441
+ torch.save(X, output_x_fn)
442
+ torch.save(Y, output_y_fn)
443
+
444
+
445
+ mapped_punctuation = {
446
+ '.': 'PERIOD',
447
+ '...': 'PERIOD',
448
+ ',': 'COMMA',
449
+ ';': 'COMMA',
450
+ ':': 'COMMA',
451
+ '(': 'COMMA',
452
+ ')': 'COMMA',
453
+ '?': 'QUESTION',
454
+ '!': 'EXCLAMATION',
455
+ ',': 'COMMA',
456
+ '!': 'EXCLAMATION',
457
+ '?': 'QUESTION',
458
+ ';': 'COMMA',
459
+ ':': 'COMMA',
460
+ '(': 'COMMA',
461
+ '(': 'COMMA',
462
+ ')': 'COMMA',
463
+ '[': 'COMMA',
464
+ ']': 'COMMA',
465
+ '【': 'COMMA',
466
+ '】': 'COMMA',
467
+ '└': 'COMMA',
468
+ '└ ': 'COMMA',
469
+ '_': 'O',
470
+ '。': 'PERIOD',
471
+ '、': 'COMMA', # enumeration comma
472
+ '、': 'COMMA',
473
+ '…': 'PERIOD',
474
+ '—': 'COMMA',
475
+ '「': 'COMMA',
476
+ '」': 'COMMA',
477
+ '.': 'PERIOD',
478
+ '《': 'O',
479
+ '》': 'O',
480
+ ',': 'COMMA',
481
+ '“': 'O',
482
+ '”': 'O',
483
+ '"': 'O',
484
+ '-': 'O',
485
+ '-': 'O',
486
+ '〉': 'COMMA',
487
+ '〈': 'COMMA',
488
+ '↑': 'O',
489
+ '〔': 'COMMA',
490
+ '〕': 'COMMA',
491
+ }
492
+
493
+ def preprocess_text(config, max_token_count=-1):
494
+ global num_tokens_output
495
+ max_token_count = int(max_token_count)
496
+ num_tokens_output = 0
497
+ def process_segment(text, punctuation):
498
+ global num_tokens_output
499
+ text = text.replace('\t', ' ')
500
+ tokens = config.tokenizer.tokenize(text)
501
+ for i, token in enumerate(tokens):
502
+ case_label = label_for_case(token)
503
+ if i == len(tokens) - 1:
504
+ print(token.lower(), case_label, punctuation, sep='\t')
505
+ else:
506
+ print(token.lower(), case_label, 'O', sep='\t')
507
+ num_tokens_output += 1
508
+ # a bit too ugly, but alternative is to throw an exception
509
+ if max_token_count > 0 and num_tokens_output >= max_token_count:
510
+ sys.exit(0)
511
+
512
+ for line in sys.stdin:
513
+ line = line.strip()
514
+ if line != '':
515
+ line = unicodedata.normalize("NFC", line)
516
+ if config.debug:
517
+ print(line)
518
+ start = 0
519
+ for i, char in enumerate(line):
520
+ if char in mapped_punctuation:
521
+ if i > start and line[start: i].strip() != '':
522
+ process_segment(line[start: i], mapped_punctuation[char])
523
+ start = i + 1
524
+ if start < len(line):
525
+ process_segment(line[start:], 'PERIOD')
526
+
527
+
528
+ def preprocess_text_old_fr(config):
529
+ assert config.lang == 'fr'
530
+ splitsents = MosesSentenceSplitter(lang)
531
+ tokenize = MosesTokenizer(lang, extra=['-no-escape'])
532
+ normalize = MosesPunctuationNormalizer(lang)
533
+
534
+ for line in sys.stdin:
535
+ if line.strip() != '':
536
+ for sentence in splitsents([normalize(line)]):
537
+ tokens = tokenize(sentence)
538
+ previous_token = None
539
+ for token in tokens:
540
+ if token in mapped_punctuation:
541
+ if previous_token != None:
542
+ print(previous_token, mapped_punctuation[token], sep='\t')
543
+ previous_token = None
544
+ elif not re.search('[\p{Han}\p{Ll}\p{Lu}\d]', token): # remove non-alphanumeric tokens
545
+ continue
546
+ else:
547
+ if previous_token != None:
548
+ print(previous_token, 'O', sep='\t')
549
+ previous_token = token
550
+ if previous_token != None:
551
+ print(previous_token, 'PERIOD', sep='\t')
552
+
553
+
554
+ # modification of the wordpiece tokenizer to keep case information even if vocab is lower cased
555
+ # forked from https://github.com/huggingface/transformers/blob/master/src/transformers/models/bert/tokenization_bert.py
556
+
557
+ class WordpieceTokenizer(object):
558
+ """Runs WordPiece tokenization."""
559
+
560
+ def __init__(self, vocab, unk_token, max_input_chars_per_word=100, keep_case=True):
561
+ self.vocab = vocab
562
+ self.unk_token = unk_token
563
+ self.max_input_chars_per_word = max_input_chars_per_word
564
+ self.keep_case = keep_case
565
+
566
+ def tokenize(self, text):
567
+ """
568
+ Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
569
+ tokenization using the given vocabulary.
570
+ For example, :obj:`input = "unaffable"` will return as output :obj:`["un", "##aff", "##able"]`.
571
+ Args:
572
+ text: A single token or whitespace separated tokens. This should have
573
+ already been passed through `BasicTokenizer`.
574
+ Returns:
575
+ A list of wordpiece tokens.
576
+ """
577
+
578
+ output_tokens = []
579
+ for token in text.strip().split():
580
+ chars = list(token)
581
+ if len(chars) > self.max_input_chars_per_word:
582
+ output_tokens.append(self.unk_token)
583
+ continue
584
+
585
+ is_bad = False
586
+ start = 0
587
+ sub_tokens = []
588
+ while start < len(chars):
589
+ end = len(chars)
590
+ cur_substr = None
591
+ while start < end:
592
+ substr = "".join(chars[start:end])
593
+ if start > 0:
594
+ substr = "##" + substr
595
+ # optionally lowercase substring before checking for inclusion in vocab
596
+ if (self.keep_case and substr.lower() in self.vocab) or (substr in self.vocab):
597
+ cur_substr = substr
598
+ break
599
+ end -= 1
600
+ if cur_substr is None:
601
+ is_bad = True
602
+ break
603
+ sub_tokens.append(cur_substr)
604
+ start = end
605
+
606
+ if is_bad:
607
+ output_tokens.append(self.unk_token)
608
+ else:
609
+ output_tokens.extend(sub_tokens)
610
+ return output_tokens
611
+
612
+
613
+ # modification of XLM bpe tokenizer for keeping case information when vocab is lowercase
614
+ # forked from https://github.com/huggingface/transformers/blob/cd56f3fe7eae4a53a9880e3f5e8f91877a78271c/src/transformers/models/xlm/tokenization_xlm.py
615
+ def bpe(self, token):
616
+ def to_lower(pair):
617
+ #print(' ',pair)
618
+ return (pair[0].lower(), pair[1].lower())
619
+
620
+ from transformers.models.xlm.tokenization_xlm import get_pairs
621
+
622
+ word = tuple(token[:-1]) + (token[-1] + "</w>",)
623
+ if token in self.cache:
624
+ return self.cache[token]
625
+ pairs = get_pairs(word)
626
+
627
+ if not pairs:
628
+ return token + "</w>"
629
+
630
+ while True:
631
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(to_lower(pair), float("inf")))
632
+ #print(bigram)
633
+ if to_lower(bigram) not in self.bpe_ranks:
634
+ break
635
+ first, second = bigram
636
+ new_word = []
637
+ i = 0
638
+ while i < len(word):
639
+ try:
640
+ j = word.index(first, i)
641
+ except ValueError:
642
+ new_word.extend(word[i:])
643
+ break
644
+ else:
645
+ new_word.extend(word[i:j])
646
+ i = j
647
+
648
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
649
+ new_word.append(first + second)
650
+ i += 2
651
+ else:
652
+ new_word.append(word[i])
653
+ i += 1
654
+ new_word = tuple(new_word)
655
+ word = new_word
656
+ if len(word) == 1:
657
+ break
658
+ else:
659
+ pairs = get_pairs(word)
660
+ word = " ".join(word)
661
+ if word == "\n </w>":
662
+ word = "\n</w>"
663
+ self.cache[token] = word
664
+ return word
665
+
666
+
667
+
668
+ def init(config):
669
+ init_random(config.seed)
670
+
671
+ if config.lang == 'fr':
672
+ config.tokenizer = tokenizer = AutoTokenizer.from_pretrained(config.flavor, do_lower_case=False)
673
+
674
+ from transformers.models.xlm.tokenization_xlm import XLMTokenizer
675
+ assert isinstance(tokenizer, XLMTokenizer)
676
+
677
+ # monkey patch XLM tokenizer
678
+ import types
679
+ tokenizer.bpe = types.MethodType(bpe, tokenizer)
680
+ else:
681
+ # warning: needs to be BertTokenizer for monkey patching to work
682
+ config.tokenizer = tokenizer = BertTokenizer.from_pretrained(config.flavor, do_lower_case=False)
683
+
684
+ # warning: monkey patch tokenizer to keep case information
685
+ #from recasing_tokenizer import WordpieceTokenizer
686
+ config.tokenizer.wordpiece_tokenizer = WordpieceTokenizer(vocab=tokenizer.vocab, unk_token=tokenizer.unk_token)
687
+
688
+ if config.lang == 'fr':
689
+ config.pad_token_id = tokenizer.pad_token_id
690
+ config.cls_token_id = tokenizer.bos_token_id
691
+ config.cls_token = tokenizer.bos_token
692
+ config.sep_token_id = tokenizer.sep_token_id
693
+ config.sep_token = tokenizer.sep_token
694
+ else:
695
+ config.pad_token_id = tokenizer.pad_token_id
696
+ config.cls_token_id = tokenizer.cls_token_id
697
+ config.cls_token = tokenizer.cls_token
698
+ config.sep_token_id = tokenizer.sep_token_id
699
+ config.sep_token = tokenizer.sep_token
700
+
701
+ if not torch.cuda.is_available() and config.device == 'cuda':
702
+ print('WARNING: reverting to cpu as cuda is not available', file=sys.stderr)
703
+ config.device = torch.device(config.device if torch.cuda.is_available() else 'cpu')
704
+
705
+
706
+ def main(config, action, args):
707
+ init(config)
708
+
709
+ if action == 'train':
710
+ train(config, *args)
711
+ elif action == 'eval':
712
+ run_eval(config, *args)
713
+ elif action == 'predict':
714
+ generate_predictions(config, *args)
715
+ elif action == 'tensorize':
716
+ make_tensors(config, *args)
717
+ elif action == 'preprocess':
718
+ preprocess_text(config, *args)
719
+ else:
720
+ print('invalid action "%s"' % action)
721
+ sys.exit(1)
722
+
723
+ if __name__ == '__main__':
724
+ parser = argparse.ArgumentParser()
725
+ parser.add_argument("action", help="train|eval|predict|tensorize|preprocess", type=str)
726
+ parser.add_argument("action_args", help="arguments for selected action", type=str, nargs='*')
727
+ parser.add_argument("--seed", help="random seed", default=default_config.seed, type=int)
728
+ parser.add_argument("--lang", help="language (fr, en, zh, tr, pt, de, ru)", default=default_config.lang, type=str)
729
+ parser.add_argument("--flavor", help="bert flavor in transformers model zoo", default=default_config.flavor, type=str)
730
+ parser.add_argument("--max-length", help="maximum input length", default=default_config.max_length, type=int)
731
+ parser.add_argument("--batch-size", help="size of batches", default=default_config.batch_size, type=int)
732
+ parser.add_argument("--device", help="computation device (cuda, cpu)", default=default_config.device, type=str)
733
+ parser.add_argument("--debug", help="whether to output more debug info", default=default_config.debug, type=bool)
734
+ parser.add_argument("--updates", help="number of training updates to perform", default=default_config.updates, type=int)
735
+ parser.add_argument("--period", help="validation period in updates", default=default_config.period, type=int)
736
+ parser.add_argument("--lr", help="learning rate", default=default_config.lr, type=float)
737
+ parser.add_argument("--dab-rate", help="drop at boundaries rate", default=default_config.dab_rate, type=float)
738
+ config = Config(**parser.parse_args().__dict__)
739
+
740
+ main(config, config.action, config.action_args)
741
+
742
+
punctuation/vosk-recasepunc-en-0.22/vosk-adapted.txt ADDED
@@ -0,0 +1,17 @@
1
+ the
2
+ the
3
+ the beijing and shanghai welcome to the market strata open i'm yvonne good morning and i'm david ingles counting down of course the diablo trade on the chinese
4
+ mainland here in hong kong let's get your top stories today taper and a timetable dominating the latest fed minutes as official debates the exit path meanwhile i got beijing heading the other way hinting at the first triple r cut in more than a year and after the didi debacle here china may move to close a loophole long used
5
+ by companies to take their listings abroad all to enhance that was a horrible mistake council yesterday from china as a maybe it's time to cut the triple r to help them with small businesses they are struggling from the rise of raw material costs the key question is how likely is this yeah what they say it chances are likely it's probably going to be up yet
6
+ the fact that they're saying it might actually already mean we're getting some sentiment coming through in terms of an improved material tracker ten year yield we'll get to that in just a moment in china we're now flirting with the three percent level equity markets futures are pointing up as you can see here in china though broadly speaking though we're down for a seven day across asia seventh day in the last excuse me
7
+ in the last eight sessions here have little commodity markets we're stabilising across your oil or oil prices we're still down five six per cent from highs though as far as that is concerned fx markets your story is guys can we change the police are we're looking at generally speaking the dollar that's very much in focus here so you look at that against the euro you look at that
8
+ against the chinese currency twenty four hours ago who would have thought we were talking about this sort of more divergence and starker labour discord between where you are in a pboc to easily in the fed and very quickly we alluded to this of course if one three percent on your chinese ten year yield and we're not one point three percent lower and lower
9
+ yields there is a charge for you china's top us ten year yield is at the bottom yeah the chinatown area lowest since we saw last year of september yup
10
+ yeah it is a really big major shift in china's central bank policy that's the key question could it be coming of course let's flash out that into what we heard from the cabinet there raising the possibility of a cut to the reserve requirement ratio to both the economy at the same time we also from a former pboc official sheng songcheng said the central bank should actually
11
+ cut rates he's not just talking about a triple r and either the second half is an important window when china's monetary policy can tilt towards loosening while remaining stable and the interest rates can be lowered in a reasonable and moderate manner let's get the take from also be as well whether daisy i'm david chiu here the short of it is
12
+ so i guess one point if we still haven't gotten that if in the event that we do their take is they it might be a little bit too aggressive to address some of the softness in the economy in other words what they're saying is it needs some help the economy maybe not this much yeah there preferring perhaps perhaps liquidity injections here and there but this might signal a bit too much
13
+ for when it comes to reflating the economy joining us out of the dice all this let's bring in wang tao ubi as head of asia economics and the chief china economists as well wang tao thanks much for joining us first off do you think this is actually a real possibility now
14
+ or well will shrink or fade contro as a frequently called using triple r cut as a tool so i think yes indeed it is a real possibility that they could do this however in the past whenever the state council called for this a few days to a couple of weeks later we were
15
+ would have we would see a triple r cut if they called for it and but it's worth noting that last year in june shoot at the chicago auto quote for it and by the pbc did not hold onto with any market so i i would say at this moment it's probably a relatively high likelihood but anything
16
+ the wording is really you know about mitigating the higher cost of commodity prices they impact on at an ease and make their effective conquered funding a bit lower so it's possible that it's going to be a targeted not a overall triple cut and i i don't think this really reflects a
17
+ wholesale shift in monetary policy i think very very much in the same state concrete statement also talked about
punctuation/vosk-recasepunc-en-0.22/vosk-adapted.txt.punc ADDED
@@ -0,0 +1 @@
1
+ The. The. The Beijing and Shanghai. Welcome to the market strata open. I'm Yvonne, good morning, and I'm David Ingles, counting down, of course, the Diablo trade on the Chinese mainland here in Hong Kong. Let's get your top stories today, taper and a timetable dominating the latest Fed minutes as official debates. The exit path. Meanwhile, I got Beijing heading the other way, hinting at the first triple R cut in more than a year. And after the Didi debacle here, China may move to close a loophole. Long used by companies to take their listings abroad, all to enhance. That was a horrible mistake. Council yesterday from China as a. Maybe it's time to cut the triple R to help them with small businesses they are struggling from the rise of raw material costs. The key question is, how likely is this ? Yeah, what they say it. Chances are likely it's probably going to be up yet. The fact that they're saying it might actually already mean we're getting some sentiment coming through in terms of an improved material tracker. Ten year yield. We'll get to that in just a moment. In China. We're now flirting with the three percent level equity markets futures are pointing up. As you can see here in China, though. Broadly speaking, though, we're down for a seven day across Asia. Seventh day in the last. Excuse me, in the last eight sessions here have little commodity markets. We're stabilising across your oil or oil prices. We're still down five, six per cent from highs, though as far as that is concerned FX markets. Your story is, guys, can we change the police are we're looking at, generally speaking, the dollar. That's very much in focus here. So you look at that against the euro. You look at that against the Chinese currency Twenty four hours ago. Who would have thought we were talking about this sort of more divergence and starker labour discord between where you are in a PBOC to easily in the Fed and very quickly. We alluded to this, Of course, if one three percent on your Chinese ten year yield and we're not one point three percent lower and lower yields, there is a charge for you. China's top US ten year yield is at the bottom. Yeah, the Chinatown area lowest since we saw last year of September. Yup. Yeah, it is a really big major shift in China's central bank policy. That's the key question. Could it be coming ? Of course. Let's flash out that into what we heard from the cabinet there, raising the possibility of a cut to the reserve requirement ratio to both the economy at the same time. We also from a former PBOC official, Sheng Songcheng said the central bank should actually cut rates. He's not just talking about a triple R. And either the second half is an important window when China's monetary policy can tilt towards loosening while remaining stable and the interest rates can be lowered in a reasonable and moderate manner. Let's get the take from also be as well, whether Daisy, I'm David Chiu here, the short of it is so I guess one point, if we still haven't gotten that if in the event that we do their take is they, it might be a little bit too aggressive to address some of the softness in the economy. In other words, what they're saying is it needs some help. The economy, maybe not this much. Yeah, there, preferring perhaps perhaps liquidity injections here and there. But this might signal a bit too much for when it comes to reflating the economy. Joining us out of the dice. All this, Let's bring in Wang Tao Ubi as head of Asia Economics, and the chief China economists as well. 
Wang Tao, thanks much for joining us. First off, do you think this is actually a real possibility now or well will shrink or fade ? Contro as a frequently called using triple R cut as a tool. So I think yes, indeed, it is a real possibility. That they could do this. However, in the past, whenever the State Council called for this a few days to a couple of weeks later, we were. Would have we would see a triple R cut if they called for it. And. But it's worth noting that last year in June, shoot at the Chicago auto quote for it and by the PBC did not hold onto with any market so I. I would say at this moment it's probably a relatively high likelihood, but anything. The wording is really, you know about mitigating the higher cost of commodity prices they impact on at an ease and make their effective conquered funding a bit lower. So it's possible that it's going to be a targeted, not a overall triple cut and I. I don't think this really reflects a wholesale shift in monetary policy. I think very, very much in the same state. Concrete statement also talked about.
punctuation/vosk-recasepunc-ru-0.22.7z ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f23cc633c06d910e056234d6b83a11a3683e582abc56c30771dbec98a91034de
3
+ size 1639885297
punctuation/vosk-recasepunc-ru-0.22/README ADDED
@@ -0,0 +1,7 @@
1
+ 1. Install pytorch and transformers:
2
+
3
+ pip3 install transformers
4
+
5
+ 2. Run python3 example.py ru-test.txt
6
+
7
+ 3. Compare with ru-test.txt.orig
punctuation/vosk-recasepunc-ru-0.22/checkpoint ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61fd424795c046963f88534071abde0813a4a6c66c07f0335b013825e536c1ae
3
+ size 2134070889
punctuation/vosk-recasepunc-ru-0.22/example.py ADDED
@@ -0,0 +1,23 @@
1
+ import sys
2
+ import time
3
+ from transformers import logging
4
+ from recasepunc import CasePuncPredictor
5
+ from recasepunc import WordpieceTokenizer
6
+ from recasepunc import Config
7
+
8
+ logging.set_verbosity_error()
9
+
10
+ predictor = CasePuncPredictor('checkpoint', lang="ru")
11
+
12
+ text = " ".join(open(sys.argv[1]).readlines())
13
+ tokens = list(enumerate(predictor.tokenize(text)))
14
+
15
+ results = ""
16
+ for token, case_label, punc_label in predictor.predict(tokens, lambda x: x[1]):
17
+ prediction = predictor.map_punc_label(predictor.map_case_label(token[1], case_label), punc_label)
18
+ if token[1][0] != '#':
19
+ results = results + ' ' + prediction
20
+ else:
21
+ results = results + prediction
22
+
23
+ print (results.strip())
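For reference, a minimal sketch (not shipped with the upload) of the same API applied to a single raw string instead of a file: `CasePuncPredictor.predict` tokenizes plain-string input itself, so one utterance can be recased and punctuated directly. The `checkpoint` path and the sample sentence (the opening words of ru-test.txt) are illustrative assumptions.

```python
from transformers import logging
from recasepunc import CasePuncPredictor, WordpieceTokenizer, Config

logging.set_verbosity_error()

predictor = CasePuncPredictor('checkpoint', lang="ru")

# hypothetical single utterance; any lowercase, unpunctuated string works
text = "все смешалось в доме облонских жена узнала что муж был в связи с бывшею"

result = ""
# predict() accepts a raw string and yields (token, case label, punctuation label) triples
for token, case_label, punc_label in predictor.predict(text):
    word = predictor.map_punc_label(predictor.map_case_label(token, case_label), punc_label)
    # '##' marks a continuation word piece, glued to the previous piece without a space
    result += word if token.startswith('##') else ' ' + word

print(result.strip())
```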
punctuation/vosk-recasepunc-ru-0.22/recasepunc.py ADDED
@@ -0,0 +1,743 @@
1
+ import sys
2
+ import collections
3
+ import os
4
+ import regex as re
5
+ #from mosestokenizer import *
6
+ from tqdm import tqdm
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import torch.optim as optim
11
+ import random
12
+ import unicodedata
13
+ import numpy as np
14
+ import argparse
15
+ from torch.utils.data import TensorDataset, DataLoader
16
+
17
+ from transformers import AutoModel, AutoTokenizer, BertTokenizer
18
+
19
+ default_config = argparse.Namespace(
20
+ seed=871253,
21
+ lang='ru',
22
+ #flavor='flaubert/flaubert_base_uncased',
23
+ flavor=None,
24
+ max_length=256,
25
+ batch_size=16,
26
+ updates=50000,
27
+ period=1000,
28
+ lr=1e-5,
29
+ dab_rate=0.1,
30
+ device='cuda',
31
+ debug=False
32
+ )
33
+
34
+ default_flavors = {
35
+ 'fr': 'flaubert/flaubert_base_uncased',
36
+ 'en': 'bert-base-uncased',
37
+ 'zh': 'ckiplab/bert-base-chinese',
38
+ 'tr': 'dbmdz/bert-base-turkish-uncased',
39
+ 'de': 'dbmdz/bert-base-german-uncased',
40
+ 'pt': 'neuralmind/bert-base-portuguese-cased',
41
+ 'ru': 'DeepPavlov/rubert-base-cased'
42
+ }
43
+
44
+ class Config(argparse.Namespace):
45
+ def __init__(self, **kwargs):
46
+ for key, value in default_config.__dict__.items():
47
+ setattr(self, key, value)
48
+ for key, value in kwargs.items():
49
+ setattr(self, key, value)
50
+
51
+ assert self.lang in ['fr', 'en', 'zh', 'tr', 'pt', 'de', 'ru']
52
+
53
+ if 'lang' in kwargs and ('flavor' not in kwargs or kwargs['flavor'] is None):
54
+ self.flavor = default_flavors[self.lang]
55
+
56
+ #print(self.lang, self.flavor)
57
+
58
+
59
+ def init_random(seed):
60
+ # make sure everything is deterministic
61
+ os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
62
+ #torch.use_deterministic_algorithms(True)
63
+ torch.manual_seed(seed)
64
+ torch.cuda.manual_seed_all(seed)
65
+ random.seed(seed)
66
+ np.random.seed(seed)
67
+
68
+ # NOTE: it is assumed in the implementation that y[:,0] is the punctuation label, and y[:,1] is the case label!
69
+
70
+ punctuation = {
71
+ 'O': 0,
72
+ 'COMMA': 1,
73
+ 'PERIOD': 2,
74
+ 'QUESTION': 3,
75
+ 'EXCLAMATION': 4,
76
+ }
77
+
78
+ punctuation_syms = ['', ',', '.', ' ?', ' !']
79
+
80
+ case = {
81
+ 'LOWER': 0,
82
+ 'UPPER': 1,
83
+ 'CAPITALIZE': 2,
84
+ 'OTHER': 3,
85
+ }
86
+
87
+
88
+ class Model(nn.Module):
89
+ def __init__(self, flavor, device):
90
+ super().__init__()
91
+ self.bert = AutoModel.from_pretrained(flavor)
92
+ # need a proper way of determining representation size
93
+ size = self.bert.dim if hasattr(self.bert, 'dim') else self.bert.config.pooler_fc_size if hasattr(self.bert.config, 'pooler_fc_size') else self.bert.config.emb_dim if hasattr(self.bert.config, 'emb_dim') else self.bert.config.hidden_size
94
+ self.punc = nn.Linear(size, 5)
95
+ self.case = nn.Linear(size, 4)
96
+ self.dropout = nn.Dropout(0.3)
97
+ self.to(device)
98
+
99
+ def forward(self, x):
100
+ output = self.bert(x)
101
+ representations = self.dropout(F.gelu(output['last_hidden_state']))
102
+ punc = self.punc(representations)
103
+ case = self.case(representations)
104
+ return punc, case
105
+
106
+
107
+ # randomly create sequences that align to punctuation boundaries
108
+ def drop_at_boundaries(rate, x, y, cls_token_id, sep_token_id, pad_token_id):
109
+ for i, dropped in enumerate(torch.rand((len(x),)) < rate):
110
+ if dropped:
111
+ # select all indices that are sentence endings
112
+ indices = (y[i,:,0] > 1).nonzero(as_tuple=True)[0]
113
+ if len(indices) < 2:
114
+ continue
115
+ start = indices[0] + 1
116
+ end = indices[random.randint(1, len(indices) - 1)] + 1
117
+ length = end - start
118
+ if length + 2 > len(x[i]):
119
+ continue
120
+ x[i, 0] = cls_token_id
121
+ x[i, 1: length + 1] = x[i, start: end].clone()
122
+ x[i, length + 1] = sep_token_id
123
+ x[i, length + 2:] = pad_token_id
124
+ y[i, 0] = 0
125
+ y[i, 1: length + 1] = y[i, start: end].clone()
126
+ y[i, length + 1:] = 0
127
+
128
+
129
+ def compute_performance(config, model, loader):
130
+ device = config.device
131
+ criterion = nn.CrossEntropyLoss()
132
+ model.eval()
133
+ total_loss = all_correct1 = all_correct2 = num_loss = num_perf = 0
134
+ num_ref = collections.defaultdict(float)
135
+ num_hyp = collections.defaultdict(float)
136
+ num_correct = collections.defaultdict(float)
137
+ for x, y in loader:
138
+ x = x.long().to(device)
139
+ y = y.long().to(device)
140
+ y1 = y[:,:,0]
141
+ y2 = y[:,:,1]
142
+ with torch.no_grad():
143
+ y_scores1, y_scores2 = model(x.to(device))
144
+ loss1 = criterion(y_scores1.view(y1.size(0) * y1.size(1), -1), y1.view(y1.size(0) * y1.size(1)))
145
+ loss2 = criterion(y_scores2.view(y2.size(0) * y2.size(1), -1), y2.view(y2.size(0) * y2.size(1)))
146
+ loss = loss1 + loss2
147
+ y_pred1 = torch.max(y_scores1, 2)[1]
148
+ y_pred2 = torch.max(y_scores2, 2)[1]
149
+ for label in range(1, 5):
150
+ ref = (y1 == label)
151
+ hyp = (y_pred1 == label)
152
+ correct = (ref * hyp == 1)
153
+ num_ref[label] += ref.sum()
154
+ num_hyp[label] += hyp.sum()
155
+ num_correct[label] += correct.sum()
156
+ num_ref[0] += ref.sum()
157
+ num_hyp[0] += hyp.sum()
158
+ num_correct[0] += correct.sum()
159
+ all_correct1 += (y_pred1 == y1).sum()
160
+ all_correct2 += (y_pred2 == y2).sum()
161
+ total_loss += loss.item()
162
+ num_loss += len(y)
163
+ num_perf += len(y) * config.max_length
164
+ recall = {}
165
+ precision = {}
166
+ fscore = {}
167
+ for label in range(0, 5):
168
+ recall[label] = num_correct[label] / num_ref[label] if num_ref[label] > 0 else 0
169
+ precision[label] = num_correct[label] / num_hyp[label] if num_hyp[label] > 0 else 0
170
+ fscore[label] = (2 * recall[label] * precision[label] / (recall[label] + precision[label])).item() if recall[label] + precision[label] > 0 else 0
171
+ return total_loss / num_loss, all_correct2.item() / num_perf, all_correct1.item() / num_perf, fscore
172
+
173
+
174
+ def fit(config, model, checkpoint_path, train_loader, valid_loader, iterations, valid_period=200, lr=1e-5):
175
+ device = config.device
176
+ criterion = nn.CrossEntropyLoss()
177
+ optimizer = optim.Adam(filter(lambda param: param.requires_grad, model.parameters()), lr=lr)
178
+ iteration = 0
179
+ while True:
180
+ model.train()
181
+ total_loss = num = 0
182
+ for x, y in tqdm(train_loader):
183
+ x = x.long().to(device)
184
+ y = y.long().to(device)
185
+ drop_at_boundaries(config.dab_rate, x, y, config.cls_token_id, config.sep_token_id, config.pad_token_id)
186
+ y1 = y[:,:,0]
187
+ y2 = y[:,:,1]
188
+ optimizer.zero_grad()
189
+ y_scores1, y_scores2 = model(x)
190
+ loss1 = criterion(y_scores1.view(y1.size(0) * y1.size(1), -1), y1.view(y1.size(0) * y1.size(1)))
191
+ loss2 = criterion(y_scores2.view(y2.size(0) * y2.size(1), -1), y2.view(y2.size(0) * y2.size(1)))
192
+ loss = loss1 + loss2
193
+ loss.backward()
194
+ optimizer.step()
195
+ total_loss += loss.item()
196
+ num += len(y)
197
+ if iteration % valid_period == valid_period - 1:
198
+ train_loss = total_loss / num
199
+ valid_loss, valid_accuracy_case, valid_accuracy_punc, valid_fscore = compute_performance(config, model, valid_loader)
200
+ torch.save({
201
+ 'iteration': iteration + 1,
202
+ 'model_state_dict': model.state_dict(),
203
+ 'optimizer_state_dict': optimizer.state_dict(),
204
+ 'train_loss': train_loss,
205
+ 'valid_loss': valid_loss,
206
+ 'valid_accuracy_case': valid_accuracy_case,
207
+ 'valid_accuracy_punc': valid_accuracy_punc,
208
+ 'valid_fscore': valid_fscore,
209
+ 'config': config.__dict__,
210
+ }, '%s.%d' % (checkpoint_path, iteration + 1))
211
+ print(iteration + 1, train_loss, valid_loss, valid_accuracy_case, valid_accuracy_punc, valid_fscore)
212
+ total_loss = num = 0
213
+
214
+ iteration += 1
215
+ if iteration > iterations:
216
+ return
217
+
218
+ sys.stderr.flush()
219
+ sys.stdout.flush()
220
+
221
+
222
+ def batchify(max_length, x, y):
223
+ print (x.shape)
224
+ print (y.shape)
225
+ x = x[:(len(x) // max_length) * max_length].reshape(-1, max_length)
226
+ y = y[:(len(y) // max_length) * max_length, :].reshape(-1, max_length, 2)
227
+ return x, y
228
+
229
+
230
+ def train(config, train_x_fn, train_y_fn, valid_x_fn, valid_y_fn, checkpoint_path):
231
+ X_train, Y_train = batchify(config.max_length, torch.load(train_x_fn), torch.load(train_y_fn))
232
+ X_valid, Y_valid = batchify(config.max_length, torch.load(valid_x_fn), torch.load(valid_y_fn))
233
+
234
+ train_set = TensorDataset(X_train, Y_train)
235
+ valid_set = TensorDataset(X_valid, Y_valid)
236
+
237
+ train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
238
+ valid_loader = DataLoader(valid_set, batch_size=config.batch_size)
239
+
240
+ model = Model(config.flavor, config.device)
241
+
242
+ fit(config, model, checkpoint_path, train_loader, valid_loader, config.updates, config.period, config.lr)
243
+
244
+
245
+ def run_eval(config, test_x_fn, test_y_fn, checkpoint_path):
246
+ X_test, Y_test = batchify(config.max_length, torch.load(test_x_fn), torch.load(test_y_fn))
247
+ test_set = TensorDataset(X_test, Y_test)
248
+ test_loader = DataLoader(test_set, batch_size=config.batch_size)
249
+
250
+ loaded = torch.load(checkpoint_path, map_location=config.device)
251
+ if 'config' in loaded:
252
+ config = Config(**loaded['config'])
253
+ init(config)
254
+
255
+ model = Model(config.flavor, config.device)
256
+ model.load_state_dict(loaded['model_state_dict'])
257
+
258
+ print(*compute_performance(config, model, test_loader))
259
+
260
+
261
+ def recase(token, label):
262
+ if label == case['LOWER']:
263
+ return token.lower()
264
+ elif label == case['CAPITALIZE']:
265
+ return token.lower().capitalize()
266
+ elif label == case['UPPER']:
267
+ return token.upper()
268
+ else:
269
+ return token
270
+
271
+
272
+ class CasePuncPredictor:
273
+ def __init__(self, checkpoint_path, lang=default_config.lang, flavor=default_config.flavor, device=default_config.device):
274
+ loaded = torch.load(checkpoint_path, map_location=device if torch.cuda.is_available() else 'cpu')
275
+ if 'config' in loaded:
276
+ self.config = Config(**loaded['config'])
277
+ else:
278
+ self.config = Config(lang=lang, flavor=flavor, device=device)
279
+ init(self.config)
280
+
281
+ self.model = Model(self.config.flavor, self.config.device)
282
+ self.model.load_state_dict(loaded['model_state_dict'])
283
+ self.model.eval()
284
+ self.model.to(self.config.device)
285
+
286
+ self.rev_case = {b: a for a, b in case.items()}
287
+ self.rev_punc = {b: a for a, b in punctuation.items()}
288
+
289
+ def tokenize(self, text):
290
+ return [self.config.cls_token] + self.config.tokenizer.tokenize(text) + [self.config.sep_token]
291
+
292
+ def predict(self, tokens, getter=lambda x: x):
293
+ max_length = self.config.max_length
294
+ device = self.config.device
295
+ if type(tokens) == str:
296
+ tokens = self.tokenize(tokens)
297
+ previous_label = punctuation['PERIOD']
298
+ for start in range(0, len(tokens), max_length):
299
+ instance = tokens[start: start + max_length]
300
+ if type(getter(instance[0])) == str:
301
+ ids = self.config.tokenizer.convert_tokens_to_ids(getter(token) for token in instance)
302
+ else:
303
+ ids = [getter(token) for token in instance]
304
+ if len(ids) < max_length:
305
+ ids += [0] * (max_length - len(ids))
306
+ x = torch.tensor([ids]).long().to(device)
307
+ y_scores1, y_scores2 = self.model(x)
308
+ y_pred1 = torch.max(y_scores1, 2)[1]
309
+ y_pred2 = torch.max(y_scores2, 2)[1]
310
+ for i, id, token, punc_label, case_label in zip(range(len(instance)), ids, instance, y_pred1[0].tolist()[:len(instance)], y_pred2[0].tolist()[:len(instance)]):
311
+ if id == self.config.cls_token_id or id == self.config.sep_token_id:
312
+ continue
313
+ if previous_label != None and previous_label > 1:
314
+ if case_label in [case['LOWER'], case['OTHER']]: # LOWER, OTHER
315
+ case_label = case['CAPITALIZE']
316
+ if i + start == len(tokens) - 2 and punc_label == punctuation['O']:
317
+ punc_label = punctuation['PERIOD']
318
+ yield (token, self.rev_case[case_label], self.rev_punc[punc_label])
319
+ previous_label = punc_label
320
+
321
+ def map_case_label(self, token, case_label):
322
+ if token.endswith('</w>'):
323
+ token = token[:-4]
324
+ if token.startswith('##'):
325
+ token = token[2:]
326
+ return recase(token, case[case_label])
327
+
328
+ def map_punc_label(self, token, punc_label):
329
+ if token.endswith('</w>'):
330
+ token = token[:-4]
331
+ if token.startswith('##'):
332
+ token = token[2:]
333
+ return token + punctuation_syms[punctuation[punc_label]]
334
+
335
+
336
+
337
+ def generate_predictions(config, checkpoint_path):
338
+ loaded = torch.load(checkpoint_path, map_location=config.device if torch.cuda.is_available() else 'cpu')
339
+ if 'config' in loaded:
340
+ config = Config(**loaded['config'])
341
+ init(config)
342
+
343
+ model = Model(config.flavor, config.device)
344
+ model.load_state_dict(loaded['model_state_dict'])
345
+
346
+ rev_case = {b: a for a, b in case.items()}
347
+ rev_punc = {b: a for a, b in punctuation.items()}
348
+
349
+ for line in sys.stdin:
350
+ # also drop punctuation that we may generate
351
+ line = ''.join([c for c in line if c not in mapped_punctuation])
352
+ if config.debug:
353
+ print(line)
354
+ tokens = [config.cls_token] + config.tokenizer.tokenize(line) + [config.sep_token]
355
+ if config.debug:
356
+ print(tokens)
357
+ previous_label = punctuation['PERIOD']
358
+ first_time = True
359
+ was_word = False
360
+ for start in range(0, len(tokens), config.max_length):
361
+ instance = tokens[start: start + config.max_length]
362
+ ids = config.tokenizer.convert_tokens_to_ids(instance)
363
+ #print(len(ids), file=sys.stderr)
364
+ if len(ids) < config.max_length:
365
+ ids += [config.pad_token_id] * (config.max_length - len(ids))
366
+ x = torch.tensor([ids]).long().to(config.device)
367
+ y_scores1, y_scores2 = model(x)
368
+ y_pred1 = torch.max(y_scores1, 2)[1]
369
+ y_pred2 = torch.max(y_scores2, 2)[1]
370
+ for id, token, punc_label, case_label in zip(ids, instance, y_pred1[0].tolist()[:len(instance)], y_pred2[0].tolist()[:len(instance)]):
371
+ if config.debug:
372
+ print(id, token, punc_label, case_label, file=sys.stderr)
373
+ if id == config.cls_token_id or id == config.sep_token_id:
374
+ continue
375
+ if previous_label != None and previous_label > 1:
376
+ if case_label in [case['LOWER'], case['OTHER']]:
377
+ case_label = case['CAPITALIZE']
378
+ previous_label = punc_label
379
+ # different strategy due to sub-lexical token encoding in Flaubert
380
+ if config.lang == 'fr':
381
+ if token.endswith('</w>'):
382
+ cased_token = recase(token[:-4], case_label)
383
+ if was_word:
384
+ print(' ', end='')
385
+ print(cased_token + punctuation_syms[punc_label], end='')
386
+ was_word = True
387
+ else:
388
+ cased_token = recase(token, case_label)
389
+ if was_word:
390
+ print(' ', end='')
391
+ print(cased_token, end='')
392
+ was_word = False
393
+ else:
394
+ if token.startswith('##'):
395
+ cased_token = recase(token[2:], case_label)
396
+ print(cased_token, end='')
397
+ else:
398
+ cased_token = recase(token, case_label)
399
+ if not first_time:
400
+ print(' ', end='')
401
+ first_time = False
402
+ print(cased_token + punctuation_syms[punc_label], end='')
403
+ if previous_label == 0:
404
+ print('.', end='')
405
+ print()
406
+
407
+
408
+ def label_for_case(token):
409
+ token = re.sub('[^\p{Han}\p{Ll}\p{Lu}]', '', token)
410
+ if token == token.lower():
411
+ return 'LOWER'
412
+ elif token == token.lower().capitalize():
413
+ return 'CAPITALIZE'
414
+ elif token == token.upper():
415
+ return 'UPPER'
416
+ else:
417
+ return 'OTHER'
418
+
419
+
420
+ def make_tensors(config, input_fn, output_x_fn, output_y_fn):
421
+ # count file lines without loading them
422
+ size = 0
423
+ with open(input_fn) as fp:
424
+ for line in fp:
425
+ size += 1
426
+
427
+ with open(input_fn) as fp:
428
+ X = torch.IntTensor(size)
429
+ Y = torch.ByteTensor(size, 2)
430
+
431
+ offset = 0
432
+ for n, line in enumerate(fp):
433
+ word, case_label, punc_label = line.strip().split('\t')
434
+ id = config.tokenizer.convert_tokens_to_ids(word)
435
+ if config.debug:
436
+ assert word.lower() == config.tokenizer.convert_ids_to_tokens(id)
437
+ X[offset] = id
438
+ Y[offset, 0] = punctuation[punc_label]
439
+ Y[offset, 1] = case[case_label]
440
+ offset += 1
441
+
442
+ torch.save(X, output_x_fn)
443
+ torch.save(Y, output_y_fn)
444
+
445
+
446
+ mapped_punctuation = {
447
+ '.': 'PERIOD',
448
+ '...': 'PERIOD',
449
+ ',': 'COMMA',
450
+ ';': 'COMMA',
451
+ ':': 'COMMA',
452
+ '(': 'COMMA',
453
+ ')': 'COMMA',
454
+ '?': 'QUESTION',
455
+ '!': 'EXCLAMATION',
456
+ ',': 'COMMA',
457
+ '!': 'EXCLAMATION',
458
+ '?': 'QUESTION',
459
+ ';': 'COMMA',
460
+ ':': 'COMMA',
461
+ '(': 'COMMA',
462
+ '(': 'COMMA',
463
+ ')': 'COMMA',
464
+ '[': 'COMMA',
465
+ ']': 'COMMA',
466
+ '【': 'COMMA',
467
+ '】': 'COMMA',
468
+ '└': 'COMMA',
469
+ '└ ': 'COMMA',
470
+ '_': 'O',
471
+ '。': 'PERIOD',
472
+ '、': 'COMMA', # enumeration comma
473
+ '、': 'COMMA',
474
+ '…': 'PERIOD',
475
+ '—': 'COMMA',
476
+ '「': 'COMMA',
477
+ '」': 'COMMA',
478
+ '.': 'PERIOD',
479
+ '《': 'O',
480
+ '》': 'O',
481
+ ',': 'COMMA',
482
+ '“': 'O',
483
+ '”': 'O',
484
+ '"': 'O',
485
+ '-': 'O',
486
+ '-': 'O',
487
+ '〉': 'COMMA',
488
+ '〈': 'COMMA',
489
+ '↑': 'O',
490
+ '〔': 'COMMA',
491
+ '〕': 'COMMA',
492
+ }
493
+
494
+ def preprocess_text(config, max_token_count=-1):
495
+ global num_tokens_output
496
+ max_token_count = int(max_token_count)
497
+ num_tokens_output = 0
498
+ def process_segment(text, punctuation):
499
+ global num_tokens_output
500
+ text = text.replace('\t', ' ')
501
+ tokens = config.tokenizer.tokenize(text)
502
+ for i, token in enumerate(tokens):
503
+ case_label = label_for_case(token)
504
+ if i == len(tokens) - 1:
505
+ print(token.lower(), case_label, punctuation, sep='\t')
506
+ else:
507
+ print(token.lower(), case_label, 'O', sep='\t')
508
+ num_tokens_output += 1
509
+ # a bit too ugly, but the alternative is to throw an exception
510
+ if max_token_count > 0 and num_tokens_output >= max_token_count:
511
+ sys.exit(0)
512
+
513
+ for line in sys.stdin:
514
+ line = line.strip()
515
+ if line != '':
516
+ line = unicodedata.normalize("NFC", line)
517
+ if config.debug:
518
+ print(line)
519
+ start = 0
520
+ for i, char in enumerate(line):
521
+ if char in mapped_punctuation:
522
+ if i > start and line[start: i].strip() != '':
523
+ process_segment(line[start: i], mapped_punctuation[char])
524
+ start = i + 1
525
+ if start < len(line):
526
+ process_segment(line[start:], 'PERIOD')
527
+
528
+
529
+ def preprocess_text_old_fr(config):
530
+ assert config.lang == 'fr'
531
+ splitsents = MosesSentenceSplitter(lang)
532
+ tokenize = MosesTokenizer(lang, extra=['-no-escape'])
533
+ normalize = MosesPunctuationNormalizer(lang)
534
+
535
+ for line in sys.stdin:
536
+ if line.strip() != '':
537
+ for sentence in splitsents([normalize(line)]):
538
+ tokens = tokenize(sentence)
539
+ previous_token = None
540
+ for token in tokens:
541
+ if token in mapped_punctuation:
542
+ if previous_token != None:
543
+ print(previous_token, mapped_punctuation[token], sep='\t')
544
+ previous_token = None
545
+ elif not re.search('[\p{Han}\p{Ll}\p{Lu}\d]', token): # remove non-alphanumeric tokens
546
+ continue
547
+ else:
548
+ if previous_token != None:
549
+ print(previous_token, 'O', sep='\t')
550
+ previous_token = token
551
+ if previous_token != None:
552
+ print(previous_token, 'PERIOD', sep='\t')
553
+
554
+
555
+ # modification of the wordpiece tokenizer to keep case information even if vocab is lower cased
556
+ # forked from https://github.com/huggingface/transformers/blob/master/src/transformers/models/bert/tokenization_bert.py
557
+
558
+ class WordpieceTokenizer(object):
559
+ """Runs WordPiece tokenization."""
560
+
561
+ def __init__(self, vocab, unk_token, max_input_chars_per_word=100, keep_case=True):
562
+ self.vocab = vocab
563
+ self.unk_token = unk_token
564
+ self.max_input_chars_per_word = max_input_chars_per_word
565
+ self.keep_case = keep_case
566
+
567
+ def tokenize(self, text):
568
+ """
569
+ Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
570
+ tokenization using the given vocabulary.
571
+ For example, :obj:`input = "unaffable"` will return as output :obj:`["un", "##aff", "##able"]`.
572
+ Args:
573
+ text: A single token or whitespace separated tokens. This should have
574
+ already been passed through `BasicTokenizer`.
575
+ Returns:
576
+ A list of wordpiece tokens.
577
+ """
578
+
579
+ output_tokens = []
580
+ for token in text.strip().split():
581
+ chars = list(token)
582
+ if len(chars) > self.max_input_chars_per_word:
583
+ output_tokens.append(self.unk_token)
584
+ continue
585
+
586
+ is_bad = False
587
+ start = 0
588
+ sub_tokens = []
589
+ while start < len(chars):
590
+ end = len(chars)
591
+ cur_substr = None
592
+ while start < end:
593
+ substr = "".join(chars[start:end])
594
+ if start > 0:
595
+ substr = "##" + substr
596
+ # optionally lowercase substring before checking for inclusion in vocab
597
+ if (self.keep_case and substr.lower() in self.vocab) or (substr in self.vocab):
598
+ cur_substr = substr
599
+ break
600
+ end -= 1
601
+ if cur_substr is None:
602
+ is_bad = True
603
+ break
604
+ sub_tokens.append(cur_substr)
605
+ start = end
606
+
607
+ if is_bad:
608
+ output_tokens.append(self.unk_token)
609
+ else:
610
+ output_tokens.extend(sub_tokens)
611
+ return output_tokens
612
+
613
+
614
+ # modification of XLM bpe tokenizer for keeping case information when vocab is lowercase
615
+ # forked from https://github.com/huggingface/transformers/blob/cd56f3fe7eae4a53a9880e3f5e8f91877a78271c/src/transformers/models/xlm/tokenization_xlm.py
616
+ def bpe(self, token):
617
+ def to_lower(pair):
618
+ #print(' ',pair)
619
+ return (pair[0].lower(), pair[1].lower())
620
+
621
+ from transformers.models.xlm.tokenization_xlm import get_pairs
622
+
623
+ word = tuple(token[:-1]) + (token[-1] + "</w>",)
624
+ if token in self.cache:
625
+ return self.cache[token]
626
+ pairs = get_pairs(word)
627
+
628
+ if not pairs:
629
+ return token + "</w>"
630
+
631
+ while True:
632
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(to_lower(pair), float("inf")))
633
+ #print(bigram)
634
+ if to_lower(bigram) not in self.bpe_ranks:
635
+ break
636
+ first, second = bigram
637
+ new_word = []
638
+ i = 0
639
+ while i < len(word):
640
+ try:
641
+ j = word.index(first, i)
642
+ except ValueError:
643
+ new_word.extend(word[i:])
644
+ break
645
+ else:
646
+ new_word.extend(word[i:j])
647
+ i = j
648
+
649
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
650
+ new_word.append(first + second)
651
+ i += 2
652
+ else:
653
+ new_word.append(word[i])
654
+ i += 1
655
+ new_word = tuple(new_word)
656
+ word = new_word
657
+ if len(word) == 1:
658
+ break
659
+ else:
660
+ pairs = get_pairs(word)
661
+ word = " ".join(word)
662
+ if word == "\n </w>":
663
+ word = "\n</w>"
664
+ self.cache[token] = word
665
+ return word
666
+
667
+
668
+
669
+ def init(config):
670
+ init_random(config.seed)
671
+
672
+ if config.lang == 'fr':
673
+ config.tokenizer = tokenizer = AutoTokenizer.from_pretrained(config.flavor, do_lower_case=False)
674
+
675
+ from transformers.models.xlm.tokenization_xlm import XLMTokenizer
676
+ assert isinstance(tokenizer, XLMTokenizer)
677
+
678
+ # monkey patch XLM tokenizer
679
+ import types
680
+ tokenizer.bpe = types.MethodType(bpe, tokenizer)
681
+ else:
682
+ # warning: needs to be BertTokenizer for monkey patching to work
683
+ config.tokenizer = tokenizer = BertTokenizer.from_pretrained(config.flavor, do_lower_case=False)
684
+
685
+ # warning: monkey patch tokenizer to keep case information
686
+ #from recasing_tokenizer import WordpieceTokenizer
687
+ config.tokenizer.wordpiece_tokenizer = WordpieceTokenizer(vocab=tokenizer.vocab, unk_token=tokenizer.unk_token)
688
+
689
+ if config.lang == 'fr':
690
+ config.pad_token_id = tokenizer.pad_token_id
691
+ config.cls_token_id = tokenizer.bos_token_id
692
+ config.cls_token = tokenizer.bos_token
693
+ config.sep_token_id = tokenizer.sep_token_id
694
+ config.sep_token = tokenizer.sep_token
695
+ else:
696
+ config.pad_token_id = tokenizer.pad_token_id
697
+ config.cls_token_id = tokenizer.cls_token_id
698
+ config.cls_token = tokenizer.cls_token
699
+ config.sep_token_id = tokenizer.sep_token_id
700
+ config.sep_token = tokenizer.sep_token
701
+
702
+ if not torch.cuda.is_available() and config.device == 'cuda':
703
+ print('WARNING: reverting to cpu as cuda is not available', file=sys.stderr)
704
+ config.device = torch.device(config.device if torch.cuda.is_available() else 'cpu')
705
+
706
+
707
+ def main(config, action, args):
708
+ init(config)
709
+
710
+ if action == 'train':
711
+ train(config, *args)
712
+ elif action == 'eval':
713
+ run_eval(config, *args)
714
+ elif action == 'predict':
715
+ generate_predictions(config, *args)
716
+ elif action == 'tensorize':
717
+ make_tensors(config, *args)
718
+ elif action == 'preprocess':
719
+ preprocess_text(config, *args)
720
+ else:
721
+ print('invalid action "%s"' % action)
722
+ sys.exit(1)
723
+
724
+ if __name__ == '__main__':
725
+ parser = argparse.ArgumentParser()
726
+ parser.add_argument("action", help="train|eval|predict|tensorize|preprocess", type=str)
727
+ parser.add_argument("action_args", help="arguments for selected action", type=str, nargs='*')
728
+ parser.add_argument("--seed", help="random seed", default=default_config.seed, type=int)
729
+ parser.add_argument("--lang", help="language (fr, en, zh, tr, pt, de, ru)", default=default_config.lang, type=str)
730
+ parser.add_argument("--flavor", help="bert flavor in transformers model zoo", default=default_config.flavor, type=str)
731
+ parser.add_argument("--max-length", help="maximum input length", default=default_config.max_length, type=int)
732
+ parser.add_argument("--batch-size", help="size of batches", default=default_config.batch_size, type=int)
733
+ parser.add_argument("--device", help="computation device (cuda, cpu)", default=default_config.device, type=str)
734
+ parser.add_argument("--debug", help="whether to output more debug info", default=default_config.debug, type=bool)
735
+ parser.add_argument("--updates", help="number of training updates to perform", default=default_config.updates, type=int)
736
+ parser.add_argument("--period", help="validation period in updates", default=default_config.period, type=int)
737
+ parser.add_argument("--lr", help="learning rate", default=default_config.lr, type=float)
738
+ parser.add_argument("--dab-rate", help="drop at boundaries rate", default=default_config.dab_rate, type=float)
739
+ config = Config(**parser.parse_args().__dict__)
740
+
741
+ main(config, config.action, config.action_args)
742
+
743
+
punctuation/vosk-recasepunc-ru-0.22/ru-test.txt ADDED
@@ -0,0 +1,17 @@
1
+ все смешалось в доме облонских жена узнала что муж был в связи с бывшею
2
+ в их доме француженкою-гувернанткой и объявила мужу что не может жить с
3
+ ним в одном доме положение это продолжалось уже третий день и мучительно
4
+ чувствовалось и самими супругами и всеми членами семьи и домочадцами
5
+ все члены семьи и домочадцы чувствовали что нет смысла в их сожительстве
6
+ и что на каждом постоялом дворе случайно сошедшиеся люди более связаны
7
+ между собой чем они члены семьи и домочадцы облонских жена не выходила
8
+ из своих комнат мужа третий день не было дома дети бегали по всему
9
+ дому как потерянные англичанка поссорилась с экономкой и написала
10
+ записку приятельнице прося приискать ей новое место повар ушел еще
11
+ вчера со двора во время обеда черная кухарка и кучер просили расчета
12
+ На третий день после ссоры князь степан аркадьич облонский стива как
13
+ его звали в свете в обычный час то есть в восемь часов утра
14
+ проснулся не в спальне жены а в своем кабинете на сафьянном диване
15
+ он повернул свое полное выхоленное тело на пружинах дивана как бы желая
16
+ опять заснуть надолго с другой стороны крепко обнял подушку и прижался к
17
+ ней щекой но вдруг вскочил сел на диван и открыл глаза
punctuation/vosk-recasepunc-ru-0.22/ru-test.txt.orig ADDED
@@ -0,0 +1,17 @@
1
+ Все смешалось в доме Облонских. Жена узнала, что муж был в связи с бывшею
2
+ в их доме француженкою-гувернанткой, и объявила мужу, что не может жить с
3
+ ним в одном доме. Положение это продолжалось уже третий день и мучительно
4
+ чувствовалось и самими супругами, и всеми членами семьи, и домочадцами.
5
+ Все члены семьи и домочадцы чувствовали, что нет смысла в их сожительстве
6
+ и что на каждом постоялом дворе случайно сошедшиеся люди более связаны
7
+ между собой, чем они, члены семьи и домочадцы Облонских. Жена не выходила
8
+ из своих комнат, мужа третий день не было дома. Дети бегали по всему
9
+ дому, как потерянные; англичанка поссорилась с экономкой и написала
10
+ записку приятельнице, прося приискать ей новое место; повар ушел еще
11
+ вчера со двора, во время обеда; черная кухарка и кучер просили расчета.
12
+ На третий день после ссоры князь Степан Аркадьич Облонский -- Стива, как
13
+ его звали в свете, -- в обычный час, то есть в восемь часов утра,
14
+ проснулся не в спальне жены, а в своем кабинете, на сафьянном диване...
15
+ Он повернул свое полное, выхоленное тело на пружинах дивана, как бы желая
16
+ опять заснуть надолго, с другой стороны крепко обнял подушку и прижался к
17
+ ней щекой; но вдруг вскочил, сел на диван и открыл глаза.
speaker_indentification/vosk-model-spk-0.4.7z ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4c7ccb7760ffc2ccb780beac3ba40907728c075fbb1cbb66b4dacc0afda4598
3
+ size 13785421
speaker_indentification/vosk-model-spk-0.4/README.txt ADDED
@@ -0,0 +1,119 @@
1
+
2
+ UPLOADER David Snyder
3
+ DATE 2018-05-30
4
+ KALDI VERSION 108832d
5
+
6
+ This directory contains files generated from the recipe in
7
+ egs/callhome_diarization/v2/. Its contents should be placed in a similar
8
+ directory, with symbolic links to diarization/, sid/, steps/, etc. This was
9
+ created when Kaldi's master branch was at git log
10
+ 2ad8d7821867a199e435aa36bbd13af6ed937c94.
11
+
12
+
13
+ I. Files list
14
+ ------------------------------------------------------------------------------
15
+
16
+ ./
17
+ README.txt This file
18
+ run.sh A copy of the egs/callhome_diarization/v2/run.sh
19
+ at the time of uploading this file. Use this to
20
+ figure out how to compute features, extract
21
+ embeddings, etc.
22
+
23
+ local/nnet3/xvector/tuning/
24
+ run_xvector_1a.sh This is the default recipe, at the time of
25
+ uploading this resource. The script generates
26
+ the configs, egs, and trains the model.
27
+
28
+ conf/
29
+ vad.conf The energy-based VAD configuration
30
+ mfcc.conf MFCC configuration
31
+
32
+ exp/xvector_nnet_1a/
33
+ final.raw The pretrained DNN model
34
+ nnet.config The nnet3 config file that was used when the
35
+ DNN model was first instantiated.
36
+ extract.config Another nnet3 config file that modifies the DNN
37
+ final.raw to extract x-vectors. It should be
38
+ automatically handled by the script
39
+ extract_xvectors.sh.
40
+ min_chunk_size Min chunk size used (see extract_xvectors.sh)
41
+ max_chunk_size Max chunk size used (see extract_xvectors.sh)
42
+ srand The RNG seed used when creating the DNN
43
+
44
+ exp/xvectors_callhome1/
45
+ mean.vec Vector for centering, from callhome1
46
+ transform.mat Whitening matrix, trained on callhome1
47
+ plda PLDA model for callhome1, trained on SRE data
48
+
49
+ exp/xvectors_callhome2/
50
+ mean.vec Vector for centering, from callhome2
51
+ transform.mat Whitening matrix, trained on callhome2
52
+ plda PLDA model for callhome1, trained on SRE data
53
+
54
+
55
+ II. Citation
56
+ ------------------------------------------------------------------------------
57
+
58
+ If you wish to use this architecture in a publication, please cite one of the
59
+ following papers.
60
+
61
+ The x-vector architecture:
62
+
63
+ @inproceedings{snyder2018xvector,
64
+ title={X-vectors: Robust DNN Embeddings for Speaker Recognition},
65
+ author={Snyder, D. and Garcia-Romero, D. and Sell, G. and Povey, D. and Khudanpur, S.},
66
+ booktitle={2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
67
+ year={2018},
68
+ organization={IEEE},
69
+ url={http://www.danielpovey.com/files/2018_icassp_xvectors.pdf}
70
+ }
71
+
72
+ Diarization with x-vectors:
73
+
74
+ @article{sell2018dihard,
75
+ title={Diarization is Hard: Some Experiences and Lessons Learned for the JHU Team in the Inaugural DIHARD Challenge},
76
+ author={Sell, G. and Snyder, D. and McCree, A. and Garcia-Romero, D. and Villalba, J. and Maciejewski, M. and Manohar, V. and Dehak, N. and Povey, D. and Watanabe, S. and Khudanpur, S.},
77
+ journal={Interspeech},
78
+ year={2018}
79
+ }
80
+
81
+
82
+ III. Recipe README.txt
83
+ ------------------------------------------------------------------------------
84
+ The following text is the README.txt from egs/callhome_diarization/v2 at the
85
+ time this archive was created.
86
+
87
+ This recipe replaces i-vectors used in the v1 recipe with embeddings extracted
88
+ from a deep neural network. In the scripts, we refer to these embeddings as
89
+ "x-vectors." The x-vector recipe in
90
+ local/nnet3/xvector/tuning/run_xvector_1a.sh is closely based on the
91
+ x-vector paper cited in section II above.
92
+
93
+ However, in this example, the x-vectors are used for diarization, rather
94
+ than speaker recognition. Diarization is performed by splitting speech
95
+ segments into very short segments (e.g., 1.5 seconds), extracting embeddings
96
+ from the segments, and clustering them to obtain speaker labels.
97
+
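As a rough illustration of the clustering step just described (the recipe itself does this with Kaldi binaries and PLDA scoring, not with the code below), short-segment embeddings can be grouped with cosine-distance agglomerative clustering; the scipy calls, the 0.5 threshold, and the random placeholder embeddings are stand-ins only.

```python
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage

def cluster_segments(xvectors, threshold=0.5):
    # xvectors: (num_segments, dim) array, one embedding per ~1.5 second sub-segment
    z = linkage(xvectors, method='average', metric='cosine')
    # assign an integer speaker label to every segment
    return fcluster(z, t=threshold, criterion='distance')

labels = cluster_segments(np.random.randn(20, 128))  # placeholder x-vectors
print(labels)
```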
98
+ The recipe uses the following data for system development. This is in
99
+ addition to the NIST SRE 2000 dataset (Callhome) which is used for
100
+ evaluation (see ../README.txt).
101
+
102
+ Corpus LDC Catalog No.
103
+ SRE2004 LDC2006S44
104
+ SRE2005 Train LDC2011S01
105
+ SRE2005 Test LDC2011S04
106
+ SRE2006 Train LDC2011S09
107
+ SRE2006 Test 1 LDC2011S10
108
+ SRE2006 Test 2 LDC2012S01
109
+ SRE2008 Train LDC2011S05
110
+ SRE2008 Test LDC2011S08
111
+ SWBD2 Phase 2 LDC99S79
112
+ SWBD2 Phase 3 LDC2002S06
113
+ SWBD Cellular 1 LDC2001S13
114
+ SWBD Cellular 2 LDC2004S07
115
+
116
+ The following datasets are used in data augmentation.
117
+
118
+ MUSAN http://www.openslr.org/17
119
+ RIR_NOISES http://www.openslr.org/28
speaker_indentification/vosk-model-spk-0.4/final.ext.raw ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d30354474ac719b3cfce58186463302b988cf22ce3afee7237a2da862c4a91d1
3
+ size 14929171
speaker_indentification/vosk-model-spk-0.4/mean.vec ADDED
@@ -0,0 +1 @@
1
+ [ 4.450152 4.672029 4.148891 -1.711527 3.509846 2.931994 2.850384 3.178227 0.2563171 -0.9261234 3.37196 0.1472566 5.635284 -0.01870821 1.972103 -0.9502754 4.401544 2.795261 2.67637 3.917823 0.6549923 -0.02103148 4.064806 4.100016 3.700118 1.252804 5.399523 4.084152 4.106742 3.5622 4.165306 -0.2494654 -0.9603948 4.272289 -2.332889 -0.7292819 3.646834 0.3090337 4.624666 5.089351 -5.635771 1.634198 1.089098 4.363739 3.618721 0.2134228 -0.3965465 5.353687 4.034757 4.032773 3.749556 3.166129 3.868708 4.381798 -0.02561651 0.3426051 4.402168 0.1237091 0.8197291 3.809948 -2.995811 -1.648535 3.202967 3.239381 3.250949 -0.9064079 4.452719 0.2775586 0.80832 3.036884 5.163679 0.4273587 3.537773 2.539269 3.151272 4.064805 3.56104 4.244997 3.660802 4.949434 4.013721 1.418729 1.845101 4.74059 3.280786 -1.731479 1.492544 -2.88268 5.013491 5.327713 -2.668042 1.02902 -0.9622369 3.954224 3.2533 3.348548 2.906777 -0.3059559 4.595854 0.3410174 2.116138 4.830284 3.402886 3.014466 4.481457 5.14358 2.05649 3.883894 -0.9075359 4.574888 4.064843 -1.416883 3.493051 -0.06792944 4.978102 4.930044 4.138368 2.826191 4.031521 2.575887 0.7125556 4.15551 2.601444 1.190357 -1.060124 0.9739355 4.671662 -1.613742 ]
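mean.vec is a Kaldi text-format vector ("[ v1 v2 ... ]"). Below is a minimal sketch, assuming the file is used outside Kaldi, of loading it and centering an embedding before scoring; applying transform.mat and PLDA is omitted here.

```python
import numpy as np

def load_kaldi_vector(path):
    # parse a Kaldi text vector of the form "[ v1 v2 ... ]"
    with open(path) as f:
        body = f.read().strip().lstrip('[').rstrip(']')
    return np.array([float(v) for v in body.split()])

mean = load_kaldi_vector('mean.vec')
xvector = np.zeros_like(mean)   # placeholder embedding from the extractor
centered = xvector - mean       # the centering step described in the README above
print(mean.shape)
```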
speaker_indentification/vosk-model-spk-0.4/mfcc.conf ADDED
@@ -0,0 +1,5 @@
1
+ --sample-frequency=8000
2
+ --high-freq=3700
3
+ --low-freq=20
4
+ --num-ceps=23
5
+ --allow-downsample=true
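A small helper sketch (not part of the model files) that reads mfcc.conf into a dict, for example to check that application audio matches the expected 8 kHz, 23-cepstra front end before extracting embeddings.

```python
def read_kaldi_conf(path):
    # each option line has the form "--key=value"
    opts = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line.startswith('--'):
                key, _, value = line[2:].partition('=')
                opts[key] = value
    return opts

conf = read_kaldi_conf('mfcc.conf')
assert conf['sample-frequency'] == '8000'  # telephone-band audio is expected
print(conf)
```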
speaker_indentification/vosk-model-spk-0.4/transform.mat ADDED
Binary file (65.6 kB).
 
tts/vosk-model-tts-ru-0.9-multi.7z ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e0f1ce5406abbabf29b823a11a2d937e70a1abeeb5d96c5fb518edc0cd4b949
3
+ size 761220882
tts/vosk-model-tts-ru-0.9-multi/README.md ADDED
@@ -0,0 +1,22 @@
+ Russian Vosk TTS model
+
+ Version 0.9
+
+ Metrics:
+
+ * CER 0.6
+ * FAD 0.810
+ * UTMOS 3.290
+ * Speaker Similarity 0.875
+ * xRT CPU 0.35
+ * xRT GPU 0.06
+
+ License: Apache 2.0
+
+ Changelog:
+
+ * ASR alignment
+ * No encoder, just a duration predictor
+ * Slightly thinner predictor width (160) to fit the DiT hidden vector
+ * Scale on the diffusion loss (so it does not dominate the duration loss)
+
tts/vosk-model-tts-ru-0.9-multi/bert/README.md ADDED
@@ -0,0 +1,39 @@
+ ---
+ language:
+ - ru
+ tags:
+ - PyTorch
+ - Transformers
+ - bert
+ - exbert
+ pipeline_tag: fill-mask
+ thumbnail: "https://github.com/sberbank-ai/model-zoo"
+ license: apache-2.0
+ ---
+
+ # ruBert-base
+ The model architecture design, pretraining, and evaluation are documented in our preprint: [**A Family of Pretrained Transformer Language Models for Russian**](https://arxiv.org/abs/2309.10931).
+
+ The model was pretrained by the [SberDevices](https://sberdevices.ru/) team.
+ * Task: `mask filling`
+ * Type: `encoder`
+ * Tokenizer: `BPE`
+ * Dict size: `120 138`
+ * Num Parameters: `178 M`
+ * Training Data Volume: `30 GB`
+
+ # Authors
+ + NLP core team RnD [Telegram channel](https://t.me/nlpcoreteam):
+ + Dmitry Zmitrovich
+
+ # Cite us
+ ```
+ @misc{zmitrovich2023family,
+   title={A Family of Pretrained Transformer Language Models for Russian},
+   author={Dmitry Zmitrovich and Alexander Abramov and Andrey Kalmykov and Maria Tikhonova and Ekaterina Taktasheva and Danil Astafurov and Mark Baushenko and Artem Snegirev and Tatiana Shavrina and Sergey Markov and Vladislav Mikhailov and Alena Fenogenova},
+   year={2023},
+   eprint={2309.10931},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ ```
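For orientation only: the checkpoint this card describes is normally queried through the transformers fill-mask pipeline, as sketched below. The `ai-forever/ruBert-base` model id is an assumption about the public upload; the copy bundled in this archive is an ONNX export (bert/model.onnx) loaded internally by the TTS front end, so this is not how Vosk itself uses it.

```python
# Illustrative fill-mask query against the public ruBert-base checkpoint (assumed model id).
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="ai-forever/ruBert-base")
for cand in fill_mask("Столица России — [MASK]."):
    print(cand["token_str"], round(cand["score"], 3))
```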
tts/vosk-model-tts-ru-0.9-multi/bert/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e2f1740eaae5e29c2b4844625cbb01ff644b2b5fb0560bd34374c35d8a092c1
+ size 654361598
tts/vosk-model-tts-ru-0.9-multi/bert/vocab.txt ADDED
The diff for this file is too large to render.
tts/vosk-model-tts-ru-0.9-multi/config.json ADDED
@@ -0,0 +1,85 @@
+ {
+   "audio": {
+     "sample_rate": 22050
+   },
+   "inference": {
+     "noise_level": 0.8,
+     "speech_rate": 1,
+     "duration_noise_level": 0.8
+   },
+   "phoneme_id_map": {
+     "_": 0,
+     "^": 1,
+     "$": 2,
+     " ": 3,
+     "!": 4,
+     "'": 5,
+     "(": 6,
+     ")": 7,
+     ",": 8,
+     "-": 9,
+     ".": 10,
+     "...": 11,
+     ":": 12,
+     ";": 13,
+     "?": 14,
+     "a0": 15,
+     "a1": 16,
+     "b": 17,
+     "bj": 18,
+     "c": 19,
+     "ch": 20,
+     "d": 21,
+     "dj": 22,
+     "e0": 23,
+     "e1": 24,
+     "f": 25,
+     "fj": 26,
+     "g": 27,
+     "gj": 28,
+     "h": 29,
+     "hj": 30,
+     "i0": 31,
+     "i1": 32,
+     "j": 33,
+     "k": 34,
+     "kj": 35,
+     "l": 36,
+     "lj": 37,
+     "m": 38,
+     "mj": 39,
+     "n": 40,
+     "nj": 41,
+     "o0": 42,
+     "o1": 43,
+     "p": 44,
+     "pj": 45,
+     "r": 46,
+     "rj": 47,
+     "s": 48,
+     "sch": 49,
+     "sh": 50,
+     "sj": 51,
+     "t": 52,
+     "tj": 53,
+     "u0": 54,
+     "u1": 55,
+     "v": 56,
+     "vj": 57,
+     "y0": 58,
+     "y1": 59,
+     "z": 60,
+     "zh": 61,
+     "zj": 62
+   },
+   "num_symbols": 62,
+   "num_speakers": 5,
+   "speaker_id_map": {
+     "female_0": 0,
+     "female_1": 1,
+     "female_2": 2,
+     "male_0": 3,
+     "male_1": 4
+   },
+   "model_type": "multistream_v1"
+ }
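The phoneme_id_map and speaker_id_map above define how input is encoded for the ONNX model. Below is a minimal sketch of that encoding step; the phonemization, the BERT features, and the ONNX input names are assumptions for illustration, not the documented Vosk TTS interface (in practice the vosk-tts package drives model.onnx itself, using the bundled dictionary for stress-marked phonemes).

```python
# Sketch: map a hand-phonemized utterance and a speaker to ids using config.json.
# The commented-out onnxruntime call uses assumed input names; it is not the real API.
import json
import numpy as np
# import onnxruntime

cfg = json.load(open("tts/vosk-model-tts-ru-0.9-multi/config.json", encoding="utf-8"))
phoneme_map, speaker_map = cfg["phoneme_id_map"], cfg["speaker_id_map"]

# Hand-phonemized "привет": '^' and '$' bracket the utterance, digits mark vowel stress.
phonemes = ["^", "p", "rj", "i0", "vj", "e1", "t", "$"]
ids = np.array([[phoneme_map[p] for p in phonemes]], dtype=np.int64)
speaker = np.array([speaker_map["female_0"]], dtype=np.int64)

# sess = onnxruntime.InferenceSession("tts/vosk-model-tts-ru-0.9-multi/model.onnx")
# audio = sess.run(None, {"input_ids": ids, "speaker_id": speaker})[0]   # names assumed
```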
tts/vosk-model-tts-ru-0.9-multi/dictionary ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2939e72c170bb41ac8e256828cca1c5fac4db1e36717f9f53fde843b00a220ba
+ size 101431118
tts/vosk-model-tts-ru-0.9-multi/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fa5a36b22a8bf7fe7179a3882c6371d2c01e5317019e717516f892d329c24b9
+ size 179314533