|
import json
|
|
import re
|
|
import csv
|
|
import shutil
|
|
import os
|
|
import sys
|
|
|
|
# Root directory for generated output: the current working directory at the
# time this module is loaded. save_files() writes the converted dataset under
# <main_path>/dataset/persian_data/.
main_path = os.getcwd()
|
|
|
|
def prepare_data_for_model(path):
    """Read an info CSV and return cleaned per-utterance records.

    The CSV must have a header row containing the columns ``phenome`` (sic --
    the key has to match the header exactly as this code reads it),
    ``seg_id`` and ``speaker_id``.

    For every row the phoneme string is cleaned in two steps:

    1. bracketed numeric tags such as ``[12]`` are removed;
    2. a ``|`` with whitespace on both sides is collapsed to a single space.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A list of ``[phoneme, seg_id, speaker_id]`` lists, one per CSV row,
        in file order. An input with only a header row yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If one of the expected columns is missing.
    """
    # Raw strings keep the regex escapes intact (no invalid-escape warnings);
    # compiling once hoists the pattern lookup out of the row loop.
    bracketed_number = re.compile(r"\[([0-9]+)\]")
    spaced_pipe = re.compile(r"\s+\|\s+")

    data_lines = []
    # ``with`` closes the handle even when a row raises; newline='' lets the
    # csv module do its own newline handling, as its documentation recommends.
    with open(path, 'r', newline='') as f:
        for row in csv.DictReader(f):
            phoneme = row['phenome']
            utterance_name = row['seg_id']
            speaker_id = row['speaker_id']

            phoneme = bracketed_number.sub('', phoneme)
            phoneme = spaced_pipe.sub(' ', phoneme)

            data_lines.append([phoneme, utterance_name, speaker_id])

    return data_lines
|
|
|
|
|
|
def _save_split(records, data_path, split):
    """Copy the wav files of one split and write their transcript files.

    Args:
        records: Iterable of ``[phoneme, utterance_name, speaker_id]`` items.
        data_path: Directory that contains the ``<split>_wav`` folder.
        split: Split name, ``'train'`` or ``'test'``; selects both the source
            ``<split>_wav`` folder and the ``<split>_data`` output folder.

    Returns:
        True when every record was stored; False as soon as locating or
        copying one wav file fails (the error is printed, the rest skipped).
    """
    for line in records:
        try:
            original = os.path.join(
                data_path, '{0}_wav/{1}.wav'.format(split, line[1]))
            # Shared stem of the output wav and txt files for this utterance.
            target_stem = os.path.join(
                main_path,
                'dataset/persian_data/{0}_data/book-1/speaker-{1}/utterance-{2}'
                .format(split, line[2], line[1]))
            target = target_stem + '.wav'
            os.makedirs(os.path.dirname(target), exist_ok=True)
            shutil.copyfile(original, target)
        except Exception as e:
            # Deliberate best-effort boundary: report the problem and signal
            # failure to the caller instead of raising.
            print(e)
            return False

        # The transcript is written next to the copied audio file.
        with open(target_stem + '.txt', 'w') as fp:
            fp.write(line[0])

    return True


def save_files(train_data, test_data, data_path):
    """Materialize the train and test splits under ``main_path``.

    For every record the source audio
    ``<data_path>/<split>_wav/<utterance>.wav`` is copied to
    ``<main_path>/dataset/persian_data/<split>_data/book-1/speaker-<id>/utterance-<utterance>.wav``
    and the phoneme string is written to the matching ``.txt`` file.
    Train records are processed before test records.

    Args:
        train_data: Records for the train split, each
            ``[phoneme, utterance_name, speaker_id]``.
        test_data: Records for the test split, same layout.
        data_path: Directory holding the ``train_wav`` and ``test_wav`` folders.

    Returns:
        True if all records of both splits were stored, False on the first
        failed copy (processing stops at that point).
    """
    for split, records in (('train', train_data), ('test', test_data)):
        if not _save_split(records, data_path, split):
            return False
    return True
|
|
|
|
def main(data_path):
    """Convert the dataset found in ``data_path`` into the model's layout.

    Both ``train_info.csv`` and ``test_info.csv`` must exist directly inside
    ``data_path``. They are parsed with prepare_data_for_model(), the record
    counts are printed, and the records are handed to save_files().

    Args:
        data_path: Directory containing the two info CSV files.

    Returns:
        -1 when either info file is missing; otherwise None.
    """
    info_paths = []
    for csv_name in ('train_info.csv', 'test_info.csv'):
        candidate = os.path.join(data_path, csv_name)
        # Guard clause: bail out on the first missing info file.
        if not os.path.isfile(candidate):
            print('data_path is not correct!')
            return -1
        info_paths.append(candidate)

    train_csv, test_csv = info_paths
    train_rows = prepare_data_for_model(train_csv)
    test_rows = prepare_data_for_model(test_csv)

    print('number of train data: {}'.format(len(train_rows)))
    print('number of test data: {}'.format(len(test_rows)))

    # The success message is printed only when every file was stored.
    if save_files(train_rows, test_rows, data_path):
        print('Data is created.')
|
|
|
|
if __name__ == "__main__":
    # The script takes one positional argument, the data directory passed to
    # main(). Without it, print a usage line instead of failing with a bare
    # IndexError.
    if len(sys.argv) < 2:
        print('usage: {} <data_path>'.format(sys.argv[0]), file=sys.stderr)
        sys.exit(2)
    # Propagate main()'s result as the exit status: None -> 0, -1 -> non-zero.
    sys.exit(main(sys.argv[1]))