jimregan committed on
Commit
3e35eeb
·
1 Parent(s): b6dccb9

add processing notebook

Browse files
Files changed (1) hide show
  1. waxholm-phoneme-fairseq.ipynb +317 -0
waxholm-phoneme-fairseq.ipynb ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import soundfile as sf\n",
10
+ "import wave"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "def smp_headers(filename: str):\n",
20
+ " with open(filename, \"rb\") as f:\n",
21
+ " f.seek(0)\n",
22
+ " raw_headers = f.read(1024)\n",
23
+ " raw_headers = raw_headers.rstrip(b'\\x00')\n",
24
+ " asc_headers = raw_headers.decode(\"ascii\")\n",
25
+ " asc_headers.rstrip('\\x00')\n",
26
+ " tmp = [a for a in asc_headers.split(\"\\r\\n\")]\n",
27
+ " back = -1\n",
28
+ " while abs(back) > len(tmp) + 1:\n",
29
+ " if tmp[back] == '=':\n",
30
+ " break\n",
31
+ " back -= 1\n",
32
+ " tmp = tmp[0:back-1]\n",
33
+ " return dict(a.split(\"=\") for a in tmp)\n",
34
+ "\n",
35
+ "\n",
36
+ "def smp_read_sf(filename: str):\n",
37
+ " headers = smp_headers(filename)\n",
38
+ " if headers[\"msb\"] == \"last\":\n",
39
+ " ENDIAN = \"LITTLE\"\n",
40
+ " else:\n",
41
+ " ENDIAN = \"BIG\"\n",
42
+ "\n",
43
+ " data, sr = sf.read(filename, channels=int(headers[\"nchans\"]),\n",
44
+ " samplerate=16000, endian=ENDIAN, start=512,\n",
45
+ " dtype=\"int16\", format=\"RAW\", subtype=\"PCM_16\")\n",
46
+ " return (data, sr)\n",
47
+ "\n",
48
+ "\n",
49
+ "def write_wav(filename, arr):\n",
50
+ " with wave.open(filename, \"w\") as f:\n",
51
+ " f.setnchannels(1)\n",
52
+ " f.setsampwidth(2)\n",
53
+ " f.setframerate(16000)\n",
54
+ " f.writeframes(arr)"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 3,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "from pathlib import Path"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 6,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "WAXHOLM = \"/Users/joregan/Playing/waxholm\"\n",
73
+ "OUTPUT = \"/Users/joregan/Playing/waxholm_fairseq\""
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 7,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "SCENES_PATH = Path(WAXHOLM) / \"scenes_formatted\"\n",
83
+ "OUTPUT_PATH = Path(OUTPUT)\n",
84
+ "if not OUTPUT_PATH.is_dir():\n",
85
+ " OUTPUT_PATH.mkdir()"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 12,
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "TRAIN_FILES = []\n",
95
+ "with open(Path(WAXHOLM) / \"alloktrainfiles\") as trainf:\n",
96
+ " for line in trainf.readlines():\n",
97
+ " TRAIN_FILES.append(line.strip())\n",
98
+ "TEST_FILES = []\n",
99
+ "with open(Path(WAXHOLM) / \"testfiles\") as testf:\n",
100
+ " for line in testf.readlines():\n",
101
+ " TEST_FILES.append(line.strip())"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 14,
107
+ "metadata": {},
108
+ "outputs": [
109
+ {
110
+ "name": "stdout",
111
+ "output_type": "stream",
112
+ "text": [
113
+ "1835 327\n"
114
+ ]
115
+ }
116
+ ],
117
+ "source": [
118
+ "print(len(TRAIN_FILES), len(TEST_FILES))"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": 20,
124
+ "metadata": {},
125
+ "outputs": [],
126
+ "source": [
127
+ "import re\n",
128
+ "\n",
129
+ "def get_labels(mixfile):\n",
130
+ " labels = \"\"\n",
131
+ " saw_label = False\n",
132
+ " with open(mixfile) as infile:\n",
133
+ " for line in infile.readlines():\n",
134
+ " if not saw_label:\n",
135
+ " if line.lower().startswith(\"labels:\"):\n",
136
+ " saw_label = True\n",
137
+ " labels = line[7:].strip()\n",
138
+ " else:\n",
139
+ " if line.startswith(\"FR\"):\n",
140
+ " break\n",
141
+ " else:\n",
142
+ " labels = \" \".join([labels, line.strip()])\n",
143
+ " labels = re.sub(\" +\", \" \", labels)\n",
144
+ " return labels"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 21,
150
+ "metadata": {},
151
+ "outputs": [
152
+ {
153
+ "data": {
154
+ "text/plain": [
155
+ "'A:H\\'A: pa p: |h J\\'A:Ggv V\\'ILv pap: sm p:v S\\'E: pa H\\'U:R 2Dd\\'EM Bb\\']:TtE0NG Gg\\']:R 2Tt\\'I STt\"A:VE0#STtR`\\\\M p: \\']: p: \\']M J\\'A: Kk\\'AN F\"O#2S`[TtA Tt\\'I F\"IN#H`AM .'"
156
+ ]
157
+ },
158
+ "execution_count": 21,
159
+ "metadata": {},
160
+ "output_type": "execute_result"
161
+ }
162
+ ],
163
+ "source": [
164
+ "get_labels(\"/Users/joregan/Playing/waxholm/scenes_formatted/fp2043/fp2043.16.03.smp.mix\")"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": 42,
170
+ "metadata": {},
171
+ "outputs": [],
172
+ "source": [
173
+ "def segment_label(label, skip_pause=True):\n",
174
+ " phones = []\n",
175
+ " i = 0\n",
176
+ " while i < len(label):\n",
177
+ " start_i = i\n",
178
+ " end_i = i\n",
179
+ " if label[i:i+2] in [\"NG\", \"E0\", \"kl\", \"sm\"]:\n",
180
+ " phones.append(label[i:i+2])\n",
181
+ " i += 2\n",
182
+ " elif label[i:i+2] == \"p:\":\n",
183
+ " if not skip_pause:\n",
184
+ " phones.append(\"p:\")\n",
185
+ " i += 2\n",
186
+ " elif label[i:i+1] == \"#\":\n",
187
+ " i += 1\n",
188
+ " else:\n",
189
+ " if label[i:i+1] in [\"'\", \"`\", \"\\\"\", \"2\", \"~\"]:\n",
190
+ " i += 1\n",
191
+ " end_i += 1\n",
192
+ " if label[i+1:i+2] in [\":\", \"3\", \"4\"]:\n",
193
+ " end_i += 1\n",
194
+ " phones.append(label[start_i:end_i+1])\n",
195
+ " i = end_i + 1\n",
196
+ " return phones\n"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": 46,
202
+ "metadata": {},
203
+ "outputs": [],
204
+ "source": [
205
+ "assert segment_label(\"Bb\\']:TtE0NG\") == ['B', 'b', \"']:\", 'T', 't', 'E0', 'NG']\n",
206
+ "assert segment_label(\"STt\\\"A:VE0#STtR`\\\\M\") == ['S', 'T', 't', '\"A:', 'V', 'E0', 'S', 'T', 't', 'R', '`\\\\', 'M']\n",
207
+ "assert segment_label(\"p:v\") == ['v']"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 56,
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "def proc_label(label, stress=False):\n",
217
+ " def strip_stress(phone, stress):\n",
218
+ " if stress:\n",
219
+ " return phone\n",
220
+ " if phone[0] in [\"'\", \"`\", \"\\\"\"]:\n",
221
+ " return phone[1:]\n",
222
+ " else:\n",
223
+ " return phone\n",
224
+ " words = []\n",
225
+ " for word in label.split(\" \"):\n",
226
+ " if word in [\"p:pa\", \"pap:\", \"p:pap:\", \"pa\"]:\n",
227
+ " words.append(\"pa\")\n",
228
+ " elif word == \"p:\" or word == \".\":\n",
229
+ " continue\n",
230
+ " elif word == \"|h\":\n",
231
+ " words.append(\"hes\")\n",
232
+ " elif word in [\"sm\", \"ha\", \"kl\"]:\n",
233
+ " words.append(word)\n",
234
+ " else:\n",
235
+ " phones = [strip_stress(p, stress) for p in segment_label(word)]\n",
236
+ " words.append(\" \".join(phones))\n",
237
+ " return(\" | \".join(words)) + \" |\""
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "execution_count": 57,
243
+ "metadata": {},
244
+ "outputs": [
245
+ {
246
+ "name": "stdout",
247
+ "output_type": "stream",
248
+ "text": [
249
+ "A:H'A: pa p: |h J'A:Ggv V'ILv pap: sm p:v S'E: pa H'U:R 2Dd'EM Bb']:TtE0NG Gg']:R 2Tt'I STt\"A:VE0#STtR`\\M p: ']: p: ']M J'A: Kk'AN F\"O#2S`[TtA Tt'I F\"IN#H`AM .\n",
250
+ "A: H A: | pa | hes | J A: G g v | V I L v | pa | sm | v | S E: | pa | H U: R | 2D d E M | B b ]: T t E0 NG | G g ]: R | 2T t I | S T t A: V E0 S T t R \\ M | ]: | ] M | J A: | K k A N | F O 2S [ T t A | T t I | F I N H A M |\n"
251
+ ]
252
+ }
253
+ ],
254
+ "source": [
255
+ "lbl = get_labels(\"/Users/joregan/Playing/waxholm/scenes_formatted/fp2043/fp2043.16.03.smp.mix\")\n",
256
+ "plbl = proc_label(lbl)\n",
257
+ "print(lbl)\n",
258
+ "print(plbl)"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": 59,
264
+ "metadata": {},
265
+ "outputs": [],
266
+ "source": [
267
+ "with open(OUTPUT_PATH / \"train.tsv\", \"w\") as train_tsv,\\\n",
268
+ " open(OUTPUT_PATH / \"train.ltr\", \"w\") as train_ltr,\\\n",
269
+ " open(OUTPUT_PATH / \"test.tsv\", \"w\") as test_tsv,\\\n",
270
+ " open(OUTPUT_PATH / \"test.ltr\", \"w\") as test_ltr:\n",
271
+ " for smpfile in SCENES_PATH.glob(\"fp*/*.smp\"):\n",
272
+ " mixfile = f\"{smpfile}.mix\"\n",
273
+ " if not Path(mixfile).exists():\n",
274
+ " continue\n",
275
+ " stem = smpfile.stem\n",
276
+ " if f\"{stem}.smp\" in TEST_FILES:\n",
277
+ " out_tsv = test_tsv\n",
278
+ " out_ltr = test_ltr\n",
279
+ " else:\n",
280
+ " out_tsv = train_tsv\n",
281
+ " out_ltr = train_ltr\n",
282
+ "\n",
283
+ " outwav = str(OUTPUT_PATH / f\"{stem}.wav\")\n",
284
+ " arr, sr = smp_read_sf(str(smpfile))\n",
285
+ " out_tsv.write(f\"{outwav}\\t{len(arr)}\\n\")\n",
286
+ " write_wav(outwav, arr)\n",
287
+ " label = get_labels(mixfile)\n",
288
+ " ltrline = proc_label(label)\n",
289
+ " out_ltr.write(ltrline + \"\\n\")\n",
290
+ " \n",
291
+ " "
292
+ ]
293
+ }
294
+ ],
295
+ "metadata": {
296
+ "kernelspec": {
297
+ "display_name": "hf",
298
+ "language": "python",
299
+ "name": "python3"
300
+ },
301
+ "language_info": {
302
+ "codemirror_mode": {
303
+ "name": "ipython",
304
+ "version": 3
305
+ },
306
+ "file_extension": ".py",
307
+ "mimetype": "text/x-python",
308
+ "name": "python",
309
+ "nbconvert_exporter": "python",
310
+ "pygments_lexer": "ipython3",
311
+ "version": "3.9.15"
312
+ },
313
+ "orig_nbformat": 4
314
+ },
315
+ "nbformat": 4,
316
+ "nbformat_minor": 2
317
+ }