asigalov61 committed on
Commit
f688095
·
verified ·
1 Parent(s): 43fce65

Upload Orpheus_Music_Transformer_Training_Dataset_Maker.ipynb

Browse files
training_data/Orpheus_Music_Transformer_Training_Dataset_Maker.ipynb ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "gradient": {
7
+ "editing": false,
8
+ "id": "ac5a4cf0-d9d2-47b5-9633-b53f8d99a4d2",
9
+ "kernelId": ""
10
+ },
11
+ "id": "SiTIpPjArIyr"
12
+ },
13
+ "source": [
14
+ "# Orpheus Music Transformer Training Dataset Maker (ver. 1.0)\n",
15
+ "\n",
16
+ "***\n",
17
+ "\n",
18
+ "Powered by tegridy-tools: https://github.com/asigalov61/tegridy-tools\n",
19
+ "\n",
20
+ "***\n",
21
+ "\n",
22
+ "#### Project Los Angeles\n",
23
+ "\n",
24
+ "#### Tegridy Code 2025\n",
25
+ "\n",
26
+ "***"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "markdown",
31
+ "metadata": {
32
+ "gradient": {
33
+ "editing": false,
34
+ "id": "fa0a611c-1803-42ae-bdf6-a49b5a4e781b",
35
+ "kernelId": ""
36
+ },
37
+ "id": "gOd93yV0sGd2"
38
+ },
39
+ "source": [
40
+ "# (SETUP ENVIRONMENT)"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": null,
46
+ "metadata": {
47
+ "cellView": "form",
48
+ "gradient": {
49
+ "editing": false,
50
+ "id": "a1a45a91-d909-4fd4-b67a-5e16b971d179",
51
+ "kernelId": ""
52
+ },
53
+ "id": "fX12Yquyuihc",
54
+ "scrolled": true
55
+ },
56
+ "outputs": [],
57
+ "source": [
58
+ "#@title Install all dependencies (run only once per session)\n",
59
+ "\n",
60
+ "!git clone https://github.com/asigalov61/tegridy-tools\n",
61
+ "!pip install tqdm"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": null,
67
+ "metadata": {
68
+ "cellView": "form",
69
+ "gradient": {
70
+ "editing": false,
71
+ "id": "b8207b76-9514-4c07-95db-95a4742e52c5",
72
+ "kernelId": ""
73
+ },
74
+ "id": "z7n9vnKmug1J",
75
+ "scrolled": true
76
+ },
77
+ "outputs": [],
78
+ "source": [
79
+ "#@title Import all needed modules\n",
80
+ "\n",
81
+ "print('Loading needed modules. Please wait...')\n",
82
+ "import os\n",
83
+ "import copy\n",
84
+ "import math\n",
85
+ "import statistics\n",
86
+ "import random\n",
87
+ "import pickle\n",
88
+ "\n",
89
+ "from collections import Counter\n",
90
+ "\n",
91
+ "from tqdm import tqdm\n",
92
+ "\n",
93
+ "print('Loading TMIDIX module...')\n",
94
+ "%cd tegridy-tools/tegridy-tools\n",
95
+ "\n",
96
+ "import TMIDIX\n",
97
+ "\n",
98
+ "%cd /home/ubuntu/\n",
99
+ "\n",
100
+ "print('Done!')\n",
101
+ "print('Enjoy! :)')"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "markdown",
106
+ "metadata": {},
107
+ "source": [
108
+ "# (Download and untar full Godzilla MIDI Dataset)"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "markdown",
113
+ "metadata": {},
114
+ "source": [
115
+ "## https://huggingface.co/datasets/projectlosangeles/Godzilla-MIDI-Dataset"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "markdown",
120
+ "metadata": {
121
+ "id": "JwrqQeie08t0"
122
+ },
123
+ "source": [
124
+ "# (FILE LIST)"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": null,
130
+ "metadata": {
131
+ "cellView": "form",
132
+ "id": "DuVWtdDNcqKh",
133
+ "scrolled": true
134
+ },
135
+ "outputs": [],
136
+ "source": [
# Build the list of input files from both dataset folders
# (presumably a recursive scan performed by TMIDIX.create_files_list -- verify in TMIDIX)
filez = TMIDIX.create_files_list(
    [
        './Godzilla-MIDI-Dataset/MIDIs/',
        './Godzilla-Piano-MIDI-Dataset/',
    ]
)

# Persist the list with the TMIDIX pickle writer; the "Load file list" cell reads it back
TMIDIX.Tegridy_Any_Pickle_File_Writer(
    filez,
    '/home/ubuntu/filez'
)
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": null,
145
+ "metadata": {
146
+ "cellView": "form",
147
+ "id": "qI_adhjojrJ9",
148
+ "scrolled": true
149
+ },
150
+ "outputs": [],
151
+ "source": [
152
+ "#@title Load file list\n",
153
+ "filez = TMIDIX.Tegridy_Any_Pickle_File_Reader('/home/ubuntu/filez')"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": null,
159
+ "metadata": {
160
+ "scrolled": true
161
+ },
162
+ "outputs": [],
163
+ "source": [
164
+ "len(filez)"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "markdown",
169
+ "metadata": {
170
+ "id": "FLxHvO-wlwfU"
171
+ },
172
+ "source": [
173
+ "# (PROCESS)"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": null,
179
+ "metadata": {},
180
+ "outputs": [],
181
+ "source": [
def _encode_chordified_score(dcscore):
    """Flatten a chordified delta score into a list of Orpheus integer tokens.

    Token layout produced here:
        0..255        delta start-time of a chord
        256..16767    (128 * patch) + pitch + 256
        16768..18815  (8 * duration) + velocity_bin + 16768
        18816         start-of-sequence marker
        18817         outro marker
        18818         end-of-sequence marker

    Each chord is a list of notes; a note is indexed as
    [delta_time, duration, channel, pitch, velocity, patch]
    (field meaning taken from the indices labelled in the encoding loop).
    """

    melody_chords = [18816]

    for i, c in enumerate(dcscore):

        # Outro marker: written once, 64 chords before the end, for scores longer than 191 chords
        if len(dcscore)-i == 64 and len(dcscore) > 191:
            melody_chords.append(18817)

        # Delta start-time of the chord (taken from its first note).
        # Clamped like every other field: a value above 255 would otherwise
        # land in the patch/pitch token range and be decoded as a note.
        delta_time = max(0, min(255, c[0][0]))

        melody_chords.append(delta_time)

        for e in c:

            # Durations
            dur = max(1, min(255, e[1]))

            # Patches (128 is the drums patch)
            pat = max(0, min(128, e[5]))

            # Pitches
            ptc = max(1, min(127, e[3]))

            # Velocities, reduced to eight bins (0..7)
            vel = max(8, min(127, e[4]))
            velocity = round(vel / 15)-1

            pat_ptc = (128 * pat) + ptc
            dur_vel = (8 * dur) + velocity

            melody_chords.extend([pat_ptc+256, dur_vel+16768])

        # Cap the sequence length; checked once per complete chord so a chord is never split.
        # NOTE(review): the original indentation of this check was not visible in the
        # reviewed source; chord-loop level is assumed -- confirm against the notebook.
        if len(melody_chords) > 8192:
            break

    melody_chords.append(18818)

    return melody_chords


def TMIDIX_MIDI_Processor(midi_file):
    """Convert one MIDI file into an Orpheus training token sequence.

    The file is loaded and normalized with TMIDIX, passed through a chain of
    quality filters, and, if it survives all of them, encoded into integer tokens.

    Args:
        midi_file: path of the MIDI file, passed to TMIDIX.midi2single_track_ms_score.

    Returns:
        list[int] of tokens when the file passes every filter, otherwise None.
        None is also returned (after printing the file name and the error)
        when any step raises an exception.
    """

    try:

        raw_score = TMIDIX.midi2single_track_ms_score(midi_file)

        escore_notes = TMIDIX.advanced_score_processor(raw_score,
                                                       return_enhanced_score_notes=True,
                                                       apply_sustain=True)

        if not escore_notes:
            return None

        escore_notes = TMIDIX.augment_enhanced_score_notes(escore_notes[0], sort_drums_last=True)

        # Index 6 holds the patch; 128 marks drums
        instruments_list = sorted(set([y[6] for y in escore_notes]))
        instruments_list_without_drums = [i for i in instruments_list if i != 128]

        # Filter 1: at least one non-drum instrument and more than 255 notes
        if not instruments_list_without_drums or len(escore_notes) <= 255:
            return None

        # Index 3 holds the channel; channel 9 is excluded as drums
        escore_notes_without_drums = [e for e in escore_notes if e[3] != 9]

        durs_counts = TMIDIX.escore_notes_durations_counter(escore_notes_without_drums, min_duration=128)

        # Filter 2: both duration ratios reported by TMIDIX must stay below 10%
        # (presumably the shares of too-short / too-uniform durations -- verify in TMIDIX)
        if not ((durs_counts[0] / durs_counts[1]) < 0.1 and (durs_counts[2] / durs_counts[1]) < 0.1):
            return None

        escore_notes_times = [e[1] for e in escore_notes_without_drums]

        escore_notes_tones = sorted(set([e[4] % 12 for e in escore_notes_without_drums]))

        # Filter 3: enough notes sharing a start time (more than 10% over the number
        # of distinct start times) and more than four distinct pitch classes
        if not (len(escore_notes_times) > (len(set(escore_notes_times)) * 1.1) and len(escore_notes_tones) > 4):
            return None

        escore_notes_velocities = [e[5] for e in escore_notes]

        avg_escore_notes_velocity = sum(escore_notes_velocities) / len(escore_notes_velocities)

        # Quiet scores get their velocities adjusted
        # (presumably in place, since the return value is not used)
        if avg_escore_notes_velocity < 64:
            TMIDIX.adjust_score_velocities(escore_notes, 124)

        dscore = TMIDIX.delta_score_notes(escore_notes)

        # The leading field of every note is dropped before chordifying
        dcscore = TMIDIX.chordify_score([d[1:] for d in dscore])

        bad_chords_counts = TMIDIX.count_bad_chords_in_chordified_score(dcscore, pitches_index=3, patches_index=5)

        # Filter 4: fewer than 15% bad chords
        if not ((bad_chords_counts[0] / bad_chords_counts[1]) < 0.15):
            return None

        return _encode_chordified_score(dcscore)

    except Exception as ex:
        # Best-effort batch processing: report the offending file and carry on
        print(midi_file)
        print(ex)
        return None
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "execution_count": null,
294
+ "metadata": {},
295
+ "outputs": [],
296
+ "source": [
# Create the output directory that the processing cell writes its pickles to.
# exist_ok=True keeps the cell re-runnable; the absolute path matches the
# '/home/ubuntu/DATA/ORPHEUS_INTs_*' save path regardless of the working directory.
os.makedirs('/home/ubuntu/DATA', exist_ok=True)
298
+ ]
299
+ },
300
+ {
301
+ "cell_type": "code",
302
+ "execution_count": null,
303
+ "metadata": {
304
+ "scrolled": true
305
+ },
306
+ "outputs": [],
307
+ "source": [
print('=' * 70)
print('TMIDIX MIDI Processor')
print('=' * 70)
print('Starting up...')
print('=' * 70)

###########

# Number of files passed to TMIDIX.multiprocessing_wrapper and saved together per block
NUMBER_OF_FILES_PER_ITERATION = 25000

# Running total of accepted (deduplicated) token sequences across all blocks
files_count = 0

# Ceiling division: an exact multiple of the block size must not report an extra block
total_blocks = (len(filez) + NUMBER_OF_FILES_PER_ITERATION - 1) // NUMBER_OF_FILES_PER_ITERATION

# Holds the most recent non-empty block; the (TEST INTS) cell reads melody_chords_f[0]
melody_chords_f = []

print('Processing MIDI files. Please wait...')
print('=' * 70)

for i in range(0, len(filez), NUMBER_OF_FILES_PER_ITERATION):

    print('=' * 70)
    print('Processing block #', (i // NUMBER_OF_FILES_PER_ITERATION)+1, '/', total_blocks)
    print('=' * 70)

    output = TMIDIX.multiprocessing_wrapper(TMIDIX_MIDI_Processor, filez[i:i+NUMBER_OF_FILES_PER_ITERATION])

    # Drop rejected files (None / empty) and exact duplicates within this block
    block_seqs = list({tuple(o) for o in output if o})

    if not block_seqs:
        # Nothing to report or save; also avoids indexing an empty list below
        print('No good files in this block. Skipping save...')
        print('=' * 70)
        continue

    melody_chords_f = block_seqs

    files_count += len(melody_chords_f)
    print('SAVING !!!')
    print('=' * 70)
    print('Saving processed files...')
    print('=' * 70)
    print('Data check:', min(melody_chords_f[0]), '===', max(melody_chords_f[0]), '===', len(list(set(melody_chords_f[0]))), '===', len(melody_chords_f[0]))
    print('=' * 70)
    print('Processed so far:', files_count, 'out of', len(filez), '===', files_count / len(filez), 'good files ratio')
    print('=' * 70)

    # The running total is part of the file name, so every saved block gets its own pickle
    count = str(files_count)
    TMIDIX.Tegridy_Any_Pickle_File_Writer(melody_chords_f, '/home/ubuntu/DATA/ORPHEUS_INTs_'+count)
    print('=' * 70)

# Every block is saved inside the loop. The former post-loop copy of the save
# step counted the last block twice and wrote it to a second pickle, so it is gone.
364
+ ]
365
+ },
366
+ {
367
+ "cell_type": "markdown",
368
+ "metadata": {
369
+ "id": "-ye9rNzOHX90"
370
+ },
371
+ "source": [
372
+ "# (TEST INTS)"
373
+ ]
374
+ },
375
+ {
376
+ "cell_type": "code",
377
+ "execution_count": null,
378
+ "metadata": {},
379
+ "outputs": [],
380
+ "source": [
def _decode_orpheus_tokens(tokens):
    """Decode Orpheus integer tokens back into note events.

    Token ranges handled:
        0..255        advance the current time by token * 16
        256..16767    select patch ((token-256) // 128) and pitch ((token-256) % 128)
        16768..18815  set duration and velocity, then emit a note
    Tokens outside these ranges (the 18816+ markers) are ignored.

    Returns:
        (song_f, patches): song_f is a list of
        ['note', time, duration, channel, pitch, velocity] events; patches is a
        16-entry list with the patch assigned to each channel (0 where unused).
    """

    song_f = []

    time = 0
    dur = 0
    vel = 90
    pitch = 60
    channel = 0
    patch = 0

    # patches[ch] is the patch assigned to channel ch, -1 while unassigned
    patches = [-1] * 16

    # channels[ch] is 1 once the channel is taken; channel 9 is reserved for drums
    channels = [0] * 16
    channels[9] = 1

    for ss in tokens:

        if 0 <= ss < 256:

            time += ss * 16

        if 256 <= ss < 16768:

            patch = (ss-256) // 128

            if patch < 128:

                if patch not in patches:
                    # First use of this patch: take the next free channel,
                    # or reuse channel 15 once all channels are taken
                    if 0 in channels:
                        cha = channels.index(0)
                        channels[cha] = 1
                    else:
                        cha = 15

                    patches[cha] = patch

                channel = patches.index(patch)

            if patch == 128:
                # Patch 128 denotes drums
                channel = 9

            pitch = (ss-256) % 128

        if 16768 <= ss < 18816:

            dur = ((ss-16768) // 8) * 16
            vel = (((ss-16768) % 8)+1) * 15

            song_f.append(['note', time, dur, channel, pitch, vel ])

    # Unassigned channels default to patch 0
    patches = [0 if x==-1 else x for x in patches]

    return song_f, patches


train_data1 = melody_chords_f[0]

print('Sample INTs', train_data1[:15])

out = train_data1

# Always defined, so an empty sequence no longer ends in a NameError
song_f, patches = _decode_orpheus_tokens(out)

if song_f:

    detailed_stats = TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(song_f,
                                                              output_signature = 'Orpheus Music Transformer',
                                                              output_file_name = '/home/ubuntu/Orpheus-Music-TransformerComposition',
                                                              track_name='Project Los Angeles',
                                                              list_of_MIDI_patches=patches
                                                              )

    print('Done!')

else:

    print('No notes decoded. Nothing to convert...')
451
+ ]
452
+ },
453
+ {
454
+ "cell_type": "markdown",
455
+ "metadata": {
456
+ "id": "YzCMd94Tu_gz"
457
+ },
458
+ "source": [
459
+ "# Congrats! You did it! :)"
460
+ ]
461
+ }
462
+ ],
463
+ "metadata": {
464
+ "colab": {
465
+ "machine_shape": "hm",
466
+ "private_outputs": true,
467
+ "provenance": []
468
+ },
469
+ "gpuClass": "standard",
470
+ "kernelspec": {
471
+ "display_name": "Python 3 (ipykernel)",
472
+ "language": "python",
473
+ "name": "python3"
474
+ },
475
+ "language_info": {
476
+ "codemirror_mode": {
477
+ "name": "ipython",
478
+ "version": 3
479
+ },
480
+ "file_extension": ".py",
481
+ "mimetype": "text/x-python",
482
+ "name": "python",
483
+ "nbconvert_exporter": "python",
484
+ "pygments_lexer": "ipython3",
485
+ "version": "3.10.12"
486
+ }
487
+ },
488
+ "nbformat": 4,
489
+ "nbformat_minor": 4
490
+ }