# Orpheus Music Transformer Maker (ver. 1.0)

***

Powered by tegridy-tools: https://github.com/asigalov61/tegridy-tools

***

WARNING: This complete implementation is a functioning model of the Artificial Intelligence. Please exercise great humility, care, and respect. https://www.nscai.gov/

***

#### Project Los Angeles

#### Tegridy Code 2025

***

# GPU check

In [None]:
!nvidia-smi

# Setup environment

In [None]:
!git clone --depth 1 https://github.com/asigalov61/tegridy-tools

In [None]:
!pip install huggingface_hub
!pip install hf-transfer
!pip install ipywidgets
!pip install -U tqdm

!pip install einx
!pip install einops
!pip install torch-summary

In [None]:
# Load modules and make data dir

print('Loading modules...')

import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import pickle
import random
import secrets
import tqdm
import math

import gc

!set USE_FLASH_ATTENTION=1
os.environ['USE_FLASH_ATTENTION'] = '1'

import torch
import torch.optim as optim

from torch.utils.data import DataLoader, Dataset

import matplotlib.pyplot as plt

from torchsummary import summary
from sklearn import metrics

%cd /home/ubuntu/tegridy-tools/tegridy-tools/

import TMIDIX

%cd /home/ubuntu/tegridy-tools/tegridy-tools/X-Transformer

from x_transformer_2_3_1 import *

# Global matmul / attention backend switches used by every later cell.
# NOTE(review): per the PyTorch documentation, assigning
# torch.backends.cuda.matmul.allow_tf32 also changes the float32 matmul
# precision, so the 'medium' setting below may be overridden by the
# assignment that follows it -- TODO confirm which one is intended.
torch.set_float32_matmul_precision('medium')
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
# Enable the flash scaled-dot-product-attention backend, disable the cuDNN one
torch.backends.cuda.enable_flash_sdp(True)
torch.backends.cuda.enable_cudnn_sdp(False)

!set USE_FLASH_ATTENTION=1

%cd /home/ubuntu/

# Create the dataset folder. exist_ok=True makes the call a no-op when the
# folder is already there and removes the check-then-create race of a
# separate os.path.exists() test.
os.makedirs('/home/ubuntu/DATA', exist_ok=True)

import random

print('Done')

print('Torch version:', torch.__version__)

# Prep training data

* Put all training dataset pickle files into ./DATA folder

## Data files List

In [None]:
# Collect the paths of all *.pickle files found anywhere below the dataset
# folder and put them in random order.

dataset_addr = "/home/ubuntu/DATA"

#==========================================================================

filez = []

for folder, _subfolders, names in os.walk(dataset_addr):
    for name in names:
        if name.endswith('.pickle'):
            filez.append(os.path.join(folder, name))

print('=' * 70)

random.shuffle(filez)

print('Loaded', len(filez), 'data files')
print('=' * 70)

## Load training data files

In [None]:
# Load every pickle file listed in `filez`, keep only the sequences whose
# tokens are all valid, drop exact duplicates and shuffle the result.

SEQ_LEN = 8192
PAD_IDX = 18819 # Model pad index

#==========================================================================

print('=' * 70)
print('Loading data files...')
print('Please wait...')
print('=' * 70)

# A set of tuples removes sequences that appear more than once
train_data = set()

chunks_counter = 0

# The cyclic garbage collector is switched off while the sequences are loaded
gc.disable()

try:

    for lfa in tqdm.tqdm(filez):

        # NOTE(review): unpickling can execute arbitrary code -- only place
        # trusted files in the dataset folder.
        # The context manager closes the file handle after each load.
        with open(lfa, 'rb') as f:
            train_d = pickle.load(f)

        for t in train_d:

            # final data integrity check: the sequence must be non-empty
            # (max()/min() raise on an empty one) and every token must lie
            # in [0, PAD_IDX), so both ends of the range are tested
            if len(t) > 0 and min(t) >= 0 and max(t) < PAD_IDX:
                train_data.add(tuple(t))
                chunks_counter += 1

            else:
                print('Bad data!!!')

finally:

    # Always switch the collector back on, even if a file fails to load
    gc.enable()
    gc.collect()

train_data = list(train_data)

#==========================================================================

print('Done!')
print('=' * 70)
print('Total number of main chunks:', chunks_counter)
# True when all sequences have the same length; default=0 keeps the check
# from raising when no sequence was loaded
print('All data is good:', max(map(len, train_data), default=0) == min(map(len, train_data), default=0))
print('=' * 70)
# NOTE: no sort is performed here; the message is kept unchanged
print('Sorting by length...')
print('Randomizing train data...')
random.shuffle(train_data)
print('Done!')
print('=' * 70)
print('Total length of train data:', len(train_data))
print('=' * 70)

In [None]:
train_data = list(train_data)

In [None]:
len(train_data[0])

In [None]:
train_data[0][:15]

# Setup model

In [None]:
# Training hyper-parameters and model construction

# --- how often (in iterations of the batch loop) each action is triggered ---

PRINT_STATS_EVERY = 10
VALIDATE_EVERY = 100
GENERATE_EVERY = 250
SAVE_EVERY = 500

# --- number of tokens sampled for every preview generation ---

GENERATE_LENGTH = 512

# --- optimisation schedule ---

NUM_EPOCHS = 5

BATCH_SIZE = 9
GRADIENT_ACCUMULATE_EVERY = 8

LEARNING_RATE = 1e-4
GRAD_CLIP = 1.0

# --- model: decoder stack inside a token wrapper inside the autoregressive wrapper ---
# PAD_IDX is the highest token id, hence PAD_IDX+1 tokens in total; the same
# id is used both as the ignored target index and as the padding value.

model = AutoregressiveWrapper(
    TransformerWrapper(
        num_tokens = PAD_IDX+1,
        max_seq_len = SEQ_LEN,
        attn_layers = Decoder(
            dim = 2048,
            depth = 8,
            heads = 32,
            rotary_pos_emb = True,
            attn_flash = True,
        ),
    ),
    ignore_index = PAD_IDX,
    pad_value = PAD_IDX,
)

model.cuda()

print('Done!')

summary(model)

# Dataloader

def get_train_data_batch(tdata, index, seq_len, batch_size, pad_idx, device='cuda'):
    """Return batch number `index` of `tdata` as a LongTensor.

    Every sequence is cut or right-padded with `pad_idx` to exactly
    `seq_len + 1` tokens (presumably one extra token so the autoregressive
    wrapper can shift inputs against targets -- confirm).

    Args:
        tdata: sliceable collection of token sequences.
        index: zero-based batch number; the batch covers
            tdata[index*batch_size : (index+1)*batch_size].
        seq_len: model sequence length.
        batch_size: number of sequences per batch.
        pad_idx: token id used for padding.
        device: device the tensor is moved to. Defaults to 'cuda', which is
            what the function always used before this parameter existed.

    Returns:
        A torch.LongTensor with one row of `seq_len + 1` tokens per sequence
        in the slice (fewer than `batch_size` rows when `tdata` runs out).
    """

    width = seq_len + 1
    start = index * batch_size

    padded_batch = []

    for seq in tdata[start:start + batch_size]:

        # Copy first so the caller's sequence is never modified
        row = list(seq)[:width]
        row.extend([pad_idx] * (width - len(row)))

        padded_batch.append(row)

    return torch.LongTensor(padded_batch).to(device)

# precision/optimizer/scaler
#
# Objects shared by the training cells: the autocast context, the optimizer
# and the gradient scaler.

# dtype used inside the autocast context
dtype = torch.bfloat16

# Context manager the training loops enter around each forward pass
ctx = torch.amp.autocast(device_type='cuda', dtype=dtype)

# NOTE: this rebinds the name `optim` -- bound to the torch.optim module by
# the import cell -- to the Adam optimizer instance
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Used by the training loops to scale the loss before backward() and to
# drive the optimizer step
scaler = torch.amp.GradScaler('cuda')

# Train

In [None]:
# Train the model
#
# For every epoch: shuffle the data, run gradient-accumulated optimizer steps,
# and periodically validate/plot, generate a preview MIDI and save a checkpoint.
# NOTE(review): the block nesting below was reconstructed from a flattened
# notebook export -- confirm against the original notebook.


def _plot_series(title, values):
    """Show one metric history as a line graph; `title` names it in the log."""
    print(f'Plotting {title} graph...')
    plt.plot(list(range(len(values))), values, 'b')
    plt.show()
    plt.close()
    print('Done!')


def _tokens_to_score(tokens):
    """Decode model tokens into ['note', ...] events and a 16-entry patch list.

    Token ranges handled:
      0..255        advance the running time by 16 per unit
      256..16767    select patch ((tok-256) // 128) and pitch ((tok-256) % 128)
      16768..18815  set duration/velocity and emit one note event
    Any other token is ignored.
    """
    song_f = []

    abs_time = 0
    dur = 1
    vel = 90
    pitch = 60
    channel = 0
    patch = 0

    # patches[ch] is the patch assigned to channel ch, -1 while unassigned
    patches = [-1] * 16

    # channels[ch] is 1 once channel ch is taken; channel 9 is reserved
    channels = [0] * 16
    channels[9] = 1

    for ss in tokens:

        if 0 <= ss < 256:
            abs_time += ss * 16

        if 256 <= ss < 16768:

            patch = (ss-256) // 128

            if patch < 128:

                if patch not in patches:
                    if 0 in channels:
                        cha = channels.index(0)
                        channels[cha] = 1
                    else:
                        # No free channel left: reuse the last one
                        cha = 15

                    patches[cha] = patch

                channel = patches.index(patch)

            if patch == 128:
                channel = 9

            pitch = (ss-256) % 128

        if 16768 <= ss < 18816:

            dur = ((ss-16768) // 8) * 16
            vel = (((ss-16768) % 8)+1) * 15

            song_f.append(['note', abs_time, dur, channel, pitch, vel])

    # Unassigned channels fall back to patch 0
    patches = [0 if x == -1 else x for x in patches]

    return song_f, patches


train_losses = []
val_losses = []

train_accs = []
val_accs = []

nsteps = 0

for ep in range(NUM_EPOCHS):

    print('=' * 70)
    print('Randomizing train data...')
    random.shuffle(train_data)
    print('=' * 70)

    print('=' * 70)
    print('Epoch #', ep+1)
    print('=' * 70)

    NUM_BATCHES = len(train_data) // BATCH_SIZE // GRADIENT_ACCUMULATE_EVERY

    model.train()

    for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='Training'):

        optim.zero_grad()

        # Accumulate gradients over several micro-batches before one step
        for j in range(GRADIENT_ACCUMULATE_EVERY):
            with ctx:
                loss, acc = model(get_train_data_batch(train_data, (i*GRADIENT_ACCUMULATE_EVERY)+j, SEQ_LEN, BATCH_SIZE, PAD_IDX))
            loss = loss / GRADIENT_ACCUMULATE_EVERY
            scaler.scale(loss).backward()

        # `loss`/`acc` below are those of the last micro-batch only
        if i % PRINT_STATS_EVERY == 0:
            print(f'Training loss: {loss.item() * GRADIENT_ACCUMULATE_EVERY}')
            print(f'Training acc: {acc.item()}')

        train_losses.append(loss.item() * GRADIENT_ACCUMULATE_EVERY)
        train_accs.append(acc.item())

        # Unscale before clipping so GRAD_CLIP applies to the real gradients
        scaler.unscale_(optim)
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        scaler.step(optim)
        scaler.update()

        nsteps += 1

        if i % VALIDATE_EVERY == 0:
            model.eval()

            # NOTE: the validation batch is drawn from train_data itself
            with torch.no_grad():
                with ctx:
                    val_loss, val_acc = model(get_train_data_batch(train_data, i, SEQ_LEN, BATCH_SIZE, PAD_IDX))

            print(f'Validation loss: {val_loss.item()}')
            print(f'Validation acc: {val_acc.item()}')

            val_losses.append(val_loss.item())
            val_accs.append(val_acc.item())

            _plot_series('training loss', train_losses)
            _plot_series('training acc', train_accs)
            _plot_series('validation loss', val_losses)
            _plot_series('validation acc', val_accs)

            model.train()

        if i % GENERATE_EVERY == 0:
            model.eval()

            # Prime with the first GENERATE_LENGTH tokens of a random row
            inp = random.choice(get_train_data_batch(train_data, i, SEQ_LEN, BATCH_SIZE, PAD_IDX))[:GENERATE_LENGTH]

            print(inp)

            with ctx:
                sample = model.generate(inp[None, ...], GENERATE_LENGTH)

            print(sample)

            data = sample.tolist()[0]

            print('Sample INTs', data[:15])

            if len(data) != 0:

                song_f, patches = _tokens_to_score(data)

                detailed_stats = TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(song_f,
                                                                          output_signature = 'Orpheus Music Transformer',
                                                                          output_file_name = '/home/ubuntu/Orpheus-Music-Transformer-Composition',
                                                                          track_name='Project Los Angeles',
                                                                          list_of_MIDI_patches=patches
                                                                          )

                print('Done!')

            model.train()

        if i % SAVE_EVERY == 0:

            print('Saving model progress. Please wait...')

            ckpt_name = 'model_checkpoint_' + str(nsteps) + '_steps_' + str(round(float(train_losses[-1]), 4)) + '_loss_' + str(round(float(train_accs[-1]), 4)) + '_acc.pth'
            print(ckpt_name)

            fname = '/home/ubuntu/' + ckpt_name

            torch.save(model.state_dict(), fname)

            torch.save(optim.state_dict(), '/home/ubuntu/optimizer.pth')

            torch.save(scaler.state_dict(), '/home/ubuntu/scaler.pth')

            data = [train_losses, train_accs, val_losses, val_accs]

            TMIDIX.Tegridy_Any_Pickle_File_Writer(data, '/home/ubuntu/losses_accs')

            print('Done!')

# Resume training from checkpoint

In [None]:
# Reload optimizer, scaler and model state from files in the working directory

model_path = 'checkpoint.pth'
optim_path = 'optimizer.pth'
scaler_path = 'scaler.pth'

# (progress message, object to restore, state file) -- restored in this order
for message, target, state_file in (
    ('Restoring optimizer...', optim, optim_path),
    ('Restoring scaler...', scaler, scaler_path),
    ('Restoring model...', model, model_path),
):
    print(message)
    target.load_state_dict(torch.load(state_file))

print('Done!')

In [None]:
# Restore the bookkeeping needed to continue an interrupted run.

# First epoch index (0-based) of the resumed loop, which iterates
# range(START_EPOCHS, NUM_EPOCHS)
START_EPOCHS = 0

# Metric histories in the order [train_losses, train_accs, val_losses, val_accs];
# presumably the file the training loop writes with Tegridy_Any_Pickle_File_Writer.
# The path is relative, so it depends on the current working directory.
train_losses, train_accs, val_losses, val_accs = TMIDIX.Tegridy_Any_Pickle_File_Reader('losses_accs.pickle')

# The training loop appends one loss per optimizer step, so the history
# length restores the step counter
nsteps = len(train_losses)

print(nsteps)
print(train_losses[-1])

In [None]:
# Train the model
#
# Resumed run: continues from epoch START_EPOCHS with the restored optimizer,
# scaler, model, metric histories and step counter. For every epoch: shuffle
# the data, run gradient-accumulated optimizer steps, and periodically
# validate/plot, generate a preview MIDI and save a checkpoint.
# NOTE(review): the block nesting below was reconstructed from a flattened
# notebook export -- confirm against the original notebook.


def _plot_series(title, values):
    """Show one metric history as a line graph; `title` names it in the log."""
    print(f'Plotting {title} graph...')
    plt.plot(list(range(len(values))), values, 'b')
    plt.show()
    plt.close()
    print('Done!')


def _tokens_to_score(tokens):
    """Decode model tokens into ['note', ...] events and a 16-entry patch list.

    Token ranges handled:
      0..255        advance the running time by 16 per unit
      256..16767    select patch ((tok-256) // 128) and pitch ((tok-256) % 128)
      16768..18815  set duration/velocity and emit one note event
    Any other token is ignored.
    """
    song_f = []

    abs_time = 0
    dur = 1
    vel = 90
    pitch = 60
    channel = 0
    patch = 0

    # patches[ch] is the patch assigned to channel ch, -1 while unassigned
    patches = [-1] * 16

    # channels[ch] is 1 once channel ch is taken; channel 9 is reserved
    channels = [0] * 16
    channels[9] = 1

    for ss in tokens:

        if 0 <= ss < 256:
            abs_time += ss * 16

        if 256 <= ss < 16768:

            patch = (ss-256) // 128

            if patch < 128:

                if patch not in patches:
                    if 0 in channels:
                        cha = channels.index(0)
                        channels[cha] = 1
                    else:
                        # No free channel left: reuse the last one
                        cha = 15

                    patches[cha] = patch

                channel = patches.index(patch)

            if patch == 128:
                channel = 9

            pitch = (ss-256) % 128

        if 16768 <= ss < 18816:

            dur = ((ss-16768) // 8) * 16
            vel = (((ss-16768) % 8)+1) * 15

            song_f.append(['note', abs_time, dur, channel, pitch, vel])

    # Unassigned channels fall back to patch 0
    patches = [0 if x == -1 else x for x in patches]

    return song_f, patches


for ep in range(START_EPOCHS, NUM_EPOCHS):

    print('=' * 70)
    print('Randomizing train data...')
    random.shuffle(train_data)
    print('=' * 70)

    print('=' * 70)
    print('Epoch #', ep+1)
    print('=' * 70)

    NUM_BATCHES = len(train_data) // BATCH_SIZE // GRADIENT_ACCUMULATE_EVERY

    model.train()

    for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='Training'):

        optim.zero_grad()

        # Accumulate gradients over several micro-batches before one step
        for j in range(GRADIENT_ACCUMULATE_EVERY):
            with ctx:
                loss, acc = model(get_train_data_batch(train_data, (i*GRADIENT_ACCUMULATE_EVERY)+j, SEQ_LEN, BATCH_SIZE, PAD_IDX))
            loss = loss / GRADIENT_ACCUMULATE_EVERY
            scaler.scale(loss).backward()

        # `loss`/`acc` below are those of the last micro-batch only
        if i % PRINT_STATS_EVERY == 0:
            print(f'Training loss: {loss.item() * GRADIENT_ACCUMULATE_EVERY}')
            print(f'Training acc: {acc.item()}')

        train_losses.append(loss.item() * GRADIENT_ACCUMULATE_EVERY)
        train_accs.append(acc.item())

        # Unscale before clipping so GRAD_CLIP applies to the real gradients
        scaler.unscale_(optim)
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        scaler.step(optim)
        scaler.update()

        nsteps += 1

        if i % VALIDATE_EVERY == 0:
            model.eval()

            # NOTE: the validation batch is drawn from train_data itself
            with torch.no_grad():
                with ctx:
                    val_loss, val_acc = model(get_train_data_batch(train_data, i, SEQ_LEN, BATCH_SIZE, PAD_IDX))

            print(f'Validation loss: {val_loss.item()}')
            print(f'Validation acc: {val_acc.item()}')

            val_losses.append(val_loss.item())
            val_accs.append(val_acc.item())

            _plot_series('training loss', train_losses)
            _plot_series('training acc', train_accs)
            _plot_series('validation loss', val_losses)
            _plot_series('validation acc', val_accs)

            model.train()

        if i % GENERATE_EVERY == 0:
            model.eval()

            # Prime with the first GENERATE_LENGTH tokens of a random row
            inp = random.choice(get_train_data_batch(train_data, i, SEQ_LEN, BATCH_SIZE, PAD_IDX))[:GENERATE_LENGTH]

            print(inp)

            with ctx:
                sample = model.generate(inp[None, ...], GENERATE_LENGTH)

            print(sample)

            data = sample.tolist()[0]

            print('Sample INTs', data[:15])

            if len(data) != 0:

                song_f, patches = _tokens_to_score(data)

                detailed_stats = TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(song_f,
                                                                          output_signature = 'Orpheus Music Transformer',
                                                                          output_file_name = '/home/ubuntu/Orpheus-Music-Transformer-Composition',
                                                                          track_name='Project Los Angeles',
                                                                          list_of_MIDI_patches=patches
                                                                          )

                print('Done!')

            model.train()

        if i % SAVE_EVERY == 0:

            print('Saving model progress. Please wait...')

            ckpt_name = 'model_checkpoint_' + str(nsteps) + '_steps_' + str(round(float(train_losses[-1]), 4)) + '_loss_' + str(round(float(train_accs[-1]), 4)) + '_acc.pth'
            print(ckpt_name)

            fname = '/home/ubuntu/' + ckpt_name

            torch.save(model.state_dict(), fname)

            torch.save(optim.state_dict(), '/home/ubuntu/optimizer.pth')

            torch.save(scaler.state_dict(), '/home/ubuntu/scaler.pth')

            data = [train_losses, train_accs, val_losses, val_accs]

            TMIDIX.Tegridy_Any_Pickle_File_Writer(data, '/home/ubuntu/losses_accs')

            print('Done!')

# Final Save

In [None]:
# Write the final checkpoint, the metric histories and one PNG per metric.

print('Saving model progress. Please wait...')

# File name encodes the step count plus the last recorded loss and accuracy
ckpt_name = 'model_checkpoint_' + str(nsteps) + '_steps_' + str(round(float(train_losses[-1]), 4)) + '_loss_' + str(round(float(train_accs[-1]), 4)) + '_acc.pth'
print(ckpt_name)

fname = '/home/ubuntu/' + ckpt_name

torch.save(model.state_dict(), fname)

print('Done!')

data = [train_losses, train_accs, val_losses, val_accs]

TMIDIX.Tegridy_Any_Pickle_File_Writer(data, '/home/ubuntu/losses_accuracies')

# Save the training loss / training acc / validation loss / validation acc
# graphs, in that order

for history, png_path in (
    (train_losses, '/home/ubuntu/training_loss_graph.png'),
    (train_accs, '/home/ubuntu/training_acc_graph.png'),
    (val_losses, '/home/ubuntu/validation_loss_graph.png'),
    (val_accs, '/home/ubuntu/validation_acc_graph.png'),
):
    plt.plot(list(range(len(history))), history, 'b')
    plt.savefig(png_path)
    plt.close()
    print('Done!')

# Eval

In [None]:
!sudo pip install huggingface_hub

In [None]:
from huggingface_hub import hf_hub_download

# Fetch the named checkpoint file from the given Hugging Face repo into
# /home/ubuntu/Models/
hf_hub_download(repo_id='asigalov61/Orpheus-Music-Transformer',
 filename='Orpheus_Music_Transformer_Trained_Model_96332_steps_0.82_loss_0.748_acc.pth',
 local_dir='/home/ubuntu/Models/',
 )

In [None]:
# Rebuild the model with the training-time configuration, load the downloaded
# checkpoint into it and prepare it for inference.

SEQ_LEN = 8192
PAD_IDX = 18819

model = TransformerWrapper(
    num_tokens = PAD_IDX+1,
    max_seq_len = SEQ_LEN,
    attn_layers = Decoder(dim = 2048,
                          depth = 8,
                          heads = 32,
                          rotary_pos_emb = True,
                          attn_flash = True
                          )
    )

model = AutoregressiveWrapper(model, ignore_index = PAD_IDX, pad_value=PAD_IDX)

print('=' * 70)
print('Loading model checkpoint...')

# Absolute path, matching the local_dir the checkpoint is downloaded to, so
# loading does not depend on the current working directory
model_path = '/home/ubuntu/Models/Orpheus_Music_Transformer_Trained_Model_96332_steps_0.82_loss_0.748_acc.pth'

# The model is still on the CPU here, so map the checkpoint tensors to the
# CPU as well; the whole model is moved to the GPU just below
model.load_state_dict(torch.load(model_path, map_location='cpu'))

print('=' * 70)

model.cuda()
model.eval()

print('Done!')

summary(model)

# Autocast context used by the generation cell
dtype = torch.bfloat16

ctx = torch.amp.autocast(device_type='cuda', dtype=dtype)

In [None]:
# Load a seed MIDI file and tokenize it into `melody_chords`.
#
# Token layout produced below:
#   18816              first token of the sequence
#   0..255             delta start-time of a chord
#   256..16767         (128 * patch) + pitch + 256
#   16768..18815       (8 * duration) + octo-velocity + 16768

midi_file = 'Orpheus-Music-Transformer-Piano-Seed-1.mid'

print('=' * 70)
print('MIDI File:', midi_file)
print('=' * 70)

raw_score = TMIDIX.midi2single_track_ms_score(midi_file)

escore_notes = TMIDIX.advanced_score_processor(raw_score, return_enhanced_score_notes=True, apply_sustain=True)

escore_notes = TMIDIX.augment_enhanced_score_notes(escore_notes[0], sort_drums_last=True)

dscore = TMIDIX.delta_score_notes(escore_notes)

dcscore = TMIDIX.chordify_score([d[1:] for d in dscore])

bad_chords_counts = TMIDIX.count_bad_chords_in_chordified_score(dcscore, pitches_index=3, patches_index=5)

melody_chords = [18816]

#=======================================================
# MAIN PROCESSING CYCLE
#=======================================================

for i, c in enumerate(dcscore):

    # Outro seq
    # if len(dcscore)-i == 64 and len(dcscore) > 191:
    #     melody_chords.extend([18817])

    # Delta start-times
    # Clamped to the 0..255 time-token range: a larger value would be
    # indistinguishable from a patch/pitch token (256 and up).
    # NOTE(review): assumes the TMIDIX helpers above do not already
    # guarantee this range -- confirm.

    delta_time = max(0, min(255, c[0][0]))

    melody_chords.append(delta_time)

    for e in c:

        #=======================================================

        # Durations
        dur = max(1, min(255, e[1]))

        # Patches
        pat = max(0, min(128, e[5]))

        # Pitches
        ptc = max(1, min(127, e[3]))

        # Velocities
        # Calculating octo-velocity

        vel = max(8, min(127, e[4]))
        velocity = round(vel / 15)-1

        #=======================================================
        # FINAL NOTE SEQ
        #=======================================================

        # Writing final note
        pat_ptc = (128 * pat) + ptc
        dur_vel = (8 * dur) + velocity

        melody_chords.extend([pat_ptc+256, dur_vel+16768]) # 18816

print('Done!')
print('=' * 70)
print(len(melody_chords))
print('=' * 70)

In [None]:
# Generate 700 tokens from a one-token prompt.

model.eval()

# The prompt carries an explicit batch dimension (1 sequence), the same way
# the training cells call generate() with inp[None, ...]; the save cell then
# reads the first sequence of the batch as y[0]. A 1-D prompt is not
# guaranteed to come back with a batch dimension.
x = torch.LongTensor([[0]]).cuda()
# x = torch.LongTensor([melody_chords]).cuda()

with ctx:
    out = model.generate(x,
                         700,
                         temperature=0.9,
                         #filter_logits_fn=top_k,
                         #filter_kwargs={'k': 15},
                         return_prime=True,
                         verbose=True)

# One list of token ids per sequence in the batch
y = out.tolist()

print('---------------')

In [None]:
# Save to MIDI
#
# Decode the first generated sequence into note events and write a MIDI file.
# Token ranges handled:
#   0..255        advance the running time by 16 per unit
#   256..16767    select patch ((tok-256) // 128) and pitch ((tok-256) % 128)
#   16768..18815  set duration/velocity and emit one note event

data = y[0]

print('Sample INTs', data[:15])

if len(data) != 0:

    song = data
    song_f = []

    time = 0
    dur = 1
    vel = 90
    pitch = 60
    channel = 0
    patch = 0

    # patches[ch] is the patch assigned to channel ch, -1 while unassigned
    patches = [-1] * 16

    # channels[ch] is 1 once channel ch is taken; channel 9 is reserved
    channels = [0] * 16
    channels[9] = 1

    for ss in song:

        if 0 <= ss < 256:

            time += ss * 16

        if 256 <= ss < 16768:

            patch = (ss-256) // 128

            if patch < 128:

                if patch not in patches:
                    if 0 in channels:
                        cha = channels.index(0)
                        channels[cha] = 1
                    else:
                        # No free channel left: reuse the last one
                        cha = 15

                    patches[cha] = patch

                channel = patches.index(patch)

            if patch == 128:
                channel = 9

            pitch = (ss-256) % 128

        if 16768 <= ss < 18816:

            dur = ((ss-16768) // 8) * 16
            vel = (((ss-16768) % 8)+1) * 15

            song_f.append(['note', time, dur, channel, pitch, vel ])

    # Unassigned channels fall back to patch 0
    patches = [0 if x==-1 else x for x in patches]

    # The conversion stays inside the guard: with no tokens there is no
    # song_f/patches from this cell, and values left over from an earlier
    # cell must not be written out by mistake
    detailed_stats = TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(song_f,
                                                              output_signature = 'Orpheus Music Transformer',
                                                              output_file_name = '/home/ubuntu/Orpheus-Music-Transformer-Composition',
                                                              track_name='Project Los Angeles',
                                                              list_of_MIDI_patches=patches
                                                              )

else:
    print('No tokens to convert, MIDI file was not written')

print('Done!')

In [None]:
# Plot the pairwise cosine distances between all token embeddings and save
# the figure as a PNG.

# Hand the weights over as a float64 NumPy array instead of nested Python
# lists: the values are the same, but the matrix is not expanded into one
# Python float object per element
tok_emb = model.net.token_emb.emb.weight.detach().cpu().to(torch.float64).numpy()

# NOTE: despite the name, this holds cosine *distances* (metric='cosine')
cos_sim = metrics.pairwise_distances(
    tok_emb, metric='cosine'
)
plt.figure(figsize=(7, 7))
plt.imshow(cos_sim, cmap="inferno", interpolation="nearest")
# Scale the colorbar with the image aspect ratio
im_ratio = cos_sim.shape[0] / cos_sim.shape[1]
plt.colorbar(fraction=0.046 * im_ratio, pad=0.04)
plt.xlabel("Position")
plt.ylabel("Position")
plt.tight_layout()
plt.plot()
plt.savefig("/home/ubuntu/Orpheus-Music-Transformer-Tokens-Embeddings-Plot.png", bbox_inches="tight")

# Congrats! You did it! :)