Spaces:

ASLP-lab
/

DiffRhythm

Running on Zero

App Files Files Community

ing0 committed about 12 hours ago

Commit

07e6e48

1 Parent(s): 18ea198

Extra mode

Browse files

Files changed (5) hide show

app.py +42 -8
diffrhythm/infer/infer.py +5 -4
diffrhythm/infer/infer_utils.py +19 -7
diffrhythm/model/cfm.py +7 -17
diffrhythm/model/dit.py +1 -17

app.py CHANGED Viewed

@@ -18,7 +18,8 @@ import base64
 from diffrhythm.infer.infer_utils import (
     get_reference_latent,
     get_lrc_token,
-    get_style_prompt,
     prepare_model,
     get_negative_style_prompt
 )
@@ -29,16 +30,19 @@ device='cuda'
 cfm, tokenizer, muq, vae = prepare_model(device)
 cfm = torch.compile(cfm)
-@spaces.GPU(duration=20)
-def infer_music(lrc, ref_audio_path, seed=42, randomize_seed=False, steps=32, file_type='wav', max_frames=2048, device='cuda'):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     torch.manual_seed(seed)
     sway_sampling_coef = -1 if steps < 32 else None
     try:
         lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
-        style_prompt = get_style_prompt(muq, ref_audio_path)
     except Exception as e:
         raise gr.Error(f"Error: {str(e)}")
     negative_style_prompt = get_negative_style_prompt(device)
@@ -53,7 +57,8 @@ def infer_music(lrc, ref_audio_path, seed=42, randomize_seed=False, steps=32, fi
                                steps=steps,
                                sway_sampling_coef=sway_sampling_coef,
                                start_time=start_time,
-                               file_type=file_type
                                )
     return generated_song
@@ -179,7 +184,23 @@ with gr.Blocks(css=css) as demo:
                         elem_classes="lyrics-scroll-box",
                         value="""[00:10.00]Moonlight spills through broken blinds\n[00:13.20]Your shadow dances on the dashboard shrine\n[00:16.85]Neon ghosts in gasoline rain\n[00:20.40]I hear your laughter down the midnight train\n[00:24.15]Static whispers through frayed wires\n[00:27.65]Guitar strings hum our cathedral choirs\n[00:31.30]Flicker screens show reruns of June\n[00:34.90]I'm drowning in this mercury lagoon\n[00:38.55]Electric veins pulse through concrete skies\n[00:42.10]Your name echoes in the hollow where my heartbeat lies\n[00:45.75]We're satellites trapped in parallel light\n[00:49.25]Burning through the atmosphere of endless night\n[01:00.00]Dusty vinyl spins reverse\n[01:03.45]Our polaroid timeline bleeds through the verse\n[01:07.10]Telescope aimed at dead stars\n[01:10.65]Still tracing constellations through prison bars\n[01:14.30]Electric veins pulse through concrete skies\n[01:17.85]Your name echoes in the hollow where my heartbeat lies\n[01:21.50]We're satellites trapped in parallel light\n[01:25.05]Burning through the atmosphere of endless night\n[02:10.00]Clockwork gears grind moonbeams to rust\n[02:13.50]Our fingerprint smudged by interstellar dust\n[02:17.15]Velvet thunder rolls through my veins\n[02:20.70]Chasing phantom trains through solar plane\n[02:24.35]Electric veins pulse through concrete skies\n[02:27.90]Your name echoes in the hollow where my heartbeat lies"""
                     )
-                    audio_prompt = gr.Audio(label="Audio Prompt", type="filepath", value="./src/prompt/default.wav")
                 with gr.Column():
                     with gr.Accordion("Best Practices Guide", open=True):
@@ -218,7 +239,7 @@ with gr.Blocks(css=css) as demo:
                         steps = gr.Slider(
                                     minimum=10,
                                     maximum=100,
-                                    value=32,
                                     step=1,
                                     label="Diffusion Steps",
                                     interactive=True,
@@ -248,6 +269,19 @@ with gr.Blocks(css=css) as demo:
                 examples_per_page=13,
                 elem_id="audio-examples-container"
             )
             gr.Examples(
                 examples=[
@@ -352,7 +386,7 @@ with gr.Blocks(css=css) as demo:
     lyrics_btn.click(
         fn=infer_music,
-        inputs=[lrc, audio_prompt, seed, randomize_seed, steps, file_type],
         outputs=audio_output
     )

 from diffrhythm.infer.infer_utils import (
     get_reference_latent,
     get_lrc_token,
+    get_audio_style_prompt,
+    get_text_style_prompt,
     prepare_model,
     get_negative_style_prompt
 )
 cfm, tokenizer, muq, vae = prepare_model(device)
 cfm = torch.compile(cfm)
+def infer_music(lrc, ref_audio_path, text_prompt, current_prompt_type, seed=42, randomize_seed=False, steps=32, file_type='wav', max_frames=2048, device='cuda'):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     torch.manual_seed(seed)
     sway_sampling_coef = -1 if steps < 32 else None
+    vocal_flag = False
     try:
         lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
+        if current_prompt_type == 'audio':
+            style_prompt, vocal_flag = get_audio_style_prompt(muq, ref_audio_path)
+        else:
+            style_prompt = get_text_style_prompt(muq, text_prompt)
     except Exception as e:
         raise gr.Error(f"Error: {str(e)}")
     negative_style_prompt = get_negative_style_prompt(device)
                                steps=steps,
                                sway_sampling_coef=sway_sampling_coef,
                                start_time=start_time,
+                               file_type=file_type,
+                               vocal_flag=vocal_flag
                                )
     return generated_song
                         elem_classes="lyrics-scroll-box",
                         value="""[00:10.00]Moonlight spills through broken blinds\n[00:13.20]Your shadow dances on the dashboard shrine\n[00:16.85]Neon ghosts in gasoline rain\n[00:20.40]I hear your laughter down the midnight train\n[00:24.15]Static whispers through frayed wires\n[00:27.65]Guitar strings hum our cathedral choirs\n[00:31.30]Flicker screens show reruns of June\n[00:34.90]I'm drowning in this mercury lagoon\n[00:38.55]Electric veins pulse through concrete skies\n[00:42.10]Your name echoes in the hollow where my heartbeat lies\n[00:45.75]We're satellites trapped in parallel light\n[00:49.25]Burning through the atmosphere of endless night\n[01:00.00]Dusty vinyl spins reverse\n[01:03.45]Our polaroid timeline bleeds through the verse\n[01:07.10]Telescope aimed at dead stars\n[01:10.65]Still tracing constellations through prison bars\n[01:14.30]Electric veins pulse through concrete skies\n[01:17.85]Your name echoes in the hollow where my heartbeat lies\n[01:21.50]We're satellites trapped in parallel light\n[01:25.05]Burning through the atmosphere of endless night\n[02:10.00]Clockwork gears grind moonbeams to rust\n[02:13.50]Our fingerprint smudged by interstellar dust\n[02:17.15]Velvet thunder rolls through my veins\n[02:20.70]Chasing phantom trains through solar plane\n[02:24.35]Electric veins pulse through concrete skies\n[02:27.90]Your name echoes in the hollow where my heartbeat lies"""
                     )
+                    current_prompt_type = gr.State(value="audio")
+                    with gr.Tabs() as inside_tabs:
+                        with gr.Tab("Audio Prompt"):
+                            audio_prompt = gr.Audio(label="Audio Prompt", type="filepath", value="./src/prompt/default.wav")
+                        with gr.Tab("Text Prompt"):
+                            text_prompt = gr.Textbox(
+                            label="Text Prompt",
+                            placeholder="Enter the Text Prompt, eg: emotional piano pop",
+                            )
+                        def update_prompt_type(evt: gr.SelectData):  # Tab-select callback: maps the selected tab to the prompt-type string kept in `current_prompt_type`.
+                            return "audio" if evt.index == 0 else "text"  # index 0 -> "audio" ("Audio Prompt" is the first tab declared); any other index -> "text". Assumes evt.index is the zero-based tab position - TODO confirm.
+                        inside_tabs.select(
+                            fn=update_prompt_type,
+                            outputs=current_prompt_type
+                        )
                 with gr.Column():
                     with gr.Accordion("Best Practices Guide", open=True):
                         steps = gr.Slider(
                                     minimum=10,
                                     maximum=100,
+                                    value=32,
                                     step=1,
                                     label="Diffusion Steps",
                                     interactive=True,
                 examples_per_page=13,
                 elem_id="audio-examples-container"
             )
+            gr.Examples(
+                examples=[
+                    ["Pop Emotional Piano"],
+                    ["流行 情感 钢琴"],
+                    ["Indie folk ballad, coming-of-age themes, acoustic guitar picking with harmonica interludes"],
+                    ["独立民谣, 成长主题, 原声吉他弹奏与口琴间奏"]
+                ],
+                inputs=[text_prompt],
+                label="Text Examples",
+                examples_per_page=4,
+                elem_id="text-examples-container"
+            )
             gr.Examples(
                 examples=[
     lyrics_btn.click(
         fn=infer_music,
+        inputs=[lrc, audio_prompt, text_prompt, current_prompt_type, seed, randomize_seed, steps, file_type],
         outputs=audio_output
     )

diffrhythm/infer/infer.py CHANGED Viewed

@@ -14,7 +14,7 @@ import pydub
 from diffrhythm.infer.infer_utils import (
     get_reference_latent,
     get_lrc_token,
-    get_style_prompt,
     prepare_model,
     get_negative_style_prompt
 )
@@ -74,7 +74,7 @@ def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
             y_final[:,:,t_start:t_end] = y_chunk[:,:,chunk_start:chunk_end]
         return y_final
-def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, sway_sampling_coef, start_time, file_type):
     with torch.inference_mode():
         generated, _ = cfm_model.sample(
@@ -86,7 +86,8 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
             steps=steps,
             cfg_strength=4.0,
             sway_sampling_coef=sway_sampling_coef,
-            start_time=start_time
         )
         generated = generated.to(torch.float32)
@@ -133,7 +134,7 @@ if __name__ == "__main__":
         lrc = f.read()
     lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
-    style_prompt = get_style_prompt(muq, args.ref_audio_path)
     negative_style_prompt = get_negative_style_prompt(device)

 from diffrhythm.infer.infer_utils import (
     get_reference_latent,
     get_lrc_token,
+    get_audio_style_prompt,
     prepare_model,
     get_negative_style_prompt
 )
             y_final[:,:,t_start:t_end] = y_chunk[:,:,chunk_start:chunk_end]
         return y_final
+def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, sway_sampling_coef, start_time, file_type, vocal_flag):
     with torch.inference_mode():
         generated, _ = cfm_model.sample(
             steps=steps,
             cfg_strength=4.0,
             sway_sampling_coef=sway_sampling_coef,
+            start_time=start_time,
+            vocal_flag=vocal_flag,
         )
         generated = generated.to(torch.float32)
         lrc = f.read()
     lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
+    style_prompt = get_audio_style_prompt(muq, args.ref_audio_path)
     negative_style_prompt = get_negative_style_prompt(device)

diffrhythm/infer/infer_utils.py CHANGED Viewed

@@ -51,13 +51,14 @@ def get_negative_style_prompt(device):
     return vocal_stlye
-def get_style_prompt(model, wav_path):
     mulan = model
     audio, _ = librosa.load(wav_path, sr=24000)
     audio_len = librosa.get_duration(y=audio, sr=24000)
-    assert audio_len >= 1, "Input audio length shorter than 1 second"
     if audio_len > 10:
         start_time = int(audio_len // 2 - 5)
@@ -70,10 +71,20 @@ def get_style_prompt(model, wav_path):
     with torch.no_grad():
         audio_emb = mulan(wavs = wav) # [1, 512]
-    audio_emb = audio_emb
     audio_emb = audio_emb.half()
-    return audio_emb
 def parse_lyrics(lyrics: str):
     lyrics_with_time = []
@@ -94,7 +105,6 @@ class CNENTokenizer():
         with open('./diffrhythm/g2p/g2p/vocab.json', 'r') as file:
             self.phone2id:dict = json.load(file)['vocab']
         self.id2phone = {v:k for (k, v) in self.phone2id.items()}
-        # from f5_tts.g2p.g2p_generation import chn_eng_g2p
         from diffrhythm.g2p.g2p_generation import chn_eng_g2p
         self.tokenizer = chn_eng_g2p
     def encode(self, text):
@@ -115,6 +125,8 @@ def get_lrc_token(text, tokenizer, device):
     pad_token_id = 0
     comma_token_id = 1
     period_token_id = 2
     lrc_with_time = parse_lyrics(text)
@@ -146,7 +158,7 @@ def get_lrc_token(text, tokenizer, device):
         frame_start = max(gt_frame_start - frame_shift, last_end_pos)
         frame_len = min(num_tokens, max_frames - frame_start)
-        #print(gt_frame_start, frame_shift, frame_start, frame_len, tokens_count, last_end_pos, full_pos_emb.shape)
         lrc[frame_start:frame_start + frame_len] = tokens[:frame_len]

     return vocal_stlye
+def get_audio_style_prompt(model, wav_path):
+    vocal_flag = False
     mulan = model
     audio, _ = librosa.load(wav_path, sr=24000)
     audio_len = librosa.get_duration(y=audio, sr=24000)
+    if audio_len <= 1:
+        vocal_flag = True
     if audio_len > 10:
         start_time = int(audio_len // 2 - 5)
     with torch.no_grad():
         audio_emb = mulan(wavs = wav) # [1, 512]
     audio_emb = audio_emb.half()
+    return audio_emb, vocal_flag
+def get_text_style_prompt(model, text_prompt):  # Embed a text style description by calling `model` with `texts=text_prompt`; returns the embedding cast to half precision.
+    mulan = model  # local alias only; the same object is called below
+    with torch.no_grad():  # embedding is computed without gradient tracking
+        text_emb = mulan(texts = text_prompt) # [1, 512]
+    text_emb = text_emb.half()  # cast to float16, matching the `.half()` applied to the audio embedding in the audio-prompt helper
+    return text_emb
 def parse_lyrics(lyrics: str):
     lyrics_with_time = []
         with open('./diffrhythm/g2p/g2p/vocab.json', 'r') as file:
             self.phone2id:dict = json.load(file)['vocab']
         self.id2phone = {v:k for (k, v) in self.phone2id.items()}
         from diffrhythm.g2p.g2p_generation import chn_eng_g2p
         self.tokenizer = chn_eng_g2p
     def encode(self, text):
     pad_token_id = 0
     comma_token_id = 1
     period_token_id = 2
+    if text == "":
+        return torch.zeros((max_frames,), dtype=torch.long).unsqueeze(0).to(device), torch.tensor(0.).unsqueeze(0).to(device).half()
     lrc_with_time = parse_lyrics(text)
         frame_start = max(gt_frame_start - frame_shift, last_end_pos)
         frame_len = min(num_tokens, max_frames - frame_start)
         lrc[frame_start:frame_start + frame_len] = tokens[:frame_len]

diffrhythm/model/cfm.py CHANGED Viewed

@@ -42,10 +42,7 @@ class CFM(nn.Module):
         transformer: nn.Module,
         sigma=0.0,
         odeint_kwargs: dict = dict(
-            # atol = 1e-5,
-            # rtol = 1e-5,
             method="euler" # 'midpoint'
-            # method="adaptive_heun"  # dopri5
         ),
         odeint_options: dict = dict(
             min_step=0.05
@@ -71,8 +68,6 @@ class CFM(nn.Module):
         self.style_drop_prob = style_drop_prob
         self.lrc_drop_prob = lrc_drop_prob
-        print(f"audio drop prob -> {self.audio_drop_prob}; style_drop_prob -> {self.style_drop_prob}; lrc_drop_prob: {self.lrc_drop_prob}")
         # transformer
         self.transformer = transformer
         dim = transformer.dim
@@ -83,7 +78,6 @@ class CFM(nn.Module):
         # sampling related
         self.odeint_kwargs = odeint_kwargs
-        # print(f"ODE SOLVER: {self.odeint_kwargs['method']}")
         self.odeint_options = odeint_options
@@ -120,6 +114,7 @@ class CFM(nn.Module):
         start_time=None,
         latent_pred_start_frame=0,
         latent_pred_end_frame=2048,
     ):
         self.eval()
@@ -151,10 +146,9 @@ class CFM(nn.Module):
         if exists(text):
             text_lens = (text != -1).sum(dim=-1)
-            #lens = torch.maximum(text_lens, lens)  # make sure lengths are at least those of the text characters
         # duration
-        # import pdb; pdb.set_trace()
         cond_mask = lens_to_mask(lens)
         if edit_mask is not None:
             cond_mask = cond_mask & edit_mask
@@ -170,7 +164,7 @@ class CFM(nn.Module):
         if isinstance(duration, int):
             duration = torch.full((batch,), duration, device=device, dtype=torch.long)
-        # duration = torch.maximum(lens + 1, duration)  # just add one token so something is generated
         duration = duration.clamp(max=max_duration)
         max_duration = duration.amax()
@@ -178,12 +172,6 @@ class CFM(nn.Module):
         if duplicate_test:
             test_cond = F.pad(cond, (0, 0, cond_seq_len, max_duration - 2 * cond_seq_len), value=0.0)
-        # cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0) # [b, t, d]
-        # cond_mask = F.pad(cond_mask, (0, max_duration - cond_mask.shape[-1]), value=False) # [b, max_duration]
-        # cond_mask = cond_mask.unsqueeze(-1) #[b, t, d]
-        # step_cond = torch.where(
-        #     cond_mask, cond, torch.zeros_like(cond)
-        # )  # allow direct control (cut cond audio) with lens passed in
         if batch > 1:
             mask = lens_to_mask(duration)
@@ -197,6 +185,10 @@ class CFM(nn.Module):
         start_time_embed, positive_text_embed, positive_text_residuals = self.transformer.forward_timestep_invariant(text, step_cond.shape[1], drop_text=False, start_time=start_time)
         _, negative_text_embed, negative_text_residuals = self.transformer.forward_timestep_invariant(text, step_cond.shape[1], drop_text=True, start_time=start_time)
         text_embed = torch.cat([positive_text_embed, negative_text_embed], 0)
         text_residuals = [torch.cat([a, b], 0) for a, b in zip(positive_text_residuals, negative_text_residuals)]
         step_cond = torch.cat([step_cond, step_cond], 0)
@@ -242,7 +234,6 @@ class CFM(nn.Module):
         sampled = trajectory[-1]
         out = sampled
-        # out = torch.where(cond_mask, cond, out)
         out = torch.where(fixed_span_mask, out, cond)
         if exists(vocoder):
@@ -286,7 +277,6 @@ class CFM(nn.Module):
         x0 = torch.randn_like(x1)
         # time step
-        # time = torch.rand((batch,), dtype=dtype, device=self.device)
         time = torch.normal(mean=0, std=1, size=(batch,), device=self.device)
         time = torch.nn.functional.sigmoid(time)
         # TODO. noise_scheduler

         transformer: nn.Module,
         sigma=0.0,
         odeint_kwargs: dict = dict(
             method="euler" # 'midpoint'
         ),
         odeint_options: dict = dict(
             min_step=0.05
         self.style_drop_prob = style_drop_prob
         self.lrc_drop_prob = lrc_drop_prob
         # transformer
         self.transformer = transformer
         dim = transformer.dim
         # sampling related
         self.odeint_kwargs = odeint_kwargs
         self.odeint_options = odeint_options
         start_time=None,
         latent_pred_start_frame=0,
         latent_pred_end_frame=2048,
+        vocal_flag=False
     ):
         self.eval()
         if exists(text):
             text_lens = (text != -1).sum(dim=-1)
         # duration
         cond_mask = lens_to_mask(lens)
         if edit_mask is not None:
             cond_mask = cond_mask & edit_mask
         if isinstance(duration, int):
             duration = torch.full((batch,), duration, device=device, dtype=torch.long)
         duration = duration.clamp(max=max_duration)
         max_duration = duration.amax()
         if duplicate_test:
             test_cond = F.pad(cond, (0, 0, cond_seq_len, max_duration - 2 * cond_seq_len), value=0.0)
         if batch > 1:
             mask = lens_to_mask(duration)
         start_time_embed, positive_text_embed, positive_text_residuals = self.transformer.forward_timestep_invariant(text, step_cond.shape[1], drop_text=False, start_time=start_time)
         _, negative_text_embed, negative_text_residuals = self.transformer.forward_timestep_invariant(text, step_cond.shape[1], drop_text=True, start_time=start_time)
+        if vocal_flag:
+            style_prompt = negative_style_prompt
+            negative_style_prompt = torch.zeros_like(style_prompt)
         text_embed = torch.cat([positive_text_embed, negative_text_embed], 0)
         text_residuals = [torch.cat([a, b], 0) for a, b in zip(positive_text_residuals, negative_text_residuals)]
         step_cond = torch.cat([step_cond, step_cond], 0)
         sampled = trajectory[-1]
         out = sampled
         out = torch.where(fixed_span_mask, out, cond)
         if exists(vocoder):
         x0 = torch.randn_like(x1)
         # time step
         time = torch.normal(mean=0, std=1, size=(batch,), device=self.device)
         time = torch.nn.functional.sigmoid(time)
         # TODO. noise_scheduler

diffrhythm/model/dit.py CHANGED Viewed

@@ -13,8 +13,6 @@ import torch
 from torch import nn
 import torch
 import torch.nn.functional as F
-from x_transformers.x_transformers import RotaryEmbedding
 from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRotaryEmbedding
 from transformers.models.llama import LlamaConfig
 from torch.utils.checkpoint import checkpoint
@@ -32,8 +30,6 @@ from diffrhythm.model.modules import (
 # apply_liger_kernel_to_llama()
 # Text embedding
 class TextEmbedding(nn.Module):
     def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
         super().__init__()
@@ -50,10 +46,7 @@ class TextEmbedding(nn.Module):
             self.extra_modeling = False
     def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
-        #text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
-        #text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
         batch, text_len = text.shape[0], text.shape[1]
-        #text = F.pad(text, (0, seq_len - text_len), value=0)
         if drop_text:  # cfg for text
             text = torch.zeros_like(text)
@@ -75,8 +68,6 @@ class TextEmbedding(nn.Module):
 # noised input audio and context mixing embedding
 class InputEmbedding(nn.Module):
     def __init__(self, mel_dim, text_dim, out_dim, cond_dim):
         super().__init__()
@@ -89,7 +80,6 @@ class InputEmbedding(nn.Module):
         style_emb = style_emb.unsqueeze(1).repeat(1, x.shape[1], 1)
         time_emb = time_emb.unsqueeze(1).repeat(1, x.shape[1], 1)
-        # print(x.shape, cond.shape, text_embed.shape, style_emb.shape, time_emb.shape)
         x = self.proj(torch.cat((x, cond, text_embed, style_emb, time_emb), dim=-1))
         x = self.conv_pos_embed(x) + x
         return x
@@ -125,17 +115,13 @@ class DiT(nn.Module):
         self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
         self.input_embed = InputEmbedding(mel_dim, text_dim, dim, cond_dim=cond_dim)
-        #self.rotary_embed = RotaryEmbedding(dim_head)
         self.dim = dim
         self.depth = depth
-        #self.transformer_blocks = nn.ModuleList(
-        #    [DiTBlock(dim=dim, heads=heads, dim_head=dim_head, ff_mult=ff_mult, dropout=dropout, use_style_prompt=use_style_prompt) for _ in range(depth)]
-        #)
         llama_config = LlamaConfig(hidden_size=dim, intermediate_size=dim * ff_mult, hidden_act='silu')
         llama_config._attn_implementation = 'sdpa'
-        #llama_config._attn_implementation = ''
         self.transformer_blocks = nn.ModuleList(
             [LlamaDecoderLayer(llama_config, layer_idx=i) for i in range(depth)]
         )
@@ -157,8 +143,6 @@ class DiT(nn.Module):
         self.norm_out = AdaLayerNormZero_Final(dim, cond_dim)  # final modulation
         self.proj_out = nn.Linear(dim, mel_dim)
-        # if use_style_prompt:
-        #     self.prompt_rnn = nn.LSTM(64, cond_dim, 1, batch_first=True)
     def forward_timestep_invariant(self, text, seq_len, drop_text, start_time):
         s_t = self.start_time_embed(start_time)

 from torch import nn
 import torch
 import torch.nn.functional as F
 from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRotaryEmbedding
 from transformers.models.llama import LlamaConfig
 from torch.utils.checkpoint import checkpoint
 # apply_liger_kernel_to_llama()
 # Text embedding
 class TextEmbedding(nn.Module):
     def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
         super().__init__()
             self.extra_modeling = False
     def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
         batch, text_len = text.shape[0], text.shape[1]
         if drop_text:  # cfg for text
             text = torch.zeros_like(text)
 # noised input audio and context mixing embedding
 class InputEmbedding(nn.Module):
     def __init__(self, mel_dim, text_dim, out_dim, cond_dim):
         super().__init__()
         style_emb = style_emb.unsqueeze(1).repeat(1, x.shape[1], 1)
         time_emb = time_emb.unsqueeze(1).repeat(1, x.shape[1], 1)
         x = self.proj(torch.cat((x, cond, text_embed, style_emb, time_emb), dim=-1))
         x = self.conv_pos_embed(x) + x
         return x
         self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
         self.input_embed = InputEmbedding(mel_dim, text_dim, dim, cond_dim=cond_dim)
         self.dim = dim
         self.depth = depth
         llama_config = LlamaConfig(hidden_size=dim, intermediate_size=dim * ff_mult, hidden_act='silu')
         llama_config._attn_implementation = 'sdpa'
         self.transformer_blocks = nn.ModuleList(
             [LlamaDecoderLayer(llama_config, layer_idx=i) for i in range(depth)]
         )
         self.norm_out = AdaLayerNormZero_Final(dim, cond_dim)  # final modulation
         self.proj_out = nn.Linear(dim, mel_dim)
     def forward_timestep_invariant(self, text, seq_len, drop_text, start_time):
         s_t = self.start_time_embed(start_time)