[feat] add extend
- pipeline_ace_step.py +100 -16
- ui/components.py +120 -5
pipeline_ace_step.py
CHANGED
@@ -595,23 +595,83 @@ class ACEStepPipeline:
         target_latents = randn_tensor(shape=(bsz, 8, 16, frame_length), generator=random_generators, device=device, dtype=dtype)

         is_repaint = False
+        is_extend = False
         if add_retake_noise:
+            n_min = int(infer_steps * (1 - retake_variance))
             retake_variance = torch.tensor(retake_variance * math.pi/2).to(device).to(dtype)
             retake_latents = randn_tensor(shape=(bsz, 8, 16, frame_length), generator=retake_random_generators, device=device, dtype=dtype)
             repaint_start_frame = int(repaint_start * 44100 / 512 / 8)
             repaint_end_frame = int(repaint_end * 44100 / 512 / 8)
-
+            x0 = src_latents
             # retake
-            is_repaint = repaint_end_frame - repaint_start_frame != frame_length
+            is_repaint = (repaint_end_frame - repaint_start_frame != frame_length)
+
+            is_extend = (repaint_start_frame < 0) or (repaint_end_frame > frame_length)
+            if is_extend:
+                is_repaint = True
+
+            # TODO: train a mask aware repainting controlnet
             # to make sure mean = 0, std = 1
             if not is_repaint:
                 target_latents = torch.cos(retake_variance) * target_latents + torch.sin(retake_variance) * retake_latents
-
+            elif not is_extend:
+                # if repaint_end_frame
                 repaint_mask = torch.zeros((bsz, 8, 16, frame_length), device=device, dtype=dtype)
                 repaint_mask[:, :, :, repaint_start_frame:repaint_end_frame] = 1.0
                 repaint_noise = torch.cos(retake_variance) * target_latents + torch.sin(retake_variance) * retake_latents
                 repaint_noise = torch.where(repaint_mask == 1.0, repaint_noise, target_latents)
                 z0 = repaint_noise
+            elif is_extend:
+                to_right_pad_gt_latents = None
+                to_left_pad_gt_latents = None
+                gt_latents = src_latents
+                src_latents_length = gt_latents.shape[-1]
+                max_infer_fame_length = int(240 * 44100 / 512 / 8)
+                left_pad_frame_length = 0
+                right_pad_frame_length = 0
+                right_trim_length = 0
+                left_trim_length = 0
+                if repaint_start_frame < 0:
+                    left_pad_frame_length = abs(repaint_start_frame)
+                    frame_length = left_pad_frame_length + gt_latents.shape[-1]
+                    extend_gt_latents = torch.nn.functional.pad(gt_latents, (left_pad_frame_length, 0), "constant", 0)
+                    if frame_length > max_infer_fame_length:
+                        right_trim_length = frame_length - max_infer_fame_length
+                        extend_gt_latents = extend_gt_latents[:, :, :, :max_infer_fame_length]
+                        to_right_pad_gt_latents = extend_gt_latents[:, :, :, -right_trim_length:]
+                        frame_length = max_infer_fame_length
+                    repaint_start_frame = 0
+                    gt_latents = extend_gt_latents
+
+                if repaint_end_frame > src_latents_length:
+                    right_pad_frame_length = repaint_end_frame - gt_latents.shape[-1]
+                    frame_length = gt_latents.shape[-1] + right_pad_frame_length
+                    extend_gt_latents = torch.nn.functional.pad(gt_latents, (0, right_pad_frame_length), "constant", 0)
+                    if frame_length > max_infer_fame_length:
+                        left_trim_length = frame_length - max_infer_fame_length
+                        extend_gt_latents = extend_gt_latents[:, :, :, -max_infer_fame_length:]
+                        to_left_pad_gt_latents = extend_gt_latents[:, :, :, :left_trim_length]
+                        frame_length = max_infer_fame_length
+                    repaint_end_frame = frame_length
+                    gt_latents = extend_gt_latents
+
+                repaint_mask = torch.zeros((bsz, 8, 16, frame_length), device=device, dtype=dtype)
+                if left_pad_frame_length > 0:
+                    repaint_mask[:, :, :, :left_pad_frame_length] = 1.0
+                if right_pad_frame_length > 0:
+                    repaint_mask[:, :, :, -right_pad_frame_length:] = 1.0
+                x0 = gt_latents
+                padd_list = []
+                if left_pad_frame_length > 0:
+                    padd_list.append(retake_latents[:, :, :, :left_pad_frame_length])
+                padd_list.append(target_latents[:, :, :, left_trim_length:target_latents.shape[-1] - right_trim_length])
+                if right_pad_frame_length > 0:
+                    padd_list.append(retake_latents[:, :, :, -right_pad_frame_length:])
+                target_latents = torch.cat(padd_list, dim=-1)
+                assert target_latents.shape[-1] == x0.shape[-1], f"{target_latents.shape=} {x0.shape=}"
+
+            zt_edit = x0.clone()
+            z0 = target_latents

         attention_mask = torch.ones(bsz, frame_length, device=device, dtype=dtype)

@@ -716,6 +776,16 @@ class ACEStepPipeline:
            return sample

        for i, t in tqdm(enumerate(timesteps), total=num_inference_steps):
+
+            if is_repaint:
+                if i < n_min:
+                    continue
+                elif i == n_min:
+                    t_i = t / 1000
+                    zt_src = (1 - t_i) * x0 + (t_i) * z0
+                    target_latents = zt_edit + zt_src - x0
+                    logger.info(f"repaint start from {n_min} add {t_i} level of noise")
+
            # expand the latents if we are doing classifier free guidance
            latents = target_latents

@@ -818,14 +888,27 @@ class ACEStepPipeline:
                timestep=timestep,
            ).sample

-
-
-
-
-
-
+            if is_repaint and i >= n_min:
+                t_i = t / 1000
+                if i + 1 < len(timesteps):
+                    t_im1 = (timesteps[i + 1]) / 1000
+                else:
+                    t_im1 = torch.zeros_like(t_i).to(t_i.device)
+                dtype = noise_pred.dtype
+                target_latents = target_latents.to(torch.float32)
+                prev_sample = target_latents + (t_im1 - t_i) * noise_pred
+                prev_sample = prev_sample.to(dtype)
+                target_latents = prev_sample
+                zt_src = (1 - t_im1) * x0 + (t_im1) * z0
+                target_latents = torch.where(repaint_mask == 1.0, target_latents, zt_src)
+            else:
+                target_latents = scheduler.step(model_output=noise_pred, timestep=t, sample=target_latents, return_dict=False, omega=omega_scale)[0]

-
+            if is_extend:
+                if to_right_pad_gt_latents is not None:
+                    target_latents = torch.cat([target_latents, to_right_pad_gt_latents], dim=-1)
+                if to_left_pad_gt_latents is not None:
+                    target_latents = torch.cat([to_left_pad_gt_latents, target_latents], dim=-1)
        return target_latents

    def latents2audio(self, latents, target_wav_duration_second=30, sample_rate=48000, save_path=None, format="flac"):

@@ -899,6 +982,7 @@ class ACEStepPipeline:
        save_path: str = None,
        format: str = "flac",
        batch_size: int = 1,
+        debug: bool = False,
    ):

        start_time = time.time()

@@ -936,7 +1020,7 @@ class ACEStepPipeline:
        lyric_token_idx = torch.tensor([0]).repeat(batch_size, 1).to(self.device).long()
        lyric_mask = torch.tensor([0]).repeat(batch_size, 1).to(self.device).long()
        if len(lyrics) > 0:
-            lyric_token_idx = self.tokenize_lyrics(lyrics, debug=
+            lyric_token_idx = self.tokenize_lyrics(lyrics, debug=debug)
            lyric_mask = [1] * len(lyric_token_idx)
            lyric_token_idx = torch.tensor(lyric_token_idx).unsqueeze(0).to(self.device).repeat(batch_size, 1)
            lyric_mask = torch.tensor(lyric_mask).unsqueeze(0).to(self.device).repeat(batch_size, 1)

@@ -949,7 +1033,7 @@ class ACEStepPipeline:
        preprocess_time_cost = end_time - start_time
        start_time = end_time

-        add_retake_noise = task in ("retake", "repaint")
+        add_retake_noise = task in ("retake", "repaint", "extend")
        # retake equal to repaint
        if task == "retake":
            repaint_start = 0

@@ -957,7 +1041,7 @@ class ACEStepPipeline:

        src_latents = None
        if src_audio_path is not None:
-            assert src_audio_path is not None and task in ("repaint", "edit"), "src_audio_path is required for repaint task"
+            assert src_audio_path is not None and task in ("repaint", "edit", "extend"), "src_audio_path is required for retake/repaint/extend task"
            assert os.path.exists(src_audio_path), f"src_audio_path {src_audio_path} does not exist"
            src_latents = self.infer_latents(src_audio_path)

@@ -989,7 +1073,7 @@ class ACEStepPipeline:
                target_lyric_token_ids=target_lyric_token_idx,
                target_lyric_mask=target_lyric_mask,
                src_latents=src_latents,
-                random_generators=
+                random_generators=retake_random_generators,  # more diversity
                infer_steps=infer_step,
                guidance_scale=guidance_scale,
                n_min=edit_n_min,

@@ -1048,8 +1132,8 @@ class ACEStepPipeline:

        input_params_json = {
            "task": task,
-            "prompt": prompt,
-            "lyrics": lyrics,
+            "prompt": prompt if task != "edit" else edit_target_prompt,
+            "lyrics": lyrics if task != "edit" else edit_target_lyrics,
            "audio_duration": audio_duration,
            "infer_step": infer_step,
            "guidance_scale": guidance_scale,
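Sketch (editor's note, not part of the commit): the "# to make sure mean = 0, std = 1" comment above refers to the cos/sin blend of the two noise tensors. Because cos(theta)**2 + sin(theta)**2 = 1, mixing two independent standard-normal latents this way yields another standard-normal latent, so retake_variance only controls how much of the retake seed is blended in without changing the statistics the scheduler expects:

import math
import torch

theta = torch.tensor(0.7 * math.pi / 2)   # retake_variance = 0.7, mapped as in the diff
a = torch.randn(4, 8, 16, 1024)           # original noise (random_generators)
b = torch.randn(4, 8, 16, 1024)           # retake noise (retake_random_generators)
mixed = torch.cos(theta) * a + torch.sin(theta) * b
print(round(mixed.mean().item(), 3), round(mixed.std().item(), 3))   # ~0.0, ~1.0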
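Sketch (editor's note): the repaint/extend path above bypasses scheduler.step and performs an explicit Euler update on the linear noise path. At step n_min the latents are re-noised to the matching level with zt_src = (1 - t) * x0 + t * z0; each later step applies z = z + (t_next - t) * noise_pred, and the repaint mask pins every frame outside the edited region back onto the source trajectory. A standalone toy version with made-up tensors (the real loop uses the transformer's noise_pred as the velocity):

import torch

torch.manual_seed(0)
x0 = torch.randn(1, 8, 16, 64)            # source latents
z0 = torch.randn(1, 8, 16, 64)            # repaint noise
repaint_mask = torch.zeros_like(x0)
repaint_mask[..., 16:32] = 1.0            # region to regenerate

timesteps = torch.tensor([1000.0, 750.0, 500.0, 250.0, 0.0])
zt = (1 - timesteps[0] / 1000) * x0 + (timesteps[0] / 1000) * z0   # re-noise to the start level

for i in range(len(timesteps) - 1):
    t_i, t_im1 = timesteps[i] / 1000, timesteps[i + 1] / 1000
    velocity = z0 - x0                                   # stand-in for the model's noise_pred
    zt = zt + (t_im1 - t_i) * velocity                   # explicit Euler step toward t_im1
    zt_src = (1 - t_im1) * x0 + t_im1 * z0
    zt = torch.where(repaint_mask == 1.0, zt, zt_src)    # keep unedited frames on the source path

print(torch.allclose(zt[..., :16], x0[..., :16]))        # True: outside the mask we recover x0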
ui/components.py
CHANGED
@@ -65,7 +65,7 @@ def create_text2music_ui(
    with gr.Column():
        with gr.Row(equal_height=True):
            # add markdown, tags and lyrics examples are from ai music generation community
-            audio_duration = gr.Slider(-1, 240.0, step=0.00001, value
+            audio_duration = gr.Slider(-1, 240.0, step=0.00001, value=-1, label="Audio Duration", interactive=True, info="-1 means random duration (30 ~ 240).", scale=9)
            sample_bnt = gr.Button("Sample", variant="primary", scale=1)

        prompt = gr.Textbox(lines=2, label="Tags", max_lines=4, placeholder=TAG_PLACEHOLDER, info="Support tags, descriptions, and scene. Use commas to separate different tags.\ntags and lyrics examples are from ai music generation community")

@@ -252,14 +252,15 @@ def create_text2music_ui(
            with gr.Tab("edit"):
                edit_prompt = gr.Textbox(lines=2, label="Edit Tags", max_lines=4)
                edit_lyrics = gr.Textbox(lines=9, label="Edit Lyrics", max_lines=13)
-
+                retake_seeds = gr.Textbox(label="edit seeds (default None)", placeholder="", value=None)
+
                edit_type = gr.Radio(["only_lyrics", "remix"], value="only_lyrics", label="Edit Type", elem_id="edit_type", info="`only_lyrics` will keep the whole song the same except lyrics difference. Make your diffrence smaller, e.g. one lyrc line change.\nremix can change the song melody and genre")
-                edit_n_min = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.
+                edit_n_min = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.6, label="edit_n_min", interactive=True)
                edit_n_max = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=1.0, label="edit_n_max", interactive=True)

                def edit_type_change_func(edit_type):
                    if edit_type == "only_lyrics":
-                        n_min = 0.
+                        n_min = 0.6
                        n_max = 1.0
                    elif edit_type == "remix":
                        n_min = 0.2

@@ -309,6 +310,7 @@ def create_text2music_ui(
                    oss_steps,
                    guidance_scale_text,
                    guidance_scale_lyric,
+                    retake_seeds,
                ):
                    if edit_source == "upload":
                        src_audio_path = edit_source_audio_upload

@@ -349,7 +351,8 @@ def create_text2music_ui(
                        edit_target_prompt=edit_prompt,
                        edit_target_lyrics=edit_lyrics,
                        edit_n_min=edit_n_min,
-                        edit_n_max=edit_n_max
+                        edit_n_max=edit_n_max,
+                        retake_seeds=retake_seeds,
                    )

                edit_bnt.click(

@@ -380,9 +383,121 @@ def create_text2music_ui(
                        oss_steps,
                        guidance_scale_text,
                        guidance_scale_lyric,
+                        retake_seeds,
                    ],
                    outputs=edit_outputs + [edit_input_params_json],
                )
+            with gr.Tab("extend"):
+                extend_seeds = gr.Textbox(label="extend seeds (default None)", placeholder="", value=None)
+                left_extend_length = gr.Slider(minimum=0.0, maximum=240.0, step=0.01, value=0.0, label="Left Extend Length", interactive=True)
+                right_extend_length = gr.Slider(minimum=0.0, maximum=240.0, step=0.01, value=30.0, label="Right Extend Length", interactive=True)
+                extend_source = gr.Radio(["text2music", "last_extend", "upload"], value="text2music", label="Extend Source", elem_id="extend_source")
+
+                extend_source_audio_upload = gr.Audio(label="Upload Audio", type="filepath", visible=False, elem_id="extend_source_audio_upload")
+                extend_source.change(
+                    fn=lambda x: gr.update(visible=x == "upload", elem_id="extend_source_audio_upload"),
+                    inputs=[extend_source],
+                    outputs=[extend_source_audio_upload],
+                )
+
+                extend_bnt = gr.Button("Extend", variant="primary")
+                extend_outputs, extend_input_params_json = create_output_ui("Extend")
+
+                def extend_process_func(
+                    text2music_json_data,
+                    extend_input_params_json,
+                    extend_seeds,
+                    left_extend_length,
+                    right_extend_length,
+                    extend_source,
+                    extend_source_audio_upload,
+                    prompt,
+                    lyrics,
+                    infer_step,
+                    guidance_scale,
+                    scheduler_type,
+                    cfg_type,
+                    omega_scale,
+                    manual_seeds,
+                    guidance_interval,
+                    guidance_interval_decay,
+                    min_guidance_scale,
+                    use_erg_tag,
+                    use_erg_lyric,
+                    use_erg_diffusion,
+                    oss_steps,
+                    guidance_scale_text,
+                    guidance_scale_lyric,
+                ):
+                    if extend_source == "upload":
+                        src_audio_path = extend_source_audio_upload
+                        json_data = text2music_json_data
+                    elif extend_source == "text2music":
+                        json_data = text2music_json_data
+                        src_audio_path = json_data["audio_path"]
+                    elif extend_source == "last_extend":
+                        json_data = extend_input_params_json
+                        src_audio_path = json_data["audio_path"]
+
+                    repaint_start = -left_extend_length
+                    repaint_end = json_data["audio_duration"] + right_extend_length
+                    return text2music_process_func(
+                        json_data["audio_duration"],
+                        prompt,
+                        lyrics,
+                        infer_step,
+                        guidance_scale,
+                        scheduler_type,
+                        cfg_type,
+                        omega_scale,
+                        manual_seeds,
+                        guidance_interval,
+                        guidance_interval_decay,
+                        min_guidance_scale,
+                        use_erg_tag,
+                        use_erg_lyric,
+                        use_erg_diffusion,
+                        oss_steps,
+                        guidance_scale_text,
+                        guidance_scale_lyric,
+                        retake_seeds=extend_seeds,
+                        retake_variance=1.0,
+                        task="extend",
+                        repaint_start=repaint_start,
+                        repaint_end=repaint_end,
+                        src_audio_path=src_audio_path,
+                    )
+
+                extend_bnt.click(
+                    fn=extend_process_func,
+                    inputs=[
+                        input_params_json,
+                        extend_input_params_json,
+                        extend_seeds,
+                        left_extend_length,
+                        right_extend_length,
+                        extend_source,
+                        extend_source_audio_upload,
+                        prompt,
+                        lyrics,
+                        infer_step,
+                        guidance_scale,
+                        scheduler_type,
+                        cfg_type,
+                        omega_scale,
+                        manual_seeds,
+                        guidance_interval,
+                        guidance_interval_decay,
+                        min_guidance_scale,
+                        use_erg_tag,
+                        use_erg_lyric,
+                        use_erg_diffusion,
+                        oss_steps,
+                        guidance_scale_text,
+                        guidance_scale_lyric,
+                    ],
+                    outputs=extend_outputs + [extend_input_params_json],
+                )

    def sample_data():
        json_data = sample_data_func()
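Sketch (editor's note, hypothetical numbers): the extend tab reuses the repaint window rather than adding new pipeline arguments, setting repaint_start = -left_extend_length and repaint_end = audio_duration + right_extend_length. The pipeline converts seconds to latent frames with int(t * 44100 / 512 / 8) (44.1 kHz audio, 512-sample hop, 8x temporal downsampling, inferred from the factor in the diff), so a negative start frame or an end frame past the source length is exactly what flips is_extend:

def seconds_to_latent_frames(seconds: float) -> int:
    return int(seconds * 44100 / 512 / 8)

audio_duration = 120.0        # seconds of the source song
left_extend_length = 10.0     # seconds to generate before the song
right_extend_length = 30.0    # seconds to generate after the song

repaint_start = -left_extend_length
repaint_end = audio_duration + right_extend_length

frame_length = seconds_to_latent_frames(audio_duration)
start_frame = seconds_to_latent_frames(repaint_start)
end_frame = seconds_to_latent_frames(repaint_end)
is_extend = (start_frame < 0) or (end_frame > frame_length)
print(start_frame, end_frame, frame_length, is_extend)   # -107 1614 1291 True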
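Sketch (editor's note, assumed shapes): in the elif is_extend: branch of the pipeline diff above, the source latents are zero-padded on the side being extended; if the padded length exceeds the 240-second inference cap, the overflow is kept aside (to_right_pad_gt_latents / to_left_pad_gt_latents) and concatenated back after sampling. A simplified left-extension case, with the overflow captured before trimming:

import torch

max_infer_frame_length = int(240 * 44100 / 512 / 8)    # 2583 latent frames (the diff's max_infer_fame_length)
gt_latents = torch.randn(1, 8, 16, 2500)                # source latents, hypothetical length
left_pad_frame_length = 200                             # frames of new audio to generate on the left

extended = torch.nn.functional.pad(gt_latents, (left_pad_frame_length, 0), "constant", 0)
to_right_pad_gt_latents = None
if extended.shape[-1] > max_infer_frame_length:
    right_trim_length = extended.shape[-1] - max_infer_frame_length
    to_right_pad_gt_latents = extended[:, :, :, -right_trim_length:]   # tail re-attached after denoising
    extended = extended[:, :, :, :max_infer_frame_length]

print(extended.shape[-1], to_right_pad_gt_latents.shape[-1])   # 2583 117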