PyTorch · musicgen
Phoenixak99 committed
Commit 90cb56c · verified · 1 Parent(s): c7fca87

Update handler.py

Files changed (1)
  1. handler.py +149 -166
handler.py CHANGED
@@ -75,11 +75,11 @@ class EndpointHandler:
         logger.info(f"Requested duration: {duration} seconds")
 
         # Generate audio
-        if duration <= self.max_segment_duration:  # For short durations, generate in one go
+        if duration <= self.max_segment_duration - 5:  # For short durations, generate in one go
             audio_output = self._generate_short_audio(prompt, duration, parameters)
         else:
-            # Use sliding window approach for longer durations
-            audio_output = self._generate_long_audio_sliding_window(prompt, duration, parameters)
+            # Use basic segmentation for longer durations
+            audio_output = self._generate_long_audio(prompt, duration, parameters)
 
         # Monitor GPU memory after generation
         allocated = torch.cuda.memory_allocated() / 1e9
@@ -137,8 +137,7 @@ class EndpointHandler:
 
         # Generate audio
         logger.info(f"Generation parameters: {generation_kwargs}")
-        with torch.inference_mode():
-            outputs = self.model.generate(**inputs, **generation_kwargs)
+        outputs = self.model.generate(**inputs, **generation_kwargs)
 
         # Return audio
         return outputs[0].cpu().numpy()
@@ -157,21 +156,20 @@ class EndpointHandler:
             ).to("cuda")
 
             # Generate with minimal parameters
-            with torch.inference_mode():
-                outputs = self.model.generate(
-                    **inputs,
-                    max_new_tokens=max_new_tokens,
-                    do_sample=True,
-                    guidance_scale=1.0  # Minimal guidance
-                )
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=max_new_tokens,
+                do_sample=True,
+                guidance_scale=1.0  # Minimal guidance
+            )
 
             return outputs[0].cpu().numpy()
         except Exception as e2:
             logger.error(f"Second attempt failed: {e2}")
             raise e2
 
-    def _equal_power_crossfade(self, segment1, segment2, overlap_samples):
-        """Apply an equal-power crossfade between segments for smooth transitions."""
+    def _simple_crossfade(self, segment1, segment2, overlap_samples):
+        """Apply a simple linear crossfade between segments."""
         # Get the length of the segments
         length1 = segment1.shape[1]
         length2 = segment2.shape[1]
@@ -189,12 +187,11 @@ class EndpointHandler:
         # Copy the non-overlapping part of segment2
         result[:, length1:] = segment2[:, overlap_samples:]
 
-        # Apply equal-power crossfade to the overlapping parts
+        # Apply simple linear crossfade to the overlapping parts
         if overlap_samples > 0:
-            # Equal power crossfade curves (cosine/sine based for smoother transitions)
-            t = np.linspace(0, np.pi/2, overlap_samples)
-            fade_out = np.cos(t)**2
-            fade_in = np.sin(t)**2
+            # Linear fade factors
+            fade_out = np.linspace(1, 0, overlap_samples)
+            fade_in = np.linspace(0, 1, overlap_samples)
 
             # Get the overlapping parts
             segment1_end = segment1[:, -overlap_samples:].copy()
@@ -213,104 +210,80 @@ class EndpointHandler:
 
         return result
 
-    def _extract_style_keywords(self, prompt):
-        """Extract potential style-related keywords from the prompt to emphasize in continuations."""
-        # Common musical style keywords
-        style_keywords = [
-            "rock", "jazz", "classical", "pop", "electronic", "hip-hop", "rap", "country",
-            "folk", "blues", "metal", "ambient", "orchestral", "indie", "r&b", "soul",
-            "techno", "house", "drum and bass", "dubstep", "trance", "lo-fi", "lofi", "cinematic",
-            "soundtrack", "instrumental", "acoustic", "electric", "synth", "piano",
-            "guitar", "bass", "drums", "violin", "cello", "trumpet", "saxophone"
-        ]
-
-        # Extract any style keywords from the prompt
-        prompt_lower = prompt.lower()
-        found_keywords = []
-
-        for keyword in style_keywords:
-            if keyword in prompt_lower:
-                found_keywords.append(keyword)
-
-        # Return a string of found keywords or a default
-        if found_keywords:
-            return ", ".join(found_keywords)
-        else:
-            return "musical"
-
-    def _generate_long_audio_sliding_window(self, prompt, total_duration, params):
-        """
-        Generate long audio using Meta's sliding window approach:
-        - Generate 30-second chunks
-        - Slide window by 10 seconds
-        - Crossfade overlapping sections to maintain continuity
-        """
-        # Initialize variables
-        segment_duration = self.max_segment_duration  # 30 seconds per segment
-        slide_window = 10  # Slide by 10 seconds for each new segment
-        overlap_duration = segment_duration - slide_window  # 20 seconds of overlap
-
-        # Number of segments needed (rounding up)
-        num_segments = math.ceil((total_duration - overlap_duration) / slide_window) + 1
-
-        # Initialize audio array
-        final_audio = None
-
-        # Setup generation kwargs
-        generation_kwargs = {
-            "do_sample": True,
-            "guidance_scale": 3.0
-        }
-
-        # Add additional parameters if provided
-        if "top_k" in params:
-            generation_kwargs["top_k"] = min(int(params["top_k"]), 500)
-
-        if "temperature" in params:
-            temp = float(params["temperature"])
-            if temp > 0.1:
-                generation_kwargs["temperature"] = min(temp, 1.5)
-            else:
-                # A slightly lower temperature helps with style consistency
-                generation_kwargs["temperature"] = 0.95
-
-        if "guidance_scale" in params:
-            generation_kwargs["guidance_scale"] = min(float(params["guidance_scale"]), 3.0)
-        elif "cfg_coef" in params:
-            generation_kwargs["guidance_scale"] = min(float(params["cfg_coef"]), 3.0)
-
-        logger.info(f"Long audio generation using sliding window approach, {num_segments} segments")
-        logger.info(f"Generation parameters: {generation_kwargs}")
-
-        # Extract style keywords for better continuity
-        style_keywords = self._extract_style_keywords(prompt)
-
-        for i in range(num_segments):
+    def _advanced_crossfade(self, segment1, segment2, overlap_samples):
+        """Apply an advanced equal-power crossfade between segments."""
+        # Get the length of the segments
+        length1 = segment1.shape[1]
+        length2 = segment2.shape[1]
+
+        # Ensure we have enough samples for crossfading
+        overlap_samples = min(overlap_samples, length1, length2)
+
+        # Create the result array (total length minus overlap)
+        result_length = length1 + length2 - overlap_samples
+        result = np.zeros((segment1.shape[0], result_length), dtype=segment1.dtype)
+
+        # Copy the non-overlapping part of segment1
+        result[:, :length1-overlap_samples] = segment1[:, :length1-overlap_samples]
+
+        # Copy the non-overlapping part of segment2
+        result[:, length1:] = segment2[:, overlap_samples:]
+
+        # Apply equal-power crossfade to the overlapping parts
+        if overlap_samples > 0:
+            # Equal power crossfade curves (cosine/sine based)
+            t = np.linspace(0, np.pi/2, overlap_samples)
+            fade_out = np.cos(t)**2
+            fade_in = np.sin(t)**2
+
+            # Get the overlapping parts
+            segment1_end = segment1[:, -overlap_samples:].copy()
+            segment2_start = segment2[:, :overlap_samples].copy()
+
+            # Apply the fades
+            for ch in range(segment1_end.shape[0]):
+                segment1_end[ch] *= fade_out
+                segment2_start[ch] *= fade_in
+
+            # Combine the faded parts
+            crossfaded = segment1_end + segment2_start
+
+            # Add to the result
+            result[:, length1-overlap_samples:length1] = crossfaded
+
+        return result
+
+    def _generate_long_audio(self, prompt, total_duration, params):
+        """Generate long audio with improved segment continuity."""
+        # Overlap duration for crossfade
+        overlap_duration = 5  # Using a longer overlap for better transitions
+
+        # Initialize variables
+        remaining_duration = total_duration
+        final_audio = None
+        segment_idx = 0
+
+        # Calculate number of segments needed
+        segment_duration = self.max_segment_duration
+        overlap_samples = int(overlap_duration * self.sampling_rate)
+
+        # Process in segments
+        while remaining_duration > 0:
             # Calculate segment duration
-            if i == 0:
-                # First segment is always the full segment duration
-                current_segment_duration = segment_duration
-            else:
-                # Calculate remaining duration (accounting for overlap)
-                remaining_duration = total_duration - (i * slide_window)
-                if remaining_duration <= 0:
-                    break
-
-                # Last segment might be shorter
-                current_segment_duration = min(segment_duration, remaining_duration + overlap_duration)
+            target_duration = min(segment_duration, remaining_duration + (segment_idx > 0) * overlap_duration)
+
+            logger.info(f"Generating segment {segment_idx+1}, duration: {target_duration:.1f}s")
 
             try:
-                # Create enhanced prompt for better continuity
-                if i == 0:
+                # The main change: We directly use continuation prompts without trying prompt_audio
+                if segment_idx == 0:
+                    # First segment with basic prompt
                     segment_prompt = prompt
                 else:
-                    # Add continuation instructions with style keywords
-                    segment_prompt = f"{prompt} [continuation keeping same {style_keywords} style]"
-
-                logger.info(f"Generating segment {i+1}/{num_segments}, duration: {current_segment_duration:.1f}s")
-                logger.info(f"Segment prompt: {segment_prompt}")
+                    # Subsequent segments with enhanced continuation prompt
+                    segment_prompt = f"{prompt} [continuing segment {segment_idx+1}, seamless continuation]"
 
-                # Process text input
+                # Process text for this segment
                 inputs = self.processor(
                     text=[segment_prompt],
                     padding=True,
@@ -318,82 +291,92 @@ class EndpointHandler:
                 ).to("cuda")
 
                 # Calculate max_new_tokens from duration
-                max_new_tokens = int(current_segment_duration * 50)
-                generation_kwargs["max_new_tokens"] = max_new_tokens
-
-                # Generate segment
-                with torch.inference_mode():
-                    outputs = self.model.generate(**inputs, **generation_kwargs)
-
-                # Get the audio data
-                segment_audio = outputs[0].cpu().numpy()
-
-                # If this is the first segment, just keep it
-                if final_audio is None:
-                    final_audio = segment_audio
+                max_new_tokens = int(target_duration * 50)
+
+                # Generation parameters for transformers implementation
+                generation_kwargs = {
+                    "max_new_tokens": max_new_tokens,
+                    "do_sample": True,
+                    "guidance_scale": 3.0
+                }
+
+                # Add additional parameters if provided
+                if "top_k" in params:
+                    generation_kwargs["top_k"] = min(int(params["top_k"]), 500)
+
+                if "temperature" in params:
+                    temp = float(params["temperature"])
+                    if temp > 0.1:
+                        generation_kwargs["temperature"] = min(temp, 1.5)
+
+                if "guidance_scale" in params:
+                    generation_kwargs["guidance_scale"] = min(float(params["guidance_scale"]), 3.0)
+                elif "cfg_coef" in params:
+                    generation_kwargs["guidance_scale"] = min(float(params["cfg_coef"]), 3.0)
+
+                # Generate this segment
+                outputs = self.model.generate(**inputs, **generation_kwargs)
+                segment_output = outputs[0].cpu().numpy()
+
+                # Add this segment to our final output
+                if segment_idx == 0:
+                    final_audio = segment_output
                 else:
-                    # For subsequent segments, we need to crossfade
-                    overlap_samples = int(overlap_duration * self.sampling_rate)
-                    crossfade_samples = int(3.0 * self.sampling_rate)  # 3-second crossfade
-
-                    # Ensure the segment is long enough for crossfading
-                    if segment_audio.shape[1] < overlap_samples:
-                        logger.warning(f"Segment {i+1} too short for proper crossfade, using concatenation")
-                        final_audio = np.concatenate([final_audio, segment_audio], axis=1)
-                    else:
-                        # Calculate where to crossfade
-                        current_length = final_audio.shape[1]
-                        segment_offset = current_length - overlap_samples
-
-                        # Create a new combined audio array with room for the new segment
-                        new_length = segment_offset + segment_audio.shape[1]
-                        combined_audio = np.zeros((final_audio.shape[0], new_length), dtype=final_audio.dtype)
-
-                        # Copy the existing audio
-                        combined_audio[:, :segment_offset] = final_audio[:, :segment_offset]
-
-                        # Crossfade the overlapping region
-                        crossfade_region = min(crossfade_samples, overlap_samples)
-
-                        # Calculate crossfade weights (equal power)
-                        t = np.linspace(0, np.pi/2, crossfade_region)
-                        fade_out = np.cos(t)**2
-                        fade_in = np.sin(t)**2
-
-                        # Apply crossfade at the transition point
-                        for ch in range(final_audio.shape[0]):
-                            # Crossfade
-                            combined_audio[ch, segment_offset:segment_offset+crossfade_region] = (
-                                final_audio[ch, segment_offset:segment_offset+crossfade_region] * fade_out +
-                                segment_audio[ch, :crossfade_region] * fade_in
-                            )
-
-                            # Copy the rest of the new segment (after crossfade)
-                            combined_audio[ch, segment_offset+crossfade_region:] = segment_audio[ch, crossfade_region:]
-
-                        final_audio = combined_audio
+                    # Apply advanced crossfade for better transitions
+                    final_audio = self._advanced_crossfade(final_audio, segment_output, overlap_samples)
+
+                # Update remaining duration
+                if segment_idx == 0:
+                    remaining_duration -= target_duration
+                else:
+                    remaining_duration -= (target_duration - overlap_duration)
 
                 # Clear CUDA cache
                 torch.cuda.empty_cache()
 
+                # Log progress
+                logger.info(f"GPU memory usage: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
+                logger.info(f"Remaining duration: {remaining_duration:.1f}s")
+
             except Exception as e:
-                logger.error(f"Error generating segment {i+1}: {e}")
-                # If we have some output, return it
+                logger.error(f"Error generating segment {segment_idx+1}: {e}")
                 if final_audio is not None:
-                    break
-                # Otherwise, try a simpler approach for at least some output
-                return self._generate_short_audio(prompt, min(segment_duration, total_duration), params)
+                    logger.info("Returning partial audio after error")
+                    return final_audio
+
+                # Try again with minimal parameters
+                try:
+                    logger.info("Trying minimal generation parameters")
+                    inputs = self.processor(
+                        text=[prompt],
+                        padding=True,
+                        return_tensors="pt",
+                    ).to("cuda")
+
+                    outputs = self.model.generate(
+                        **inputs,
+                        max_new_tokens=int(min(target_duration, 15.0) * 50),
+                        do_sample=True
+                    )
+
+                    return outputs[0].cpu().numpy()
+                except Exception as e2:
+                    logger.error(f"Minimal generation also failed: {e2}")
+                    raise e2
+
+            # Move to next segment
+            segment_idx += 1
+
+            # Break if we've generated enough audio
+            if remaining_duration <= 0:
+                break
 
-        # Apply a smooth fade-out at the end
-        if final_audio.shape[1] > self.sampling_rate:
-            fade_samples = min(int(1.0 * self.sampling_rate), final_audio.shape[1] // 10)  # 1-second fade out
-            fade_out = np.linspace(1.0, 0.0, fade_samples)**0.5  # Smooth curve
+        # Apply a smooth fade out to the last 0.5 seconds
+        if final_audio.shape[1] > self.sampling_rate // 2:
+            fade_samples = self.sampling_rate // 2  # 0.5 seconds
+            fade_out = np.linspace(1.0, 0.0, fade_samples)**0.7  # Smooth curve
             for ch in range(final_audio.shape[0]):
                 final_audio[ch, -fade_samples:] *= fade_out
 
-        # Trim to requested duration if needed
-        max_samples = int(total_duration * self.sampling_rate)
-        if final_audio.shape[1] > max_samples:
-            final_audio = final_audio[:, :max_samples]
-
+        # Return the final audio
         return final_audio
 
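Note on the crossfade math: in _advanced_crossfade the fade curves are cos²(t) and sin²(t), and since cos²(t) + sin²(t) = 1 the two weights sum to exactly 1 at every sample, so the overlap never clips and a correlated signal passes through at constant amplitude. Strictly speaking that makes it an equal-gain fade; a textbook equal-power fade would use cos/sin so that the squared weights sum to 1. A minimal standalone sketch of the same blend, with hypothetical demo signals (the function name and the 32 kHz rate are illustrative, not from the commit):

import numpy as np

def equal_power_crossfade(segment1, segment2, overlap_samples):
    """Blend the tail of segment1 into the head of segment2; arrays are (channels, samples)."""
    overlap_samples = min(overlap_samples, segment1.shape[1], segment2.shape[1])
    t = np.linspace(0, np.pi / 2, overlap_samples)
    fade_out = np.cos(t) ** 2  # 1 -> 0 across the overlap
    fade_in = np.sin(t) ** 2   # 0 -> 1 across the overlap; fade_out + fade_in == 1
    blended = segment1[:, -overlap_samples:] * fade_out + segment2[:, :overlap_samples] * fade_in
    return np.concatenate(
        [segment1[:, :-overlap_samples], blended, segment2[:, overlap_samples:]], axis=1
    )

# Hypothetical demo: two one-channel 220 Hz sine segments at 32 kHz, 0.5 s overlap
sr = 32000
a = np.sin(2 * np.pi * 220 * np.arange(sr) / sr)[None, :]
b = np.sin(2 * np.pi * 220 * np.arange(sr) / sr)[None, :]
out = equal_power_crossfade(a, b, sr // 2)
assert out.shape == (1, 2 * sr - sr // 2)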
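Note on the token and duration budget: max_new_tokens = int(duration * 50) matches MusicGen's roughly 50 audio tokens per second, and every segment after the first re-generates overlap_duration seconds that the crossfade then consumes, which is why the loop subtracts target_duration - overlap_duration from the remaining time. A small sketch of that bookkeeping (plan_segments is illustrative; max_segment_duration = 30 is assumed from the original comments):

def plan_segments(total_duration, segment_duration=30, overlap_duration=5):
    """Mirror the _generate_long_audio loop: return (target_duration, max_new_tokens) per segment."""
    plan, remaining, idx = [], total_duration, 0
    while remaining > 0:
        target = min(segment_duration, remaining + (idx > 0) * overlap_duration)
        plan.append((target, int(target * 50)))  # ~50 MusicGen tokens per generated second
        remaining -= target if idx == 0 else target - overlap_duration
        idx += 1
    return plan

# 70 s request -> 30 s, then 30 s (25 s new after the 5 s crossfade), then 20 s (15 s new)
print(plan_segments(70))  # [(30, 1500), (30, 1500), (20, 1000)]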