Update handler.py
handler.py +67 -196

handler.py CHANGED
@@ -1,59 +1,16 @@
-from typing import Dict, Any
+from typing import Dict, Any
 import torch
 import numpy as np
 import json
-import time
-import gc
-from threading import Lock
-import os
 
 class EndpointHandler:
     def __init__(self, path=""):
-        """Initialize model on startup
+        """Initialize model on startup"""
         try:
             from audiocraft.models import MusicGen
 
-            # Configure PyTorch for better GPU performance
-            torch.backends.cudnn.benchmark = True # Enable cuDNN auto-tuner
-            torch.backends.cuda.matmul.allow_tf32 = True # Allow TF32 on Ampere+ GPUs
-            torch.backends.cudnn.allow_tf32 = True # Allow TF32 for cuDNN
-
-            # Set environment variables for better GPU performance
-            os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" # Optimize CUDA connections
-
             # Load model - using melody model which supports text and melody inputs
-            # Can also use 'facebook/musicgen-large' for even higher quality at more GPU usage
             self.model = MusicGen.get_pretrained('melody')
-
-            # Get GPU memory stats before model optimization
-            gpu_mem_before = torch.cuda.memory_allocated() / (1024 ** 3) # in GB
-            print(f"GPU memory used before optimization: {gpu_mem_before:.2f} GB")
-
-            # Optimize model for inference
-            self.model.eval() # Set model to evaluation mode
-
-            # Use mixed precision for faster inference
-            self.fp16_mode = True
-            if self.fp16_mode:
-                self.model = self.model.half() # Convert to FP16 for faster inference
-                print("Model converted to FP16 for faster inference")
-
-            # Optional: Use torch.compile() for PyTorch 2.0+ (significant speedup)
-            try:
-                if hasattr(torch, 'compile'):
-                    # For PyTorch 2.0+, enable torch.compile for faster inference
-                    self.model.lm = torch.compile(self.model.lm, mode="reduce-overhead")
-                    print("Using torch.compile() for optimized inference")
-            except Exception as compile_error:
-                print(f"Warning: torch.compile optimization failed: {compile_error}")
-
-            # Cache the model on GPU
-            self.model = self.model.cuda()
-
-            # Apply CUDA graph optimization for repeated workloads of the same size
-            self.use_cuda_graphs = False # Enable if generating fixed-size outputs repeatedly
-
-            # Track model sample rate
             self.sample_rate = self.model.sample_rate
 
             # Set default generation parameters
@@ -62,57 +19,13 @@ class EndpointHandler:
                 top_k=250,
                 duration=30 # Default segment length
             )
-
-            # Create a batch processing queue for multiple requests
-            self.batch_size = 1 # Can be increased for batch processing
-            self.request_lock = Lock() # Lock for thread safety
-
-            # Get GPU memory after optimization
-            torch.cuda.synchronize() # Ensure GPU operations are complete
-            gpu_mem_after = torch.cuda.memory_allocated() / (1024 ** 3) # in GB
-            print(f"GPU memory used after optimization: {gpu_mem_after:.2f} GB")
-            print(f"Additional memory used: {gpu_mem_after - gpu_mem_before:.2f} GB")
-
-            # Warm up the model with a dummy forward pass
-            self._warmup_model()
-
         except Exception as e:
             # Keep critical error logging only
             print(f"CRITICAL: Failed to initialize model: {e}")
             raise
 
-    def _warmup_model(self):
-        """Perform a warm-up inference to initialize CUDA kernels"""
-        try:
-            print("Warming up model with dummy inference...")
-            start_time = time.time()
-
-            # Create a simple prompt for warm-up
-            dummy_prompt = ["warm up"]
-
-            # Set minimal duration for warm-up
-            self.model.set_generation_params(duration=1)
-
-            # Run inference with dummy input
-            with torch.cuda.amp.autocast(enabled=self.fp16_mode):
-                with torch.no_grad():
-                    _ = self.model.generate(dummy_prompt, progress=False)
-
-            # Synchronize GPU to ensure completion
-            torch.cuda.synchronize()
-
-            # Clear GPU cache after warm-up
-            torch.cuda.empty_cache()
-
-            end_time = time.time()
-            print(f"Model warm-up completed in {end_time - start_time:.2f} seconds")
-        except Exception as e:
-            print(f"Warning: Model warm-up failed: {e}")
-
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        """Handle prediction requests
-        start_time = time.time()
-
+        """Handle prediction requests"""
         try:
             # Parse request data
             inputs = data.get("inputs", {})
@@ -138,18 +51,8 @@
             if not prompt:
                 return {"error": "No prompt provided"}
 
-            # Generate music
-
-            audio_data = self.generate_music(prompt, duration, parameters)
-
-            # Log processing time for monitoring performance
-            end_time = time.time()
-            processing_time = end_time - start_time
-            print(f"Request processed in {processing_time:.2f} seconds (duration: {duration:.1f}s)")
-
-            # Log GPU memory usage for monitoring
-            gpu_mem = torch.cuda.memory_allocated() / (1024 ** 3) # in GB
-            print(f"Current GPU memory usage: {gpu_mem:.2f} GB")
+            # Generate music
+            audio_data = self.generate_music(prompt, duration, parameters)
 
             return {
                 "generated_audio": audio_data.tolist(),
@@ -161,17 +64,12 @@ class EndpointHandler:
 
         except Exception as e:
             print(f"ERROR: Request processing failed: {e}")
-            end_time = time.time()
-            print(f"Failed request took {end_time - start_time:.2f} seconds")
             return {"error": str(e)}
 
     def generate_music(self, prompt: str, duration: float, parameters: Dict) -> np.ndarray:
-        """
-        Generate music with proper continuation for long sequences
-        Optimized for maximum GPU utilization
-        """
+        """Generate music with proper continuation for long sequences"""
         try:
-            # Generation parameters
+            # Generation parameters
             segment_duration = min(30, duration) # Max segment length (30s)
             overlap = 5 # Overlap between segments in seconds
 
@@ -185,10 +83,6 @@
                 "cfg_coef": parameters.get("cfg_coef", 3.0)
             }
 
-            # Additional parameters for GPU optimization
-            guidance_scale = parameters.get("guidance_scale", generation_params["cfg_coef"])
-            generation_params["cfg_coef"] = guidance_scale # Support both parameter names
-
             # Set generation parameters
             self.model.set_generation_params(**generation_params)
 
@@ -196,91 +90,68 @@
             if isinstance(prompt, str):
                 prompt = [prompt]
 
-            #
[contents of removed lines 200-247 are not shown in the source view]
-                    progress=False
-                )
-                torch.cuda.synchronize() # Ensure generation is complete
-                cont_end = time.time()
-
-                # Join segments (removing overlap from first segment)
-                segment = torch.cat([segment[:, :, :-overlap*self.sample_rate], next_segment], 2)
-
-                # Update remaining duration
-                if remaining_duration < segment_duration - overlap:
-                    seg_duration = remaining_duration
-                    remaining_duration = 0
-                else:
-                    seg_duration = segment_duration - overlap
-                    remaining_duration -= seg_duration
-
-                segment_count += 1
-                print(f"Segment {segment_count} ({seg_duration:.1f}s) generated in {cont_end - cont_start:.2f} seconds")
-
-            # Trim to exact requested duration if needed
-            max_samples = int(duration * self.sample_rate)
-            if segment.shape[2] > max_samples:
-                segment = segment[:, :, :max_samples]
-
-            # Convert to numpy array
-            audio_data = segment.detach().cpu().float()[0].numpy()
-
-            # Optional memory cleanup after large generations
-            if duration > 60:
-                # Clear GPU cache after large generations
-                torch.cuda.empty_cache()
+            # Generate first segment
+            segment = self.model.generate(prompt, progress=False) # Disabled progress tracking
+
+            # If duration is less than or equal to segment_duration, we're done
+            if duration <= segment_duration:
+                # Trim to exact requested duration if needed
+                max_samples = int(duration * self.sample_rate)
+                if segment.shape[2] > max_samples:
+                    segment = segment[:, :, :max_samples]
+                audio_data = segment.detach().cpu().float()[0].numpy()
+                return audio_data
+
+            # Track remaining duration for multi-segment generation
+            remaining_duration = duration - segment_duration + overlap
+            segment_count = 1
+
+            # Continue generating segments until we reach desired duration
+            while remaining_duration > 0:
+                # Adjust segment duration for last segment if needed
+                if remaining_duration < segment_duration - overlap:
+                    current_segment_duration = remaining_duration + overlap
+                    self.model.set_generation_params(
+                        use_sampling=generation_params["use_sampling"],
+                        top_k=generation_params["top_k"],
+                        top_p=generation_params["top_p"],
+                        temperature=generation_params["temperature"],
+                        duration=current_segment_duration,
+                        cfg_coef=generation_params["cfg_coef"]
+                    )
+
+                # Extract last few seconds of current segment for continuation
+                last_seconds = segment[:, :, -overlap*self.sample_rate:]
+
+                # Generate continuation
+                next_segment = self.model.generate_continuation(
+                    last_seconds,
+                    self.sample_rate,
+                    prompt,
+                    progress=False # Disabled progress tracking
+                )
+
+                # Join segments (removing overlap from first segment)
+                segment = torch.cat([segment[:, :, :-overlap*self.sample_rate], next_segment], 2)
+
+                # Update remaining duration
+                if remaining_duration < segment_duration - overlap:
+                    remaining_duration = 0
+                else:
+                    remaining_duration -= (segment_duration - overlap)
 
-
+                segment_count += 1
+
+            # Trim to exact requested duration if needed
+            max_samples = int(duration * self.sample_rate)
+            if segment.shape[2] > max_samples:
+                segment = segment[:, :, :max_samples]
+
+            # Convert to numpy array
+            audio_data = segment.detach().cpu().float()[0].numpy()
+
+            return audio_data
 
         except Exception as e:
             print(f"ERROR: Music generation failed: {e}")
-            # Clean up GPU memory on error
-            torch.cuda.empty_cache()
             raise
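For reference, the updated handler can be exercised locally before redeploying the endpoint. The sketch below is a hypothetical smoke test, not part of this commit: only data["inputs"] and the "generated_audio" response field are visible in the diff, so the "prompt", "duration", and "parameters" keys are assumptions, and the script presumes a CUDA-capable machine with audiocraft installed.

# Hypothetical local smoke test for the updated handler (not part of the commit).
# The "prompt"/"duration"/"parameters" keys inside "inputs" are assumed; only
# data["inputs"] and the "generated_audio" response field appear in the diff.
import numpy as np

from handler import EndpointHandler

handler = EndpointHandler()

request = {
    "inputs": {
        "prompt": "warm lo-fi piano with soft drums",        # assumed key
        "duration": 45,                                       # assumed key
        "parameters": {"temperature": 1.0, "cfg_coef": 3.0},  # assumed key
    }
}

response = handler(request)
if "error" in response:
    print("Request failed:", response["error"])
else:
    audio = np.asarray(response["generated_audio"])  # (channels, samples) per the handler
    print(f"Got {audio.shape[-1] / handler.sample_rate:.1f} s of audio at {handler.sample_rate} Hz")

A 45-second request exercises both paths in generate_music: the first generate call produces a 30-second segment, then the while loop runs once, generating a continuation from the last 5 seconds and trimming the stitched result to the requested length.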