vincentamato committed
Commit 69defc9 · 1 Parent(s): e15e4d5

Initial commit
.gitignore ADDED
@@ -0,0 +1,41 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ env/
26
+ ENV/
27
+
28
+ # IDE
29
+ .idea/
30
+ .vscode/
31
+ *.swp
32
+ *.swo
33
+
34
+ # Generated files
35
+ output/
36
+ model_cache/
37
+ *.wav
38
+ *.mid
39
+
40
+ # Example files are tracked normally (no LFS needed)
41
+ !examples/
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: ARIA
- emoji: 🦀
+ title: Aria
+ emoji: 📉
  colorFrom: indigo
- colorTo: gray
+ colorTo: red
  sdk: gradio
  sdk_version: 5.13.1
  app_file: app.py
app.py ADDED
@@ -0,0 +1,376 @@
1
+ import os
2
+ import sys
3
+ import gradio as gr
4
+ import torch
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ from PIL import Image
8
+ from huggingface_hub import hf_hub_download
9
+ import pretty_midi
10
+ import librosa
11
+ import soundfile as sf
12
+ from midi2audio import FluidSynth
13
+
14
+ from aria.image_encoder import ImageEncoder
15
+ from aria.aria import ARIA
16
+
17
+ print("Checking model files...")
18
+ # Pre-download all model files at startup
19
+ MODEL_FILES = {
20
+ "image_encoder": "image_encoder.pt",
21
+ "continuous_concat": ["continuous_concat/model.pt", "continuous_concat/mappings.pt", "continuous_concat/model_config.pt"],
22
+ "continuous_token": ["continuous_token/model.pt", "continuous_token/mappings.pt", "continuous_token/model_config.pt"],
23
+ "discrete_token": ["discrete_token/model.pt", "discrete_token/mappings.pt", "discrete_token/model_config.pt"]
24
+ }
25
+
26
+ # Create cache directory
27
+ CACHE_DIR = os.path.join(os.path.dirname(__file__), "model_cache")
28
+ os.makedirs(CACHE_DIR, exist_ok=True)
29
+
30
+ # Download and cache all files
31
+ cached_files = {}
32
+ for model_type, files in MODEL_FILES.items():
33
+ if isinstance(files, str):
34
+ files = [files]
35
+
36
+ cached_files[model_type] = []
37
+ for file in files:
38
+ try:
39
+ # Check if file already exists in cache
40
+ repo_id = "vincentamato/aria"
41
+ cached_path = os.path.join(CACHE_DIR, repo_id, file)
42
+ if os.path.exists(cached_path):
43
+ print(f"Using cached file: {file}")
44
+ cached_files[model_type].append(cached_path)
45
+ else:
46
+ print(f"Downloading file: {file}")
47
+ cached_path = hf_hub_download(
48
+ repo_id=repo_id,
49
+ filename=file,
50
+ cache_dir=CACHE_DIR
51
+ )
52
+ cached_files[model_type].append(cached_path)
53
+ except Exception as e:
54
+ print(f"Error with file {file}: {str(e)}")
55
+
56
+ print("Model files ready.")
57
+
58
+ # Global model cache
59
+ models = {}
60
+
61
+ def create_emotion_plot(valence, arousal):
62
+ """Create a valence-arousal plot with the predicted emotion point"""
63
+ fig = plt.figure(figsize=(8, 8), dpi=100)
64
+ ax = fig.add_subplot(111)
65
+
66
+ # Set background color and style
67
+ plt.style.use('default') # Use default style instead of seaborn
68
+ fig.patch.set_facecolor('#ffffff')
69
+ ax.set_facecolor('#ffffff')
70
+
71
+ # Create the coordinate system with a light grid
72
+ ax.grid(True, linestyle='--', alpha=0.2)
73
+ ax.axhline(y=0, color='#666666', linestyle='-', alpha=0.3, linewidth=1)
74
+ ax.axvline(x=0, color='#666666', linestyle='-', alpha=0.3, linewidth=1)
75
+
76
+ # Plot region
77
+ circle = plt.Circle((0, 0), 1, fill=False, color='#666666', alpha=0.3, linewidth=1.5)
78
+ ax.add_artist(circle)
79
+
80
+ # Add labels with nice fonts
81
+ font = {'family': 'sans-serif', 'weight': 'medium', 'size': 12}
82
+ label_dist = 1.35 # Increased distance for labels
83
+ ax.text(label_dist, 0, 'Positive', ha='left', va='center', **font)
84
+ ax.text(-label_dist, 0, 'Negative', ha='right', va='center', **font)
85
+ ax.text(0, label_dist, 'Excited', ha='center', va='bottom', **font)
86
+ ax.text(0, -label_dist, 'Calm', ha='center', va='top', **font)
87
+
88
+ # Plot the point with a nice style
89
+ ax.scatter([valence], [arousal], c='#4f46e5', s=150, zorder=5, alpha=0.8)
90
+
91
+ # Set limits and labels with more padding
92
+ ax.set_xlim(-1.6, 1.6)
93
+ ax.set_ylim(-1.6, 1.6)
94
+
95
+ # Format ticks
96
+ ax.set_xticks([-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5])
97
+ ax.set_yticks([-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5])
98
+ ax.tick_params(axis='both', which='major', labelsize=10)
99
+
100
+ # Add axis labels with padding
101
+ ax.set_xlabel('Valence', **font, labelpad=15)
102
+ ax.set_ylabel('Arousal', **font, labelpad=15)
103
+
104
+ # Remove spines
105
+ for spine in ax.spines.values():
106
+ spine.set_visible(False)
107
+
108
+ # Adjust layout with more padding
109
+ plt.tight_layout(pad=1.5)
110
+
111
+ return fig
112
+
113
+ def get_model(conditioning_type):
114
+ """Get or initialize model with specified conditioning"""
115
+ if conditioning_type not in models:
116
+ try:
117
+ # Use cached files
118
+ image_model_path = cached_files["image_encoder"][0]
119
+ midi_model_dir = os.path.dirname(cached_files[conditioning_type][0])
120
+
121
+ models[conditioning_type] = ARIA(
122
+ image_model_checkpoint=image_model_path,
123
+ midi_model_dir=midi_model_dir,
124
+ conditioning=conditioning_type
125
+ )
126
+ except Exception as e:
127
+ print(f"Error initializing {conditioning_type} model: {str(e)}")
128
+ return None
129
+ return models[conditioning_type]
130
+
131
+ def convert_midi_to_wav(midi_path):
132
+ """Convert MIDI file to WAV using FluidSynth"""
133
+ wav_path = midi_path.replace('.mid', '.wav')
134
+
135
+ # If WAV file already exists and is newer than MIDI file, use cached version
136
+ if os.path.exists(wav_path) and os.path.getmtime(wav_path) > os.path.getmtime(midi_path):
137
+ return wav_path
138
+
139
+ try:
140
+ # Check common soundfont locations
141
+ soundfont_paths = [
142
+ '/usr/share/sounds/sf2/FluidR3_GM.sf2', # Linux
143
+ '/usr/share/soundfonts/default.sf2', # Linux alternative
144
+ '/usr/local/share/fluidsynth/generaluser.sf2', # macOS
145
+ 'C:\\soundfonts\\generaluser.sf2' # Windows
146
+ ]
147
+
148
+ soundfont = None
149
+ for sf_path in soundfont_paths:
150
+ if os.path.exists(sf_path):
151
+ soundfont = sf_path
152
+ break
153
+
154
+ if soundfont is None:
155
+ raise RuntimeError("No SoundFont file found. Please install fluid-soundfont-gm package.")
156
+
157
+ # Convert MIDI to WAV using FluidSynth with explicit soundfont
158
+ fs = FluidSynth(sound_font=soundfont)
159
+ fs.midi_to_audio(midi_path, wav_path)
160
+
161
+ return wav_path
162
+ except Exception as e:
163
+ print(f"Error converting MIDI to WAV: {str(e)}")
164
+ return None
165
+
166
+ def generate_music(image, conditioning_type, gen_len, temperature, top_p, min_instruments):
167
+ """Generate music from input image"""
168
+ model = get_model(conditioning_type)
169
+ if model is None:
170
+ return {
171
+ emotion_chart: None,
172
+ midi_output: None,
173
+ results: f"⚠️ Error: Failed to initialize {conditioning_type} model. Please check the logs."
174
+ }
175
+
176
+ try:
177
+ # Create output directory with absolute path
178
+ output_dir = os.path.join(os.path.dirname(__file__), "output")
179
+ os.makedirs(output_dir, exist_ok=True)
180
+
181
+ # Generate music
182
+ valence, arousal, midi_path = model.generate(
183
+ image_path=image,
184
+ out_dir=output_dir,
185
+ gen_len=gen_len,
186
+ temperature=temperature,
187
+ top_k=-1,
188
+ top_p=float(top_p),
189
+ min_instruments=int(min_instruments)
190
+ )
191
+
192
+ # Ensure we have the absolute path to the MIDI file
193
+ if not os.path.isabs(midi_path):
194
+ midi_path = os.path.join(output_dir, midi_path)
195
+
196
+ # Convert MIDI to WAV for playback
197
+ wav_path = convert_midi_to_wav(midi_path)
198
+ if wav_path is None:
199
+ return {
200
+ emotion_chart: None,
201
+ midi_output: None,
202
+ results: "⚠️ Error: Failed to convert MIDI to WAV for playback"
203
+ }
204
+
205
+ # Create emotion plot
206
+ emotion_fig = create_emotion_plot(valence, arousal)
207
+
208
+ return {
209
+ emotion_chart: emotion_fig,
210
+ midi_output: wav_path,
211
+ results: f"""
212
+ **Model Type:** {conditioning_type}
213
+
214
+ **Predicted Emotions:**
215
+ - Valence: {valence:.3f} (negative → positive)
216
+ - Arousal: {arousal:.3f} (calm → excited)
217
+
218
+ **Generation Parameters:**
219
+ - Temperature: {temperature}
220
+ - Top-p: {top_p}
221
+ - Min Instruments: {min_instruments}
222
+
223
+ Your music has been generated! Click the play button above to listen.
224
+ """
225
+ }
226
+ except Exception as e:
227
+ return {
228
+ emotion_chart: None,
229
+ midi_output: None,
230
+ results: f"⚠️ Error generating music: {str(e)}"
231
+ }
232
+
233
+ # Create Gradio interface
234
+ with gr.Blocks(title="ARIA - Art to Music Generator", theme=gr.themes.Soft(
235
+ primary_hue="indigo",
236
+ secondary_hue="slate",
237
+ font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"]
238
+ )) as demo:
239
+ gr.Markdown("""
240
+ # 🎨 ARIA: Artistic Rendering of Images into Audio
241
+
242
+ Upload an image and ARIA will analyze its emotional content to generate matching music!
243
+
244
+ ### How it works:
245
+ 1. ARIA first analyzes the emotional content of your image along two dimensions:
246
+ - **Valence**: How positive or negative the emotion is (-1 to 1)
247
+ - **Arousal**: How calm or excited the emotion is (-1 to 1)
248
+ 2. These emotions are then used to generate music that matches the mood
249
+ """)
250
+
251
+ with gr.Row():
252
+ with gr.Column(scale=3):
253
+ image_input = gr.Image(
254
+ type="filepath",
255
+ label="Upload Image"
256
+ )
257
+
258
+ with gr.Group():
259
+ gr.Markdown("### Generation Settings")
260
+
261
+ with gr.Row():
262
+ with gr.Column():
263
+ conditioning_type = gr.Radio(
264
+ choices=["continuous_concat", "continuous_token", "discrete_token"],
265
+ value="continuous_concat",
266
+ label="Conditioning Type",
267
+ info="How the emotional information is incorporated into the music generation"
268
+ )
269
+ with gr.Column():
270
+ gen_len = gr.Slider(
271
+ minimum=256,
272
+ maximum=4096,
273
+ value=1024,
274
+ step=256,
275
+ label="Generation Length",
276
+ info="Number of tokens to generate (longer = more music)"
277
+ )
278
+
279
+ with gr.Row():
280
+ with gr.Column():
281
+ note_temperature = gr.Slider(
282
+ minimum=0.1,
283
+ maximum=2.0,
284
+ value=1.2,
285
+ step=0.1,
286
+ label="Note Temperature",
287
+ info="Controls randomness of note generation"
288
+ )
289
+ with gr.Column():
290
+ rest_temperature = gr.Slider(
291
+ minimum=0.1,
292
+ maximum=2.0,
293
+ value=1.2,
294
+ step=0.1,
295
+ label="Rest Temperature",
296
+ info="Controls randomness of rest/timing generation"
297
+ )
298
+
299
+ with gr.Row():
300
+ with gr.Column():
301
+ top_p = gr.Slider(
302
+ minimum=0.1,
303
+ maximum=1.0,
304
+ value=0.6,
305
+ step=0.1,
306
+ label="Top-p Sampling",
307
+ info="Nucleus sampling threshold - lower = more focused"
308
+ )
309
+ with gr.Column():
310
+ min_instruments = gr.Slider(
311
+ minimum=1,
312
+ maximum=5,
313
+ value=2,
314
+ step=1,
315
+ label="Minimum Instruments",
316
+ info="Minimum number of instruments in the generated music"
317
+ )
318
+
319
+ generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg")
320
+
321
+ # Add examples
322
+ gr.Examples(
323
+ examples=[
324
+ ["examples/happy.jpg", "continuous_concat", 1024, 1.2, 1.2, 0.6, 2],
325
+ ["examples/sad.jpeg", "continuous_token", 1024, 1.2, 1.2, 0.6, 2],
326
+ ],
327
+ inputs=[image_input, conditioning_type, gen_len, note_temperature, rest_temperature, top_p, min_instruments],
328
+ label="Try these examples"
329
+ )
330
+
331
+ with gr.Column(scale=2):
332
+ emotion_chart = gr.Plot(
333
+ label="Predicted Emotions"
334
+ )
335
+ midi_output = gr.Audio(
336
+ type="filepath",
337
+ label="Generated Music"
338
+ )
339
+ results = gr.Markdown()
340
+
341
+ gr.Markdown("""
342
+ ### About ARIA
343
+
344
+ ARIA is a deep learning system that generates music from artwork by:
345
+ 1. Using an image emotion model to extract emotional content from images
346
+ 2. Generating matching music using an emotion-conditioned music generation model
347
+
348
+ The emotion-conditioned MIDI generation model is based on the work by Serkan Sulun et al. in their paper
349
+ ["Symbolic music generation conditioned on continuous-valued emotions"](https://ieeexplore.ieee.org/document/9762257).
350
+ Original implementation: [github.com/serkansulun/midi-emotion](https://github.com/serkansulun/midi-emotion)
351
+
352
+ ### Conditioning Types
353
+ - **continuous_concat**: Emotions are concatenated with music features (recommended)
354
+ - **continuous_token**: Emotions are added as special tokens
355
+ - **discrete_token**: Emotions are discretized into tokens
356
+ """)
357
+
358
+ def generate_music_wrapper(image, conditioning_type, gen_len, note_temp, rest_temp, top_p, min_instruments):
359
+ """Wrapper for generate_music that handles separate temperatures"""
360
+ return generate_music(
361
+ image=image,
362
+ conditioning_type=conditioning_type,
363
+ gen_len=gen_len,
364
+ temperature=[float(note_temp), float(rest_temp)],
365
+ top_p=top_p,
366
+ min_instruments=min_instruments
367
+ )
368
+
369
+ generate_btn.click(
370
+ fn=generate_music_wrapper,
371
+ inputs=[image_input, conditioning_type, gen_len, note_temperature, rest_temperature, top_p, min_instruments],
372
+ outputs=[emotion_chart, midi_output, results]
373
+ )
374
+
375
+ # Launch app
376
+ demo.launch(share=True)
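
Note that audio playback depends on FluidSynth and a General MIDI SoundFont being present at one of the paths checked in `convert_midi_to_wav` above. For reference, a minimal standalone sketch of the same conversion using the midi2audio API already imported in app.py; the SoundFont and file paths are placeholders, not files shipped with this repository:

```python
# Minimal sketch: convert a generated MIDI file to WAV, as app.py does.
# The .sf2 path is a placeholder; install a GM SoundFont (e.g. the
# fluid-soundfont-gm package) and point sound_font at its location.
from midi2audio import FluidSynth

fs = FluidSynth(sound_font="/usr/share/sounds/sf2/FluidR3_GM.sf2")
fs.midi_to_audio("output/example.mid", "output/example.wav")
```
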
aria/aria.py ADDED
@@ -0,0 +1,121 @@
1
+ import torch
2
+ import os
3
+ from PIL import Image
4
+ import numpy as np
5
+ import datetime
6
+
7
+ from .image_encoder import ImageEncoder
8
+
9
+ # Add MIDI emotion model path to Python path
10
+ import sys
11
+ MIDI_EMOTION_PATH = os.path.join(os.path.dirname(__file__), "..", "midi_emotion", "src")
12
+ sys.path.append(MIDI_EMOTION_PATH)
13
+
14
+ class ARIA:
15
+ """ARIA model that generates music from images based on emotional content."""
16
+
17
+ def __init__(
18
+ self,
19
+ image_model_checkpoint: str,
20
+ midi_model_dir: str,
21
+ conditioning: str = "continuous_concat",
22
+ device: str = None
23
+ ):
24
+ """Initialize ARIA model.
25
+
26
+ Args:
27
+ image_model_checkpoint: Path to image emotion model checkpoint
28
+ midi_model_dir: Path to midi emotion model directory
29
+ conditioning: Type of conditioning to use (continuous_concat, continuous_token, discrete_token)
30
+ device: Device to run on (default: auto-detect)
31
+ """
32
+ self.device = torch.device("cuda" if torch.cuda.is_available() and not device == "cpu" else "cpu")
33
+ self.conditioning = conditioning
34
+
35
+ # Load image emotion model
36
+ self.image_model = ImageEncoder()
37
+ checkpoint = torch.load(image_model_checkpoint, map_location=self.device, weights_only=True)
38
+ self.image_model.load_state_dict(checkpoint["model_state_dict"])
39
+ self.image_model.eval()
40
+
41
+ # Import midi generation
42
+ from midi_emotion.src.generate import generate
43
+ from midi_emotion.src.models.build_model import build_model
44
+ self.generate_midi = generate
45
+
46
+ # Load midi model
47
+ model_fp = os.path.join(midi_model_dir, 'model.pt')
48
+ mappings_fp = os.path.join(midi_model_dir, 'mappings.pt')
49
+ config_fp = os.path.join(midi_model_dir, 'model_config.pt')
50
+
51
+ self.maps = torch.load(mappings_fp, weights_only=True)
52
+ config = torch.load(config_fp, weights_only=True)
53
+ self.midi_model, _ = build_model(None, load_config_dict=config)
54
+ self.midi_model = self.midi_model.to(self.device)
55
+ self.midi_model.load_state_dict(torch.load(model_fp, map_location=self.device, weights_only=True))
56
+ self.midi_model.eval()
57
+
58
+ def generate(
59
+ self,
60
+ image_path: str,
61
+ out_dir: str = "output",
62
+ gen_len: int = 2048,
63
+ temperature: list = [1.2, 1.2],
64
+ top_k: int = -1,
65
+ top_p: float = 0.7,
66
+ min_instruments: int = 2
67
+ ) -> tuple[float, float, str]:
68
+ """Generate music from an image.
69
+
70
+ Args:
71
+ image_path: Path to input image
72
+ out_dir: Directory to save generated MIDI
73
+ gen_len: Length of generation in tokens
74
+ temperature: Temperature for sampling [note_temp, rest_temp]
75
+ top_k: Top-k sampling (-1 to disable)
76
+ top_p: Top-p sampling threshold
77
+ min_instruments: Minimum number of instruments required
78
+
79
+ Returns:
80
+ Tuple of (valence, arousal, midi_path)
81
+ """
82
+ # Get emotion from image
83
+ image = Image.open(image_path).convert("RGB")
84
+ with torch.no_grad():
85
+ valence, arousal = self.image_model(image)
86
+ valence = valence.squeeze().cpu().item()
87
+ arousal = arousal.squeeze().cpu().item()
88
+
89
+ # Create output directory
90
+ os.makedirs(out_dir, exist_ok=True)
91
+
92
+ # Generate MIDI
93
+ continuous_conditions = np.array([[valence, arousal]], dtype=np.float32)
94
+
95
+ # Generate timestamp for filename (for reference)
96
+ now = datetime.datetime.now()
97
+ timestamp = now.strftime("%Y_%m_%d_%H_%M_%S")
98
+
99
+ # Generate the MIDI
100
+ self.generate_midi(
101
+ model=self.midi_model,
102
+ maps=self.maps,
103
+ device=self.device,
104
+ out_dir=out_dir,
105
+ conditioning=self.conditioning,
106
+ continuous_conditions=continuous_conditions,
107
+ gen_len=gen_len,
108
+ temperatures=temperature,
109
+ top_k=top_k,
110
+ top_p=top_p,
111
+ min_n_instruments=min_instruments
112
+ )
113
+
114
+ # Find the most recently generated MIDI file
115
+ midi_files = [f for f in os.listdir(out_dir) if f.endswith('.mid')]
116
+ if midi_files:
117
+ # Sort by creation time and get most recent
118
+ midi_path = os.path.join(out_dir, max(midi_files, key=lambda f: os.path.getctime(os.path.join(out_dir, f))))
119
+ return valence, arousal, midi_path
120
+
121
+ raise RuntimeError("Failed to generate MIDI file")
aria/generate.py ADDED
@@ -0,0 +1,61 @@
1
+ import argparse
2
+ from aria.aria import ARIA
3
+
4
+ def main():
5
+ parser = argparse.ArgumentParser(description="Generate music from images based on emotional content")
6
+
7
+ parser.add_argument("--image", type=str, required=True,
8
+ help="Path to input image")
9
+ parser.add_argument("--image_model_checkpoint", type=str, required=True,
10
+ help="Path to image emotion model checkpoint")
11
+ parser.add_argument("--midi_model_dir", type=str, required=True,
12
+ help="Path to midi emotion model directory")
13
+ parser.add_argument("--out_dir", type=str, default="output",
14
+ help="Directory to save generated MIDI")
15
+ parser.add_argument("--gen_len", type=int, default=512,
16
+ help="Length of generation in tokens")
17
+ parser.add_argument("--temperature", type=float, nargs=2, default=[1.2, 1.2],
18
+ help="Temperature for sampling [note_temp, rest_temp]")
19
+ parser.add_argument("--top_k", type=int, default=-1,
20
+ help="Top-k sampling (-1 to disable)")
21
+ parser.add_argument("--top_p", type=float, default=0.7,
22
+ help="Top-p sampling threshold")
23
+ parser.add_argument("--min_instruments", type=int, default=1,
24
+ help="Minimum number of instruments required")
25
+ parser.add_argument("--cpu", action="store_true",
26
+ help="Force CPU inference")
27
+ parser.add_argument("--conditioning", type=str, required=True,
28
+ choices=["none", "discrete_token", "continuous_token", "continuous_concat"],
29
+ help="Type of conditioning to use")
30
+ parser.add_argument("--batch_size", type=int, default=1,
31
+ help="Number of samples to generate (not used for image input)")
32
+
33
+ args = parser.parse_args()
34
+
35
+ # Initialize model
36
+ model = ARIA(
37
+ image_model_checkpoint=args.image_model_checkpoint,
38
+ midi_model_dir=args.midi_model_dir,
39
+ conditioning=args.conditioning,
40
+ device="cpu" if args.cpu else None
41
+ )
42
+
43
+ # Generate music
44
+ valence, arousal, midi_path = model.generate(
45
+ image_path=args.image,
46
+ out_dir=args.out_dir,
47
+ gen_len=args.gen_len,
48
+ temperature=args.temperature,
49
+ top_k=args.top_k,
50
+ top_p=args.top_p,
51
+ min_instruments=args.min_instruments
52
+ )
53
+
54
+ # Print results
55
+ print(f"\nPredicted emotions:")
56
+ print(f"Valence: {valence:.3f} (negative -> positive)")
57
+ print(f"Arousal: {arousal:.3f} (calm -> excited)")
58
+ print(f"\nGenerated MIDI saved to: {midi_path}")
59
+
60
+ if __name__ == "__main__":
61
+ main()
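
For reference, an example invocation of this CLI with placeholder checkpoint paths, assuming it is run from the repository root as a module so the aria package resolves: `python -m aria.generate --image examples/happy.jpg --image_model_checkpoint model_cache/image_encoder.pt --midi_model_dir model_cache/continuous_concat --conditioning continuous_concat --gen_len 1024`.
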
aria/image_encoder.py ADDED
@@ -0,0 +1,91 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import CLIPProcessor, CLIPModel
4
+ from PIL import Image
5
+ from typing import Tuple, Union
6
+
7
+ class ImageEncoder(nn.Module):
8
+ def __init__(self, clip_model_name: str = "openai/clip-vit-large-patch14-336"):
9
+ """Initialize the image encoder using CLIP.
10
+
11
+ Args:
12
+ clip_model_name: HuggingFace model name for CLIP
13
+ """
14
+ super().__init__()
15
+
16
+ # Load CLIP model and processor
17
+ self.clip_model = CLIPModel.from_pretrained(clip_model_name)
18
+ self.processor = CLIPProcessor.from_pretrained(clip_model_name)
19
+
20
+ # Freeze CLIP parameters
21
+ for param in self.clip_model.parameters():
22
+ param.requires_grad = False
23
+
24
+ # Add projection layers for valence and arousal
25
+ hidden_dim = self.clip_model.config.projection_dim
26
+ projection_dim = hidden_dim // 2
27
+
28
+ self.valence_head = nn.Sequential(
29
+ nn.Linear(hidden_dim, projection_dim),
30
+ nn.ReLU(),
31
+ nn.Dropout(0.1),
32
+ nn.Linear(projection_dim, projection_dim // 2),
33
+ nn.ReLU(),
34
+ nn.Dropout(0.1),
35
+ nn.Linear(projection_dim // 2, 1),
36
+ nn.Tanh() # Output between -1 and 1
37
+ )
38
+
39
+ self.arousal_head = nn.Sequential(
40
+ nn.Linear(hidden_dim, projection_dim),
41
+ nn.ReLU(),
42
+ nn.Dropout(0.1),
43
+ nn.Linear(projection_dim, projection_dim // 2),
44
+ nn.ReLU(),
45
+ nn.Dropout(0.1),
46
+ nn.Linear(projection_dim // 2, 1),
47
+ nn.Tanh() # Output between -1 and 1
48
+ )
49
+
50
+ # Move model to GPU if available
51
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
52
+ self.to(self.device)
53
+
54
+ def forward(self, images: Union[Image.Image, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
55
+ """Forward pass to get valence and arousal predictions.
56
+
57
+ Args:
58
+ images: Either PIL images or tensors in CLIP format
59
+
60
+ Returns:
61
+ Tuple of predicted valence and arousal scores
62
+ """
63
+ # Process images if they're PIL images
64
+ if isinstance(images, Image.Image):
65
+ inputs = self.processor(images=images, return_tensors="pt")
66
+ pixel_values = inputs.pixel_values.to(self.device)
67
+ else:
68
+ pixel_values = images.to(self.device)
69
+
70
+ # Get CLIP image features
71
+ image_features = self.clip_model.get_image_features(pixel_values)
72
+
73
+ # Project to valence and arousal scores
74
+ valence = self.valence_head(image_features)
75
+ arousal = self.arousal_head(image_features)
76
+
77
+ return valence, arousal
78
+
79
+ def encode_image(self, image: Image.Image) -> torch.Tensor:
80
+ """Get the raw CLIP image embeddings.
81
+
82
+ Args:
83
+ image: PIL image to encode
84
+
85
+ Returns:
86
+ Image embedding tensor
87
+ """
88
+ inputs = self.processor(images=image, return_tensors="pt")
89
+ with torch.no_grad():
90
+ image_features = self.clip_model.get_image_features(inputs.pixel_values.to(self.device))
91
+ return image_features
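
A minimal sketch of running ImageEncoder on its own, assuming a trained checkpoint saved with a "model_state_dict" key (the same format aria/aria.py loads); the checkpoint and image paths are placeholders:

```python
# Sketch only: checkpoint and image paths are placeholders.
import torch
from PIL import Image
from aria.image_encoder import ImageEncoder

encoder = ImageEncoder()  # downloads the CLIP backbone on first use
state = torch.load("model_cache/image_encoder.pt",
                   map_location=encoder.device, weights_only=True)
encoder.load_state_dict(state["model_state_dict"])
encoder.eval()

image = Image.open("examples/happy.jpg").convert("RGB")
with torch.no_grad():
    valence, arousal = encoder(image)  # each in [-1, 1] via the tanh heads
print(valence.item(), arousal.item())
```
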
examples/happy.jpg ADDED
examples/sad.jpeg ADDED
midi_emotion/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ __pycache__
2
+ .vscode
3
+ data_files/*
4
+ output/*
5
+ !.gitkeep
6
+ .cache
midi_emotion/LICENSE.md ADDED
@@ -0,0 +1,653 @@
1
+ Copyright © 2022 INESC TEC
2
+
3
+ Emotion-based MIDI generator: Uses deep neural networks to create symbolic music (MIDI) based on user-defined emotions from the valence-arousal plane.
4
+
5
+ This software is authored by:
6
+ Serkan Sulun
7
+
8
+
9
+ This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
10
+ This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
11
+ You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
12
+ A commercial license is also available for use in industrial projects and collaborations that do not wish to use the GPL v3 license.
13
+ To obtain the commercial license please contact the INESC TEC Technology Licensing Office (TLO) at [email protected], or
14
+ Campus da Faculdade de Engenharia da Universidade do Porto
15
+ Rua Dr. Roberto Frias
16
+ 4200-465 Porto
17
+ Portugal
18
+
19
+ If needed SAL (INESC TEC Technology Licensing Office - TLO) can assist with all the legal details regarding the licensing agreement
20
+
21
+ If you use Emotion-based MIDI generator in a work that leads to a scientific publication, we would appreciate it if you would kindly cite Emotion-based MIDI generator in your manuscript.
22
+
23
+ S. Sulun, M. E. P. Davies and P. Viana, "Symbolic Music Generation Conditioned on Continuous-Valued Emotions," in IEEE Access, vol. 10, pp. 44617-44626, 2022, doi: 10.1109/ACCESS.2022.3169744.
24
+
25
+ The paper can be found at https://ieeexplore.ieee.org/document/9762257
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+ GNU GENERAL PUBLIC LICENSE
35
+ Version 3, 29 June 2007
36
+
37
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
38
+ Everyone is permitted to copy and distribute verbatim copies
39
+ of this license document, but changing it is not allowed.
40
+
41
+ Preamble
42
+
43
+ The GNU General Public License is a free, copyleft license for
44
+ software and other kinds of works.
45
+
46
+ The licenses for most software and other practical works are designed
47
+ to take away your freedom to share and change the works. By contrast,
48
+ the GNU General Public License is intended to guarantee your freedom to
49
+ share and change all versions of a program--to make sure it remains free
50
+ software for all its users. We, the Free Software Foundation, use the
51
+ GNU General Public License for most of our software; it applies also to
52
+ any other work released this way by its authors. You can apply it to
53
+ your programs, too.
54
+
55
+ When we speak of free software, we are referring to freedom, not
56
+ price. Our General Public Licenses are designed to make sure that you
57
+ have the freedom to distribute copies of free software (and charge for
58
+ them if you wish), that you receive source code or can get it if you
59
+ want it, that you can change the software or use pieces of it in new
60
+ free programs, and that you know you can do these things.
61
+
62
+ To protect your rights, we need to prevent others from denying you
63
+ these rights or asking you to surrender the rights. Therefore, you have
64
+ certain responsibilities if you distribute copies of the software, or if
65
+ you modify it: responsibilities to respect the freedom of others.
66
+
67
+ For example, if you distribute copies of such a program, whether
68
+ gratis or for a fee, you must pass on to the recipients the same
69
+ freedoms that you received. You must make sure that they, too, receive
70
+ or can get the source code. And you must show them these terms so they
71
+ know their rights.
72
+
73
+ Developers that use the GNU GPL protect your rights with two steps:
74
+ (1) assert copyright on the software, and (2) offer you this License
75
+ giving you legal permission to copy, distribute and/or modify it.
76
+
77
+ For the developers' and authors' protection, the GPL clearly explains
78
+ that there is no warranty for this free software. For both users' and
79
+ authors' sake, the GPL requires that modified versions be marked as
80
+ changed, so that their problems will not be attributed erroneously to
81
+ authors of previous versions.
82
+
83
+ Some devices are designed to deny users access to install or run
84
+ modified versions of the software inside them, although the manufacturer
85
+ can do so. This is fundamentally incompatible with the aim of
86
+ protecting users' freedom to change the software. The systematic
87
+ pattern of such abuse occurs in the area of products for individuals to
88
+ use, which is precisely where it is most unacceptable. Therefore, we
89
+ have designed this version of the GPL to prohibit the practice for those
90
+ products. If such problems arise substantially in other domains, we
91
+ stand ready to extend this provision to those domains in future versions
92
+ of the GPL, as needed to protect the freedom of users.
93
+
94
+ Finally, every program is threatened constantly by software patents.
95
+ States should not allow patents to restrict development and use of
96
+ software on general-purpose computers, but in those that do, we wish to
97
+ avoid the special danger that patents applied to a free program could
98
+ make it effectively proprietary. To prevent this, the GPL assures that
99
+ patents cannot be used to render the program non-free.
100
+
101
+ The precise terms and conditions for copying, distribution and
102
+ modification follow.
103
+
104
+ TERMS AND CONDITIONS
105
+
106
+ 0. Definitions.
107
+
108
+ "This License" refers to version 3 of the GNU General Public License.
109
+
110
+ "Copyright" also means copyright-like laws that apply to other kinds of
111
+ works, such as semiconductor masks.
112
+
113
+ "The Program" refers to any copyrightable work licensed under this
114
+ License. Each licensee is addressed as "you". "Licensees" and
115
+ "recipients" may be individuals or organizations.
116
+
117
+ To "modify" a work means to copy from or adapt all or part of the work
118
+ in a fashion requiring copyright permission, other than the making of an
119
+ exact copy. The resulting work is called a "modified version" of the
120
+ earlier work or a work "based on" the earlier work.
121
+
122
+ A "covered work" means either the unmodified Program or a work based
123
+ on the Program.
124
+
125
+ To "propagate" a work means to do anything with it that, without
126
+ permission, would make you directly or secondarily liable for
127
+ infringement under applicable copyright law, except executing it on a
128
+ computer or modifying a private copy. Propagation includes copying,
129
+ distribution (with or without modification), making available to the
130
+ public, and in some countries other activities as well.
131
+
132
+ To "convey" a work means any kind of propagation that enables other
133
+ parties to make or receive copies. Mere interaction with a user through
134
+ a computer network, with no transfer of a copy, is not conveying.
135
+
136
+ An interactive user interface displays "Appropriate Legal Notices"
137
+ to the extent that it includes a convenient and prominently visible
138
+ feature that (1) displays an appropriate copyright notice, and (2)
139
+ tells the user that there is no warranty for the work (except to the
140
+ extent that warranties are provided), that licensees may convey the
141
+ work under this License, and how to view a copy of this License. If
142
+ the interface presents a list of user commands or options, such as a
143
+ menu, a prominent item in the list meets this criterion.
144
+
145
+ 1. Source Code.
146
+
147
+ The "source code" for a work means the preferred form of the work
148
+ for making modifications to it. "Object code" means any non-source
149
+ form of a work.
150
+
151
+ A "Standard Interface" means an interface that either is an official
152
+ standard defined by a recognized standards body, or, in the case of
153
+ interfaces specified for a particular programming language, one that
154
+ is widely used among developers working in that language.
155
+
156
+ The "System Libraries" of an executable work include anything, other
157
+ than the work as a whole, that (a) is included in the normal form of
158
+ packaging a Major Component, but which is not part of that Major
159
+ Component, and (b) serves only to enable use of the work with that
160
+ Major Component, or to implement a Standard Interface for which an
161
+ implementation is available to the public in source code form. A
162
+ "Major Component", in this context, means a major essential component
163
+ (kernel, window system, and so on) of the specific operating system
164
+ (if any) on which the executable work runs, or a compiler used to
165
+ produce the work, or an object code interpreter used to run it.
166
+
167
+ The "Corresponding Source" for a work in object code form means all
168
+ the source code needed to generate, install, and (for an executable
169
+ work) run the object code and to modify the work, including scripts to
170
+ control those activities. However, it does not include the work's
171
+ System Libraries, or general-purpose tools or generally available free
172
+ programs which are used unmodified in performing those activities but
173
+ which are not part of the work. For example, Corresponding Source
174
+ includes interface definition files associated with source files for
175
+ the work, and the source code for shared libraries and dynamically
176
+ linked subprograms that the work is specifically designed to require,
177
+ such as by intimate data communication or control flow between those
178
+ subprograms and other parts of the work.
179
+
180
+ The Corresponding Source need not include anything that users
181
+ can regenerate automatically from other parts of the Corresponding
182
+ Source.
183
+
184
+ The Corresponding Source for a work in source code form is that
185
+ same work.
186
+
187
+ 2. Basic Permissions.
188
+
189
+ All rights granted under this License are granted for the term of
190
+ copyright on the Program, and are irrevocable provided the stated
191
+ conditions are met. This License explicitly affirms your unlimited
192
+ permission to run the unmodified Program. The output from running a
193
+ covered work is covered by this License only if the output, given its
194
+ content, constitutes a covered work. This License acknowledges your
195
+ rights of fair use or other equivalent, as provided by copyright law.
196
+
197
+ You may make, run and propagate covered works that you do not
198
+ convey, without conditions so long as your license otherwise remains
199
+ in force. You may convey covered works to others for the sole purpose
200
+ of having them make modifications exclusively for you, or provide you
201
+ with facilities for running those works, provided that you comply with
202
+ the terms of this License in conveying all material for which you do
203
+ not control copyright. Those thus making or running the covered works
204
+ for you must do so exclusively on your behalf, under your direction
205
+ and control, on terms that prohibit them from making any copies of
206
+ your copyrighted material outside their relationship with you.
207
+
208
+ Conveying under any other circumstances is permitted solely under
209
+ the conditions stated below. Sublicensing is not allowed; section 10
210
+ makes it unnecessary.
211
+
212
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
213
+
214
+ No covered work shall be deemed part of an effective technological
215
+ measure under any applicable law fulfilling obligations under article
216
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
217
+ similar laws prohibiting or restricting circumvention of such
218
+ measures.
219
+
220
+ When you convey a covered work, you waive any legal power to forbid
221
+ circumvention of technological measures to the extent such circumvention
222
+ is effected by exercising rights under this License with respect to
223
+ the covered work, and you disclaim any intention to limit operation or
224
+ modification of the work as a means of enforcing, against the work's
225
+ users, your or third parties' legal rights to forbid circumvention of
226
+ technological measures.
227
+
228
+ 4. Conveying Verbatim Copies.
229
+
230
+ You may convey verbatim copies of the Program's source code as you
231
+ receive it, in any medium, provided that you conspicuously and
232
+ appropriately publish on each copy an appropriate copyright notice;
233
+ keep intact all notices stating that this License and any
234
+ non-permissive terms added in accord with section 7 apply to the code;
235
+ keep intact all notices of the absence of any warranty; and give all
236
+ recipients a copy of this License along with the Program.
237
+
238
+ You may charge any price or no price for each copy that you convey,
239
+ and you may offer support or warranty protection for a fee.
240
+
241
+ 5. Conveying Modified Source Versions.
242
+
243
+ You may convey a work based on the Program, or the modifications to
244
+ produce it from the Program, in the form of source code under the
245
+ terms of section 4, provided that you also meet all of these conditions:
246
+
247
+ a) The work must carry prominent notices stating that you modified
248
+ it, and giving a relevant date.
249
+
250
+ b) The work must carry prominent notices stating that it is
251
+ released under this License and any conditions added under section
252
+ 7. This requirement modifies the requirement in section 4 to
253
+ "keep intact all notices".
254
+
255
+ c) You must license the entire work, as a whole, under this
256
+ License to anyone who comes into possession of a copy. This
257
+ License will therefore apply, along with any applicable section 7
258
+ additional terms, to the whole of the work, and all its parts,
259
+ regardless of how they are packaged. This License gives no
260
+ permission to license the work in any other way, but it does not
261
+ invalidate such permission if you have separately received it.
262
+
263
+ d) If the work has interactive user interfaces, each must display
264
+ Appropriate Legal Notices; however, if the Program has interactive
265
+ interfaces that do not display Appropriate Legal Notices, your
266
+ work need not make them do so.
267
+
268
+ A compilation of a covered work with other separate and independent
269
+ works, which are not by their nature extensions of the covered work,
270
+ and which are not combined with it such as to form a larger program,
271
+ in or on a volume of a storage or distribution medium, is called an
272
+ "aggregate" if the compilation and its resulting copyright are not
273
+ used to limit the access or legal rights of the compilation's users
274
+ beyond what the individual works permit. Inclusion of a covered work
275
+ in an aggregate does not cause this License to apply to the other
276
+ parts of the aggregate.
277
+
278
+ 6. Conveying Non-Source Forms.
279
+
280
+ You may convey a covered work in object code form under the terms
281
+ of sections 4 and 5, provided that you also convey the
282
+ machine-readable Corresponding Source under the terms of this License,
283
+ in one of these ways:
284
+
285
+ a) Convey the object code in, or embodied in, a physical product
286
+ (including a physical distribution medium), accompanied by the
287
+ Corresponding Source fixed on a durable physical medium
288
+ customarily used for software interchange.
289
+
290
+ b) Convey the object code in, or embodied in, a physical product
291
+ (including a physical distribution medium), accompanied by a
292
+ written offer, valid for at least three years and valid for as
293
+ long as you offer spare parts or customer support for that product
294
+ model, to give anyone who possesses the object code either (1) a
295
+ copy of the Corresponding Source for all the software in the
296
+ product that is covered by this License, on a durable physical
297
+ medium customarily used for software interchange, for a price no
298
+ more than your reasonable cost of physically performing this
299
+ conveying of source, or (2) access to copy the
300
+ Corresponding Source from a network server at no charge.
301
+
302
+ c) Convey individual copies of the object code with a copy of the
303
+ written offer to provide the Corresponding Source. This
304
+ alternative is allowed only occasionally and noncommercially, and
305
+ only if you received the object code with such an offer, in accord
306
+ with subsection 6b.
307
+
308
+ d) Convey the object code by offering access from a designated
309
+ place (gratis or for a charge), and offer equivalent access to the
310
+ Corresponding Source in the same way through the same place at no
311
+ further charge. You need not require recipients to copy the
312
+ Corresponding Source along with the object code. If the place to
313
+ copy the object code is a network server, the Corresponding Source
314
+ may be on a different server (operated by you or a third party)
315
+ that supports equivalent copying facilities, provided you maintain
316
+ clear directions next to the object code saying where to find the
317
+ Corresponding Source. Regardless of what server hosts the
318
+ Corresponding Source, you remain obligated to ensure that it is
319
+ available for as long as needed to satisfy these requirements.
320
+
321
+ e) Convey the object code using peer-to-peer transmission, provided
322
+ you inform other peers where the object code and Corresponding
323
+ Source of the work are being offered to the general public at no
324
+ charge under subsection 6d.
325
+
326
+ A separable portion of the object code, whose source code is excluded
327
+ from the Corresponding Source as a System Library, need not be
328
+ included in conveying the object code work.
329
+
330
+ A "User Product" is either (1) a "consumer product", which means any
331
+ tangible personal property which is normally used for personal, family,
332
+ or household purposes, or (2) anything designed or sold for incorporation
333
+ into a dwelling. In determining whether a product is a consumer product,
334
+ doubtful cases shall be resolved in favor of coverage. For a particular
335
+ product received by a particular user, "normally used" refers to a
336
+ typical or common use of that class of product, regardless of the status
337
+ of the particular user or of the way in which the particular user
338
+ actually uses, or expects or is expected to use, the product. A product
339
+ is a consumer product regardless of whether the product has substantial
340
+ commercial, industrial or non-consumer uses, unless such uses represent
341
+ the only significant mode of use of the product.
342
+
343
+ "Installation Information" for a User Product means any methods,
344
+ procedures, authorization keys, or other information required to install
345
+ and execute modified versions of a covered work in that User Product from
346
+ a modified version of its Corresponding Source. The information must
347
+ suffice to ensure that the continued functioning of the modified object
348
+ code is in no case prevented or interfered with solely because
349
+ modification has been made.
350
+
351
+ If you convey an object code work under this section in, or with, or
352
+ specifically for use in, a User Product, and the conveying occurs as
353
+ part of a transaction in which the right of possession and use of the
354
+ User Product is transferred to the recipient in perpetuity or for a
355
+ fixed term (regardless of how the transaction is characterized), the
356
+ Corresponding Source conveyed under this section must be accompanied
357
+ by the Installation Information. But this requirement does not apply
358
+ if neither you nor any third party retains the ability to install
359
+ modified object code on the User Product (for example, the work has
360
+ been installed in ROM).
361
+
362
+ The requirement to provide Installation Information does not include a
363
+ requirement to continue to provide support service, warranty, or updates
364
+ for a work that has been modified or installed by the recipient, or for
365
+ the User Product in which it has been modified or installed. Access to a
366
+ network may be denied when the modification itself materially and
367
+ adversely affects the operation of the network or violates the rules and
368
+ protocols for communication across the network.
369
+
370
+ Corresponding Source conveyed, and Installation Information provided,
371
+ in accord with this section must be in a format that is publicly
372
+ documented (and with an implementation available to the public in
373
+ source code form), and must require no special password or key for
374
+ unpacking, reading or copying.
375
+
376
+ 7. Additional Terms.
377
+
378
+ "Additional permissions" are terms that supplement the terms of this
379
+ License by making exceptions from one or more of its conditions.
380
+ Additional permissions that are applicable to the entire Program shall
381
+ be treated as though they were included in this License, to the extent
382
+ that they are valid under applicable law. If additional permissions
383
+ apply only to part of the Program, that part may be used separately
384
+ under those permissions, but the entire Program remains governed by
385
+ this License without regard to the additional permissions.
386
+
387
+ When you convey a copy of a covered work, you may at your option
388
+ remove any additional permissions from that copy, or from any part of
389
+ it. (Additional permissions may be written to require their own
390
+ removal in certain cases when you modify the work.) You may place
391
+ additional permissions on material, added by you to a covered work,
392
+ for which you have or can give appropriate copyright permission.
393
+
394
+ Notwithstanding any other provision of this License, for material you
395
+ add to a covered work, you may (if authorized by the copyright holders of
396
+ that material) supplement the terms of this License with terms:
397
+
398
+ a) Disclaiming warranty or limiting liability differently from the
399
+ terms of sections 15 and 16 of this License; or
400
+
401
+ b) Requiring preservation of specified reasonable legal notices or
402
+ author attributions in that material or in the Appropriate Legal
403
+ Notices displayed by works containing it; or
404
+
405
+ c) Prohibiting misrepresentation of the origin of that material, or
406
+ requiring that modified versions of such material be marked in
407
+ reasonable ways as different from the original version; or
408
+
409
+ d) Limiting the use for publicity purposes of names of licensors or
410
+ authors of the material; or
411
+
412
+ e) Declining to grant rights under trademark law for use of some
413
+ trade names, trademarks, or service marks; or
414
+
415
+ f) Requiring indemnification of licensors and authors of that
416
+ material by anyone who conveys the material (or modified versions of
417
+ it) with contractual assumptions of liability to the recipient, for
418
+ any liability that these contractual assumptions directly impose on
419
+ those licensors and authors.
420
+
421
+ All other non-permissive additional terms are considered "further
422
+ restrictions" within the meaning of section 10. If the Program as you
423
+ received it, or any part of it, contains a notice stating that it is
424
+ governed by this License along with a term that is a further
425
+ restriction, you may remove that term. If a license document contains
426
+ a further restriction but permits relicensing or conveying under this
427
+ License, you may add to a covered work material governed by the terms
428
+ of that license document, provided that the further restriction does
429
+ not survive such relicensing or conveying.
430
+
431
+ If you add terms to a covered work in accord with this section, you
432
+ must place, in the relevant source files, a statement of the
433
+ additional terms that apply to those files, or a notice indicating
434
+ where to find the applicable terms.
435
+
436
+ Additional terms, permissive or non-permissive, may be stated in the
437
+ form of a separately written license, or stated as exceptions;
438
+ the above requirements apply either way.
439
+
440
+ 8. Termination.
441
+
442
+ You may not propagate or modify a covered work except as expressly
443
+ provided under this License. Any attempt otherwise to propagate or
444
+ modify it is void, and will automatically terminate your rights under
445
+ this License (including any patent licenses granted under the third
446
+ paragraph of section 11).
447
+
448
+ However, if you cease all violation of this License, then your
449
+ license from a particular copyright holder is reinstated (a)
450
+ provisionally, unless and until the copyright holder explicitly and
451
+ finally terminates your license, and (b) permanently, if the copyright
452
+ holder fails to notify you of the violation by some reasonable means
453
+ prior to 60 days after the cessation.
454
+
455
+ Moreover, your license from a particular copyright holder is
456
+ reinstated permanently if the copyright holder notifies you of the
457
+ violation by some reasonable means, this is the first time you have
458
+ received notice of violation of this License (for any work) from that
459
+ copyright holder, and you cure the violation prior to 30 days after
460
+ your receipt of the notice.
461
+
462
+ Termination of your rights under this section does not terminate the
463
+ licenses of parties who have received copies or rights from you under
464
+ this License. If your rights have been terminated and not permanently
465
+ reinstated, you do not qualify to receive new licenses for the same
466
+ material under section 10.
467
+
468
+ 9. Acceptance Not Required for Having Copies.
469
+
470
+ You are not required to accept this License in order to receive or
471
+ run a copy of the Program. Ancillary propagation of a covered work
472
+ occurring solely as a consequence of using peer-to-peer transmission
473
+ to receive a copy likewise does not require acceptance. However,
474
+ nothing other than this License grants you permission to propagate or
475
+ modify any covered work. These actions infringe copyright if you do
476
+ not accept this License. Therefore, by modifying or propagating a
477
+ covered work, you indicate your acceptance of this License to do so.
478
+
479
+ 10. Automatic Licensing of Downstream Recipients.
480
+
481
+ Each time you convey a covered work, the recipient automatically
482
+ receives a license from the original licensors, to run, modify and
483
+ propagate that work, subject to this License. You are not responsible
484
+ for enforcing compliance by third parties with this License.
485
+
486
+ An "entity transaction" is a transaction transferring control of an
487
+ organization, or substantially all assets of one, or subdividing an
488
+ organization, or merging organizations. If propagation of a covered
489
+ work results from an entity transaction, each party to that
490
+ transaction who receives a copy of the work also receives whatever
491
+ licenses to the work the party's predecessor in interest had or could
492
+ give under the previous paragraph, plus a right to possession of the
493
+ Corresponding Source of the work from the predecessor in interest, if
494
+ the predecessor has it or can get it with reasonable efforts.
495
+
496
+ You may not impose any further restrictions on the exercise of the
497
+ rights granted or affirmed under this License. For example, you may
498
+ not impose a license fee, royalty, or other charge for exercise of
499
+ rights granted under this License, and you may not initiate litigation
500
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
501
+ any patent claim is infringed by making, using, selling, offering for
502
+ sale, or importing the Program or any portion of it.
503
+
504
+ 11. Patents.
505
+
506
+ A "contributor" is a copyright holder who authorizes use under this
507
+ License of the Program or a work on which the Program is based. The
508
+ work thus licensed is called the contributor's "contributor version".
509
+
510
+ A contributor's "essential patent claims" are all patent claims
511
+ owned or controlled by the contributor, whether already acquired or
512
+ hereafter acquired, that would be infringed by some manner, permitted
513
+ by this License, of making, using, or selling its contributor version,
514
+ but do not include claims that would be infringed only as a
515
+ consequence of further modification of the contributor version. For
516
+ purposes of this definition, "control" includes the right to grant
517
+ patent sublicenses in a manner consistent with the requirements of
518
+ this License.
519
+
520
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
521
+ patent license under the contributor's essential patent claims, to
522
+ make, use, sell, offer for sale, import and otherwise run, modify and
523
+ propagate the contents of its contributor version.
524
+
525
+ In the following three paragraphs, a "patent license" is any express
526
+ agreement or commitment, however denominated, not to enforce a patent
527
+ (such as an express permission to practice a patent or covenant not to
528
+ sue for patent infringement). To "grant" such a patent license to a
529
+ party means to make such an agreement or commitment not to enforce a
530
+ patent against the party.
531
+
532
+ If you convey a covered work, knowingly relying on a patent license,
533
+ and the Corresponding Source of the work is not available for anyone
534
+ to copy, free of charge and under the terms of this License, through a
535
+ publicly available network server or other readily accessible means,
536
+ then you must either (1) cause the Corresponding Source to be so
537
+ available, or (2) arrange to deprive yourself of the benefit of the
538
+ patent license for this particular work, or (3) arrange, in a manner
539
+ consistent with the requirements of this License, to extend the patent
540
+ license to downstream recipients. "Knowingly relying" means you have
541
+ actual knowledge that, but for the patent license, your conveying the
542
+ covered work in a country, or your recipient's use of the covered work
543
+ in a country, would infringe one or more identifiable patents in that
544
+ country that you have reason to believe are valid.
545
+
546
+ If, pursuant to or in connection with a single transaction or
547
+ arrangement, you convey, or propagate by procuring conveyance of, a
548
+ covered work, and grant a patent license to some of the parties
549
+ receiving the covered work authorizing them to use, propagate, modify
550
+ or convey a specific copy of the covered work, then the patent license
551
+ you grant is automatically extended to all recipients of the covered
552
+ work and works based on it.
553
+
554
+ A patent license is "discriminatory" if it does not include within
555
+ the scope of its coverage, prohibits the exercise of, or is
556
+ conditioned on the non-exercise of one or more of the rights that are
557
+ specifically granted under this License. You may not convey a covered
558
+ work if you are a party to an arrangement with a third party that is
559
+ in the business of distributing software, under which you make payment
560
+ to the third party based on the extent of your activity of conveying
561
+ the work, and under which the third party grants, to any of the
562
+ parties who would receive the covered work from you, a discriminatory
563
+ patent license (a) in connection with copies of the covered work
564
+ conveyed by you (or copies made from those copies), or (b) primarily
565
+ for and in connection with specific products or compilations that
566
+ contain the covered work, unless you entered into that arrangement,
567
+ or that patent license was granted, prior to 28 March 2007.
568
+
569
+ Nothing in this License shall be construed as excluding or limiting
570
+ any implied license or other defenses to infringement that may
571
+ otherwise be available to you under applicable patent law.
572
+
573
+ 12. No Surrender of Others' Freedom.
574
+
575
+ If conditions are imposed on you (whether by court order, agreement or
576
+ otherwise) that contradict the conditions of this License, they do not
577
+ excuse you from the conditions of this License. If you cannot convey a
578
+ covered work so as to satisfy simultaneously your obligations under this
579
+ License and any other pertinent obligations, then as a consequence you may
580
+ not convey it at all. For example, if you agree to terms that obligate you
581
+ to collect a royalty for further conveying from those to whom you convey
582
+ the Program, the only way you could satisfy both those terms and this
583
+ License would be to refrain entirely from conveying the Program.
584
+
585
+ 13. Use with the GNU Affero General Public License.
586
+
587
+ Notwithstanding any other provision of this License, you have
588
+ permission to link or combine any covered work with a work licensed
589
+ under version 3 of the GNU Affero General Public License into a single
590
+ combined work, and to convey the resulting work. The terms of this
591
+ License will continue to apply to the part which is the covered work,
592
+ but the special requirements of the GNU Affero General Public License,
593
+ section 13, concerning interaction through a network will apply to the
594
+ combination as such.
595
+
596
+ 14. Revised Versions of this License.
597
+
598
+ The Free Software Foundation may publish revised and/or new versions of
599
+ the GNU General Public License from time to time. Such new versions will
600
+ be similar in spirit to the present version, but may differ in detail to
601
+ address new problems or concerns.
602
+
603
+ Each version is given a distinguishing version number. If the
604
+ Program specifies that a certain numbered version of the GNU General
605
+ Public License "or any later version" applies to it, you have the
606
+ option of following the terms and conditions either of that numbered
607
+ version or of any later version published by the Free Software
608
+ Foundation. If the Program does not specify a version number of the
609
+ GNU General Public License, you may choose any version ever published
610
+ by the Free Software Foundation.
611
+
612
+ If the Program specifies that a proxy can decide which future
613
+ versions of the GNU General Public License can be used, that proxy's
614
+ public statement of acceptance of a version permanently authorizes you
615
+ to choose that version for the Program.
616
+
617
+ Later license versions may give you additional or different
618
+ permissions. However, no additional obligations are imposed on any
619
+ author or copyright holder as a result of your choosing to follow a
620
+ later version.
621
+
622
+ 15. Disclaimer of Warranty.
623
+
624
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
625
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
626
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
627
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
628
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
629
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
630
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
631
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
632
+
633
+ 16. Limitation of Liability.
634
+
635
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
636
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
637
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
638
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
639
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
640
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
641
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
642
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
643
+ SUCH DAMAGES.
644
+
645
+ 17. Interpretation of Sections 15 and 16.
646
+
647
+ If the disclaimer of warranty and limitation of liability provided
648
+ above cannot be given local legal effect according to their terms,
649
+ reviewing courts shall apply local law that most closely approximates
650
+ an absolute waiver of all civil liability in connection with the
651
+ Program, unless a warranty or assumption of liability accompanies a
652
+ copy of the Program in return for a fee.
653
+
midi_emotion/readme.md ADDED
@@ -0,0 +1,66 @@
1
+ Generates multi-instrument symbolic music (MIDI) based on user-provided emotions from the valence-arousal plane. In simpler terms, it can generate happy (positive valence, positive arousal), calm (positive valence, negative arousal), angry (negative valence, positive arousal), or sad (negative valence, negative arousal) music.
2
+
3
+ Source code for our paper "Symbolic music generation conditioned on continuous-valued emotions",
4
+ Serkan Sulun, Matthew E. P. Davies, Paula Viana, 2022.
5
+ https://ieeexplore.ieee.org/document/9762257
6
+
7
+ To cite:
8
+ ```S. Sulun, M. E. P. Davies and P. Viana, "Symbolic music generation conditioned on continuous-valued emotions," in IEEE Access, doi: 10.1109/ACCESS.2022.3169744.```
9
+
10
+ Required Python libraries: NumPy, PyTorch, pandas, pretty_midi, Pypianoroll, tqdm, Spotipy, PyTables. Alternatively, run: ```pip install -r requirements.txt```
11
+
12
+ To create the Lakh-Spotify dataset:
13
+
14
+ - Go to the ```src/create_dataset``` folder
15
+
16
+ - Download the datasets:
17
+
18
+ [Lakh pianoroll 5 full dataset](https://ucsdcloud-my.sharepoint.com/personal/h3dong_ucsd_edu/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Fh3dong%5Fucsd%5Fedu%2FDocuments%2Fdata%2Flpd%2Flpd%5F5%2Flpd%5F5%5Ffull%2Etar%2Egz&parent=%2Fpersonal%2Fh3dong%5Fucsd%5Fedu%2FDocuments%2Fdata%2Flpd%2Flpd%5F5&ga=1)
19
+
20
+ MSD summary file
21
+ http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/msd_summary_file.h5
22
+
23
+ Echonest mapping dataset
24
+ ```ftp://ftp.acousticbrainz.org/pub/acousticbrainz/acousticbrainz-labs/download/msdrosetta/millionsongdataset_echonest.tar.bz2```
25
+ Alternatively: https://drive.google.com/file/d/17Exfxjtq7bI9EKtEZlOrBCkx8RBx7h77/view?usp=sharing
26
+
27
+
28
+ Lakh-MSD matching scores file
29
+ http://hog.ee.columbia.edu/craffel/lmd/match_scores.json
30
+
31
+ - Extract the archives where needed, and place everything inside the folder ```./data_files```
32
+
33
+ - Get Spotify client ID and client secret:
34
+ https://developer.spotify.com/dashboard/applications
35
+ Then, fill in the variables "client_id" and "client_secret" in ```src/create_dataset/utils.py``` (a Spotipy usage sketch follows this list)
36
+
37
+ - Run ```run.py```.
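The ```utils.py``` module is not part of this commit, so purely as orientation: the sketch below shows how such client credentials are typically consumed with Spotipy's client-credentials flow. The placeholder values and the lookup call are assumptions for illustration, not the repository's code.

```python
# Illustrative sketch only (not the repository's utils.py): how Spotify client
# credentials are typically used with Spotipy to fetch audio features.
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

client_id = "YOUR_CLIENT_ID"          # placeholder
client_secret = "YOUR_CLIENT_SECRET"  # placeholder

sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(client_id=client_id,
                                          client_secret=client_secret)
)

# run.py later asks for per-track audio features (valence, energy, ...);
# a lookup along these lines is what a helper like utils.get_spotify_features would wrap.
features = sp.audio_features(["PLACEHOLDER_SPOTIFY_TRACK_ID"])[0]
print(features["valence"], features["energy"])
```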
38
+
39
+ To preprocess and create the training dataset:
40
+
41
+ - Go to the ```src/data``` folder and run ```preprocess_pianorolls.py```
42
+
43
+
44
+ To generate MIDI using pretrained models:
45
+
46
+ - Download model(s) from the following link:
47
+ https://drive.google.com/drive/folders/1R5-HaXmNzXBAhGq1idrDF-YEKkZm5C8C?usp=sharing
48
+
49
+ - Extract into the folder ```output```
50
+
51
+ - Go to the ```src``` folder and run ```generate.py``` with appropriate arguments, e.g.:
52
+ ```python generate.py --model_dir continuous_concat --conditioning continuous_concat --valence -0.8 -0.8 0.8 0.8 --arousal -0.8 -0.8 0.8 0.8```
53
+
54
+
55
+ To train:
56
+
57
+ - Go to the ```src``` folder and run ```train.py``` with appropriate arguments, e.g.:
58
+ ```python train.py --conditioning continuous_concat```
59
+
60
+ There are 4 different conditioning modes:
61
+ ```none```: No conditioning, vanilla model.
62
+ ```discrete_token```: Conditioning using discrete tokens, i.e. control tokens.
63
+ ```continuous_token```: Conditioning using continuous values embedded as vectors, then prepended to the other embedded tokens in sequence dimension.
64
+ ```continuous_concat```: Conditioning using continuous values embedded as vectors, then concatenated to all other embedded tokens in channel dimension.
65
+
66
+ See ```config.py``` for all options.
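As a rough illustration of the valence-arousal quadrants described at the top of this readme, the snippet below composes the four corresponding ```generate.py``` commands. It is not part of the repository; the ±0.8 magnitudes and the ```continuous_concat``` model directory are simply taken from the example command above.

```python
# Illustration only: build generate.py commands for the four emotion quadrants.
# Assumes the pretrained "continuous_concat" model was extracted into ./output
# and that the commands are run from the src folder, as described above.
quadrants = {
    "happy": (0.8, 0.8),    # +valence, +arousal
    "calm":  (0.8, -0.8),   # +valence, -arousal
    "angry": (-0.8, 0.8),   # -valence, +arousal
    "sad":   (-0.8, -0.8),  # -valence, -arousal
}

for name, (valence, arousal) in quadrants.items():
    cmd = (f"python generate.py --model_dir continuous_concat "
           f"--conditioning continuous_concat "
           f"--valence {valence} --arousal {arousal}")
    print(f"{name}: {cmd}")
```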
midi_emotion/requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ numpy==1.21.0
2
+ pandas==1.2.5
3
+ pretty-midi==0.2.9
4
+ pypianoroll==1.0.4
5
+ spotipy==2.19.0
6
+ tables==3.6.1
7
+ torch==2.1.0
8
+ tqdm==4.61.1
midi_emotion/setup.py ADDED
@@ -0,0 +1,13 @@
1
+ from setuptools import setup, find_packages
2
+
3
+ setup(
4
+ name="midi_emotion",
5
+ version="0.1.0",
6
+ packages=find_packages(),
7
+ install_requires=[
8
+ "torch",
9
+ "numpy",
10
+ "pretty_midi",
11
+ "tqdm"
12
+ ]
13
+ )
midi_emotion/src/config.py ADDED
@@ -0,0 +1,156 @@
1
+ import os
2
+ import time
3
+ import argparse
4
+
5
+ parser = argparse.ArgumentParser(description='Generates emotion-based symbolic music')
6
+
7
+ parser.add_argument("--conditioning", type=str, required=False, default="continuous_concat",
8
+ choices=["none", "discrete_token", "continuous_token",
9
+ "continuous_concat"], help='Conditioning type')
10
+ parser.add_argument("--data_folder", type=str, default="../data_files/lpd_5/lpd_5_full_transposable")
11
+ parser.add_argument('--full_dataset', action="store_true",
12
+ help='Use LPD-full dataset')
13
+ parser.add_argument('--n_layer', type=int, default=20,
14
+ help='number of total layers')
15
+ parser.add_argument('--n_head', type=int, default=16,
16
+ help='number of heads')
17
+ parser.add_argument('--d_model', type=int, default=768,
18
+ help='model dimension')
19
+ parser.add_argument('--d_condition', type=int, default=192,
20
+ help='condition dimension (if continuous_concat is used)')
21
+ parser.add_argument('--d_inner', type=int, default=768*4,
22
+ help='inner dimension in FF')
23
+ parser.add_argument('--tgt_len', type=int, default=1216,
24
+ help='number of tokens to predict')
25
+ parser.add_argument('--max_gen_input_len', type=int, default=-1,
26
+ help='maximum number of input tokens used during generation (-1 for no limit)')
27
+ parser.add_argument('--gen_len', type=int, default=2048,
28
+ help='Generation length')
29
+ parser.add_argument('--temp_note', type=float, default=1.2,
30
+ help='Temperature for generating notes')
31
+ parser.add_argument('--temp_rest', type=float, default=1.2,
32
+ help='Temperature for generating rests')
33
+ parser.add_argument('--n_bars', type=int, default=-1,
34
+ help='number of bars to use')
35
+ parser.add_argument('--no_pad', action='store_true',
36
+ help='dont pad sequences')
37
+ parser.add_argument('--eval_tgt_len', type=int, default=-1,
38
+ help='number of tokens to predict for evaluation')
39
+ parser.add_argument('--dropout', type=float, default=0.1,
40
+ help='global dropout rate')
41
+ parser.add_argument("--overwrite_dropout", action="store_true",
42
+ help="resets dropouts")
43
+ parser.add_argument('--lr', type=float, default=2e-5,
44
+ help='initial learning rate (0.00025|5 for adam|sgd)')
45
+ parser.add_argument("--overwrite_lr", action="store_true",
46
+ help="Overwrites learning rate if pretrained model is loaded.")
47
+ parser.add_argument('--arousal_feature', default='note_density', type=str,
48
+ choices=['tempo', 'note_density'],
49
+ help='Feature to use as arousal feature')
50
+ parser.add_argument('--scheduler', default='constant', type=str,
51
+ choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant', "cyclic"],
52
+ help='lr scheduler to use.')
53
+ parser.add_argument('--lr_min', type=float, default=5e-6,
54
+ help='minimum learning rate for cyclic scheduler')
55
+ parser.add_argument('--lr_max', type=float, default=5e-3,
56
+ help='maximum learning rate for cyclic scheduler')
57
+ parser.add_argument('--warmup_step', type=int, default=0,
58
+ help='number of learning rate warmup steps')
59
+ parser.add_argument('--decay_rate', type=float, default=0.5,
60
+ help='decay factor when ReduceLROnPlateau is used')
61
+ parser.add_argument('--clip', type=float, default=1.0,
62
+ help='gradient clipping')
63
+ parser.add_argument('--batch_size', type=int, default=4,
64
+ help='batch size')
65
+ parser.add_argument('--accumulate_step', type=int, default=1,
66
+ help='accumulate gradients (multiplies effective batch size)')
67
+ parser.add_argument('--seed', type=int, default=-1,
68
+ help='random seed')
69
+ parser.add_argument('--no_cuda', action='store_true',
70
+ help='use CPU')
71
+ parser.add_argument('--log_step', type=int, default=1000,
72
+ help='report interval')
73
+ parser.add_argument('--eval_step', type=int, default=8000,
74
+ help='evaluation interval')
75
+ parser.add_argument('--max_eval_step', type=int, default=1000,
76
+ help='maximum evaluation steps')
77
+ parser.add_argument('--gen_step', type=int, default=8000,
78
+ help='generation interval')
79
+ parser.add_argument('--work_dir', default='../output', type=str,
80
+ help='experiment directory.')
81
+ parser.add_argument('--restart_dir', type=str, default=None,
82
+ help='restart dir')
83
+ parser.add_argument('--debug', action='store_true',
84
+ help='run in debug mode (do not create exp dir)')
85
+ parser.add_argument('--max_step', type=int, default=1000000000,
86
+ help='maximum training steps')
87
+ parser.add_argument('--overfit', action='store_true',
88
+ help='Works on a single sample')
89
+ parser.add_argument('--find_lr', action='store_true',
90
+ help='Run learning rate finder')
91
+ parser.add_argument('--num_workers', default=8, type=int,
92
+ help='Number of cores for data loading')
93
+ parser.add_argument('--bar_start_prob', type=float, default=0.5,
94
+ help=('probability of training sample'
95
+ ' starting at a bar location'))
96
+ parser.add_argument("--n_samples", type=int, default=-1,
97
+ help="Limits number of training samples (for faster debugging)")
98
+ parser.add_argument('--n_emotion_bins', type=int, default=5,
99
+ help='Number of emotion bins in each dimension')
100
+ parser.add_argument('--max_transpose', type=int, default=3,
101
+ help='Maximum transpose amount')
102
+ parser.add_argument('--no_amp', action="store_true",
103
+ help='Disable automatic mixed precision')
104
+ parser.add_argument('--reset_scaler', action="store_true",
105
+ help="Reset scaler (can help avoiding nans)")
106
+ parser.add_argument('--exhaustive_eval', action="store_true",
107
+ help="Use data exhaustively (for final evaluation)")
108
+ parser.add_argument('--regression', action="store_true",
109
+ help="Train a regression model")
110
+ parser.add_argument("--always_use_discrete_condition", action="store_true",
111
+ help="Discrete tokens are used for every sequence")
112
+ parser.add_argument("--regression_dir", type=str, default=None,
113
+ help="The path of folder with generations, to perform regression on")
114
+
115
+ args = parser.parse_args()
116
+
117
+ if args.regression_dir is not None:
118
+ args.regression = True
119
+
120
+ if args.conditioning != "continuous_concat":
121
+ args.d_condition = -1
122
+
123
+ assert not (args.exhaustive_eval and args.max_eval_step > 0)
124
+
125
+ if args.full_dataset:
126
+ assert args.conditioning in ["discrete_token", "none"] and not args.regression, "LPD-full has NaN features"
127
+
128
+ if args.regression:
129
+ args.n_layer = 8
130
+ print("Using 8 layers for regression")
131
+
132
+ args.batch_chunk = -1
133
+
134
+ if args.debug or args.overfit:
135
+ args.num_workers = 0
136
+
137
+ if args.find_lr:
138
+ args.debug = True
139
+
140
+ args.d_embed = args.d_model
141
+
142
+ if args.eval_tgt_len < 0:
143
+ args.eval_tgt_len = args.tgt_len
144
+
145
+ if args.scheduler == "cyclic":
146
+ args.lr = args.lr_min
147
+
148
+ if args.restart_dir:
149
+ args.restart_dir = os.path.join(args.work_dir, args.restart_dir)
150
+
151
+ if args.debug:
152
+ args.work_dir = os.path.join(args.work_dir, "DEBUG_" + time.strftime('%Y%m%d-%H%M%S'))
153
+ elif args.no_cuda:
154
+ args.work_dir = os.path.join(args.work_dir, "CPU_" + time.strftime('%Y%m%d-%H%M%S'))
155
+ else:
156
+ args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S'))
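The ```--d_condition``` option above only matters for the ```continuous_concat``` mode described in the readme, where an embedded emotion vector is concatenated to every token embedding along the channel dimension. The following is a shape-level sketch of that idea using the defaults ```d_model=768``` and ```d_condition=192```; it is an illustration, not the repository's model code.

```python
# Shape-level illustration of continuous_concat conditioning (not the repo's model).
# Uses the config.py defaults d_model=768 and d_condition=192.
import torch
import torch.nn as nn

d_model, d_condition, seq_len, batch = 768, 192, 16, 2

token_emb = torch.randn(seq_len, batch, d_model)   # embedded music tokens
valence_arousal = torch.randn(batch, 2)            # continuous emotion values
to_condition = nn.Linear(2, d_condition)           # embed emotions as a vector

cond = to_condition(valence_arousal)               # (batch, d_condition)
cond = cond.unsqueeze(0).expand(seq_len, -1, -1)   # broadcast over the sequence
conditioned = torch.cat([token_emb, cond], dim=-1) # concatenate in channel dim
print(conditioned.shape)                           # torch.Size([16, 2, 960])
```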
midi_emotion/src/create_dataset/hdf5_getters.py ADDED
@@ -0,0 +1,476 @@
1
+ """
2
+ Thierry Bertin-Mahieux (2010) Columbia University
3
4
+
5
+
6
+ This code contains a set of getters functions to access the fields
7
+ from an HDF5 song file (regular file with one song or
8
+ aggregate / summary file with many songs)
9
+
10
+ This is part of the Million Song Dataset project from
11
+ LabROSA (Columbia University) and The Echo Nest.
12
+
13
+
14
+ Copyright 2010, Thierry Bertin-Mahieux
15
+
16
+ This program is free software: you can redistribute it and/or modify
17
+ it under the terms of the GNU General Public License as published by
18
+ the Free Software Foundation, either version 3 of the License, or
19
+ (at your option) any later version.
20
+
21
+ This program is distributed in the hope that it will be useful,
22
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
23
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24
+ GNU General Public License for more details.
25
+
26
+ You should have received a copy of the GNU General Public License
27
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
28
+ """
29
+
30
+
31
+ import tables
32
+
33
+
34
+ def open_h5_file_read(h5filename):
35
+ """
36
+ Open an existing H5 in read mode.
37
+ Same function as in hdf5_utils, here so we avoid one import
38
+ """
39
+ return tables.open_file(h5filename, mode='r')
40
+
41
+
42
+ def get_num_songs(h5):
43
+ """
44
+ Return the number of songs contained in this h5 file, i.e. the number of rows
45
+ for all basic information like name, artist, ...
46
+ """
47
+ return h5.root.metadata.songs.nrows
48
+
49
+ def get_artist_familiarity(h5,songidx=0):
50
+ """
51
+ Get artist familiarity from a HDF5 song file, by default the first song in it
52
+ """
53
+ return h5.root.metadata.songs.cols.artist_familiarity[songidx]
54
+
55
+ def get_artist_hotttnesss(h5,songidx=0):
56
+ """
57
+ Get artist hotttnesss from a HDF5 song file, by default the first song in it
58
+ """
59
+ return h5.root.metadata.songs.cols.artist_hotttnesss[songidx]
60
+
61
+ def get_artist_id(h5,songidx=0):
62
+ """
63
+ Get artist id from a HDF5 song file, by default the first song in it
64
+ """
65
+ return h5.root.metadata.songs.cols.artist_id[songidx]
66
+
67
+ def get_artist_mbid(h5,songidx=0):
68
+ """
69
+ Get artist musicbrainz id from a HDF5 song file, by default the first song in it
70
+ """
71
+ return h5.root.metadata.songs.cols.artist_mbid[songidx]
72
+
73
+ def get_artist_playmeid(h5,songidx=0):
74
+ """
75
+ Get artist playme id from a HDF5 song file, by default the first song in it
76
+ """
77
+ return h5.root.metadata.songs.cols.artist_playmeid[songidx]
78
+
79
+ def get_artist_7digitalid(h5,songidx=0):
80
+ """
81
+ Get artist 7digital id from a HDF5 song file, by default the first song in it
82
+ """
83
+ return h5.root.metadata.songs.cols.artist_7digitalid[songidx]
84
+
85
+ def get_artist_latitude(h5,songidx=0):
86
+ """
87
+ Get artist latitude from a HDF5 song file, by default the first song in it
88
+ """
89
+ return h5.root.metadata.songs.cols.artist_latitude[songidx]
90
+
91
+ def get_artist_longitude(h5,songidx=0):
92
+ """
93
+ Get artist longitude from a HDF5 song file, by default the first song in it
94
+ """
95
+ return h5.root.metadata.songs.cols.artist_longitude[songidx]
96
+
97
+ def get_artist_location(h5,songidx=0):
98
+ """
99
+ Get artist location from a HDF5 song file, by default the first song in it
100
+ """
101
+ return h5.root.metadata.songs.cols.artist_location[songidx]
102
+
103
+ def get_artist_name(h5,songidx=0):
104
+ """
105
+ Get artist name from a HDF5 song file, by default the first song in it
106
+ """
107
+ return h5.root.metadata.songs.cols.artist_name[songidx]
108
+
109
+ def get_release(h5,songidx=0):
110
+ """
111
+ Get release from a HDF5 song file, by default the first song in it
112
+ """
113
+ return h5.root.metadata.songs.cols.release[songidx]
114
+
115
+ def get_release_7digitalid(h5,songidx=0):
116
+ """
117
+ Get release 7digital id from a HDF5 song file, by default the first song in it
118
+ """
119
+ return h5.root.metadata.songs.cols.release_7digitalid[songidx]
120
+
121
+ def get_song_id(h5,songidx=0):
122
+ """
123
+ Get song id from a HDF5 song file, by default the first song in it
124
+ """
125
+ return h5.root.metadata.songs.cols.song_id[songidx]
126
+
127
+ def get_song_hotttnesss(h5,songidx=0):
128
+ """
129
+ Get song hotttnesss from a HDF5 song file, by default the first song in it
130
+ """
131
+ return h5.root.metadata.songs.cols.song_hotttnesss[songidx]
132
+
133
+ def get_title(h5,songidx=0):
134
+ """
135
+ Get title from a HDF5 song file, by default the first song in it
136
+ """
137
+ return h5.root.metadata.songs.cols.title[songidx]
138
+
139
+ def get_track_7digitalid(h5,songidx=0):
140
+ """
141
+ Get track 7digital id from a HDF5 song file, by default the first song in it
142
+ """
143
+ return h5.root.metadata.songs.cols.track_7digitalid[songidx]
144
+
145
+ def get_similar_artists(h5,songidx=0):
146
+ """
147
+ Get similar artists array. Takes care of the proper indexing if we are in aggregate
148
+ file. By default, return the array for the first song in the h5 file.
149
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
150
+ """
151
+ if h5.root.metadata.songs.nrows == songidx + 1:
152
+ return h5.root.metadata.similar_artists[h5.root.metadata.songs.cols.idx_similar_artists[songidx]:]
153
+ return h5.root.metadata.similar_artists[h5.root.metadata.songs.cols.idx_similar_artists[songidx]:
154
+ h5.root.metadata.songs.cols.idx_similar_artists[songidx+1]]
155
+
156
+ def get_artist_terms(h5,songidx=0):
157
+ """
158
+ Get artist terms array. Takes care of the proper indexing if we are in aggregate
159
+ file. By default, return the array for the first song in the h5 file.
160
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
161
+ """
162
+ if h5.root.metadata.songs.nrows == songidx + 1:
163
+ return h5.root.metadata.artist_terms[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:]
164
+ return h5.root.metadata.artist_terms[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:
165
+ h5.root.metadata.songs.cols.idx_artist_terms[songidx+1]]
166
+
167
+ def get_artist_terms_freq(h5,songidx=0):
168
+ """
169
+ Get artist terms array frequencies. Takes care of the proper indexing if we are in aggregate
170
+ file. By default, return the array for the first song in the h5 file.
171
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
172
+ """
173
+ if h5.root.metadata.songs.nrows == songidx + 1:
174
+ return h5.root.metadata.artist_terms_freq[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:]
175
+ return h5.root.metadata.artist_terms_freq[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:
176
+ h5.root.metadata.songs.cols.idx_artist_terms[songidx+1]]
177
+
178
+ def get_artist_terms_weight(h5,songidx=0):
179
+ """
180
+ Get artist terms array weights. Takes care of the proper indexing if we are in aggregate
181
+ file. By default, return the array for the first song in the h5 file.
182
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
183
+ """
184
+ if h5.root.metadata.songs.nrows == songidx + 1:
185
+ return h5.root.metadata.artist_terms_weight[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:]
186
+ return h5.root.metadata.artist_terms_weight[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:
187
+ h5.root.metadata.songs.cols.idx_artist_terms[songidx+1]]
188
+
189
+ def get_analysis_sample_rate(h5,songidx=0):
190
+ """
191
+ Get analysis sample rate from a HDF5 song file, by default the first song in it
192
+ """
193
+ return h5.root.analysis.songs.cols.analysis_sample_rate[songidx]
194
+
195
+ def get_audio_md5(h5,songidx=0):
196
+ """
197
+ Get audio MD5 from a HDF5 song file, by default the first song in it
198
+ """
199
+ return h5.root.analysis.songs.cols.audio_md5[songidx]
200
+
201
+ def get_danceability(h5,songidx=0):
202
+ """
203
+ Get danceability from a HDF5 song file, by default the first song in it
204
+ """
205
+ return h5.root.analysis.songs.cols.danceability[songidx]
206
+
207
+ def get_duration(h5,songidx=0):
208
+ """
209
+ Get duration from a HDF5 song file, by default the first song in it
210
+ """
211
+ return h5.root.analysis.songs.cols.duration[songidx]
212
+
213
+ def get_end_of_fade_in(h5,songidx=0):
214
+ """
215
+ Get end of fade in from a HDF5 song file, by default the first song in it
216
+ """
217
+ return h5.root.analysis.songs.cols.end_of_fade_in[songidx]
218
+
219
+ def get_energy(h5,songidx=0):
220
+ """
221
+ Get energy from a HDF5 song file, by default the first song in it
222
+ """
223
+ return h5.root.analysis.songs.cols.energy[songidx]
224
+
225
+ def get_key(h5,songidx=0):
226
+ """
227
+ Get key from a HDF5 song file, by default the first song in it
228
+ """
229
+ return h5.root.analysis.songs.cols.key[songidx]
230
+
231
+ def get_key_confidence(h5,songidx=0):
232
+ """
233
+ Get key confidence from a HDF5 song file, by default the first song in it
234
+ """
235
+ return h5.root.analysis.songs.cols.key_confidence[songidx]
236
+
237
+ def get_loudness(h5,songidx=0):
238
+ """
239
+ Get loudness from a HDF5 song file, by default the first song in it
240
+ """
241
+ return h5.root.analysis.songs.cols.loudness[songidx]
242
+
243
+ def get_mode(h5,songidx=0):
244
+ """
245
+ Get mode from a HDF5 song file, by default the first song in it
246
+ """
247
+ return h5.root.analysis.songs.cols.mode[songidx]
248
+
249
+ def get_mode_confidence(h5,songidx=0):
250
+ """
251
+ Get mode confidence from a HDF5 song file, by default the first song in it
252
+ """
253
+ return h5.root.analysis.songs.cols.mode_confidence[songidx]
254
+
255
+ def get_start_of_fade_out(h5,songidx=0):
256
+ """
257
+ Get start of fade out from a HDF5 song file, by default the first song in it
258
+ """
259
+ return h5.root.analysis.songs.cols.start_of_fade_out[songidx]
260
+
261
+ def get_tempo(h5,songidx=0):
262
+ """
263
+ Get tempo from a HDF5 song file, by default the first song in it
264
+ """
265
+ return h5.root.analysis.songs.cols.tempo[songidx]
266
+
267
+ def get_time_signature(h5,songidx=0):
268
+ """
269
+ Get signature from a HDF5 song file, by default the first song in it
270
+ """
271
+ return h5.root.analysis.songs.cols.time_signature[songidx]
272
+
273
+ def get_time_signature_confidence(h5,songidx=0):
274
+ """
275
+ Get signature confidence from a HDF5 song file, by default the first song in it
276
+ """
277
+ return h5.root.analysis.songs.cols.time_signature_confidence[songidx]
278
+
279
+ def get_track_id(h5,songidx=0):
280
+ """
281
+ Get track id from a HDF5 song file, by default the first song in it
282
+ """
283
+ return h5.root.analysis.songs.cols.track_id[songidx]
284
+
285
+ def get_segments_start(h5,songidx=0):
286
+ """
287
+ Get segments start array. Takes care of the proper indexing if we are in aggregate
288
+ file. By default, return the array for the first song in the h5 file.
289
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
290
+ """
291
+ if h5.root.analysis.songs.nrows == songidx + 1:
292
+ return h5.root.analysis.segments_start[h5.root.analysis.songs.cols.idx_segments_start[songidx]:]
293
+ return h5.root.analysis.segments_start[h5.root.analysis.songs.cols.idx_segments_start[songidx]:
294
+ h5.root.analysis.songs.cols.idx_segments_start[songidx+1]]
295
+
296
+ def get_segments_confidence(h5,songidx=0):
297
+ """
298
+ Get segments confidence array. Takes care of the proper indexing if we are in aggregate
299
+ file. By default, return the array for the first song in the h5 file.
300
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
301
+ """
302
+ if h5.root.analysis.songs.nrows == songidx + 1:
303
+ return h5.root.analysis.segments_confidence[h5.root.analysis.songs.cols.idx_segments_confidence[songidx]:]
304
+ return h5.root.analysis.segments_confidence[h5.root.analysis.songs.cols.idx_segments_confidence[songidx]:
305
+ h5.root.analysis.songs.cols.idx_segments_confidence[songidx+1]]
306
+
307
+ def get_segments_pitches(h5,songidx=0):
308
+ """
309
+ Get segments pitches array. Takes care of the proper indexing if we are in aggregate
310
+ file. By default, return the array for the first song in the h5 file.
311
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
312
+ """
313
+ if h5.root.analysis.songs.nrows == songidx + 1:
314
+ return h5.root.analysis.segments_pitches[h5.root.analysis.songs.cols.idx_segments_pitches[songidx]:,:]
315
+ return h5.root.analysis.segments_pitches[h5.root.analysis.songs.cols.idx_segments_pitches[songidx]:
316
+ h5.root.analysis.songs.cols.idx_segments_pitches[songidx+1],:]
317
+
318
+ def get_segments_timbre(h5,songidx=0):
319
+ """
320
+ Get segments timbre array. Takes care of the proper indexing if we are in aggregate
321
+ file. By default, return the array for the first song in the h5 file.
322
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
323
+ """
324
+ if h5.root.analysis.songs.nrows == songidx + 1:
325
+ return h5.root.analysis.segments_timbre[h5.root.analysis.songs.cols.idx_segments_timbre[songidx]:,:]
326
+ return h5.root.analysis.segments_timbre[h5.root.analysis.songs.cols.idx_segments_timbre[songidx]:
327
+ h5.root.analysis.songs.cols.idx_segments_timbre[songidx+1],:]
328
+
329
+ def get_segments_loudness_max(h5,songidx=0):
330
+ """
331
+ Get segments loudness max array. Takes care of the proper indexing if we are in aggregate
332
+ file. By default, return the array for the first song in the h5 file.
333
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
334
+ """
335
+ if h5.root.analysis.songs.nrows == songidx + 1:
336
+ return h5.root.analysis.segments_loudness_max[h5.root.analysis.songs.cols.idx_segments_loudness_max[songidx]:]
337
+ return h5.root.analysis.segments_loudness_max[h5.root.analysis.songs.cols.idx_segments_loudness_max[songidx]:
338
+ h5.root.analysis.songs.cols.idx_segments_loudness_max[songidx+1]]
339
+
340
+ def get_segments_loudness_max_time(h5,songidx=0):
341
+ """
342
+ Get segments loudness max time array. Takes care of the proper indexing if we are in aggregate
343
+ file. By default, return the array for the first song in the h5 file.
344
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
345
+ """
346
+ if h5.root.analysis.songs.nrows == songidx + 1:
347
+ return h5.root.analysis.segments_loudness_max_time[h5.root.analysis.songs.cols.idx_segments_loudness_max_time[songidx]:]
348
+ return h5.root.analysis.segments_loudness_max_time[h5.root.analysis.songs.cols.idx_segments_loudness_max_time[songidx]:
349
+ h5.root.analysis.songs.cols.idx_segments_loudness_max_time[songidx+1]]
350
+
351
+ def get_segments_loudness_start(h5,songidx=0):
352
+ """
353
+ Get segments loudness start array. Takes care of the proper indexing if we are in aggregate
354
+ file. By default, return the array for the first song in the h5 file.
355
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
356
+ """
357
+ if h5.root.analysis.songs.nrows == songidx + 1:
358
+ return h5.root.analysis.segments_loudness_start[h5.root.analysis.songs.cols.idx_segments_loudness_start[songidx]:]
359
+ return h5.root.analysis.segments_loudness_start[h5.root.analysis.songs.cols.idx_segments_loudness_start[songidx]:
360
+ h5.root.analysis.songs.cols.idx_segments_loudness_start[songidx+1]]
361
+
362
+ def get_sections_start(h5,songidx=0):
363
+ """
364
+ Get sections start array. Takes care of the proper indexing if we are in aggregate
365
+ file. By default, return the array for the first song in the h5 file.
366
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
367
+ """
368
+ if h5.root.analysis.songs.nrows == songidx + 1:
369
+ return h5.root.analysis.sections_start[h5.root.analysis.songs.cols.idx_sections_start[songidx]:]
370
+ return h5.root.analysis.sections_start[h5.root.analysis.songs.cols.idx_sections_start[songidx]:
371
+ h5.root.analysis.songs.cols.idx_sections_start[songidx+1]]
372
+
373
+ def get_sections_confidence(h5,songidx=0):
374
+ """
375
+ Get sections confidence array. Takes care of the proper indexing if we are in aggregate
376
+ file. By default, return the array for the first song in the h5 file.
377
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
378
+ """
379
+ if h5.root.analysis.songs.nrows == songidx + 1:
380
+ return h5.root.analysis.sections_confidence[h5.root.analysis.songs.cols.idx_sections_confidence[songidx]:]
381
+ return h5.root.analysis.sections_confidence[h5.root.analysis.songs.cols.idx_sections_confidence[songidx]:
382
+ h5.root.analysis.songs.cols.idx_sections_confidence[songidx+1]]
383
+
384
+ def get_beats_start(h5,songidx=0):
385
+ """
386
+ Get beats start array. Takes care of the proper indexing if we are in aggregate
387
+ file. By default, return the array for the first song in the h5 file.
388
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
389
+ """
390
+ if h5.root.analysis.songs.nrows == songidx + 1:
391
+ return h5.root.analysis.beats_start[h5.root.analysis.songs.cols.idx_beats_start[songidx]:]
392
+ return h5.root.analysis.beats_start[h5.root.analysis.songs.cols.idx_beats_start[songidx]:
393
+ h5.root.analysis.songs.cols.idx_beats_start[songidx+1]]
394
+
395
+ def get_beats_confidence(h5,songidx=0):
396
+ """
397
+ Get beats confidence array. Takes care of the proper indexing if we are in aggregate
398
+ file. By default, return the array for the first song in the h5 file.
399
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
400
+ """
401
+ if h5.root.analysis.songs.nrows == songidx + 1:
402
+ return h5.root.analysis.beats_confidence[h5.root.analysis.songs.cols.idx_beats_confidence[songidx]:]
403
+ return h5.root.analysis.beats_confidence[h5.root.analysis.songs.cols.idx_beats_confidence[songidx]:
404
+ h5.root.analysis.songs.cols.idx_beats_confidence[songidx+1]]
405
+
406
+ def get_bars_start(h5,songidx=0):
407
+ """
408
+ Get bars start array. Takes care of the proper indexing if we are in aggregate
409
+ file. By default, return the array for the first song in the h5 file.
410
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
411
+ """
412
+ if h5.root.analysis.songs.nrows == songidx + 1:
413
+ return h5.root.analysis.bars_start[h5.root.analysis.songs.cols.idx_bars_start[songidx]:]
414
+ return h5.root.analysis.bars_start[h5.root.analysis.songs.cols.idx_bars_start[songidx]:
415
+ h5.root.analysis.songs.cols.idx_bars_start[songidx+1]]
416
+
417
+ def get_bars_confidence(h5,songidx=0):
418
+ """
419
+ Get bars confidence array. Takes care of the proper indexing if we are in aggregate
420
+ file. By default, return the array for the first song in the h5 file.
421
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
422
+ """
423
+ if h5.root.analysis.songs.nrows == songidx + 1:
424
+ return h5.root.analysis.bars_confidence[h5.root.analysis.songs.cols.idx_bars_confidence[songidx]:]
425
+ return h5.root.analysis.bars_confidence[h5.root.analysis.songs.cols.idx_bars_confidence[songidx]:
426
+ h5.root.analysis.songs.cols.idx_bars_confidence[songidx+1]]
427
+
428
+ def get_tatums_start(h5,songidx=0):
429
+ """
430
+ Get tatums start array. Takes care of the proper indexing if we are in aggregate
431
+ file. By default, return the array for the first song in the h5 file.
432
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
433
+ """
434
+ if h5.root.analysis.songs.nrows == songidx + 1:
435
+ return h5.root.analysis.tatums_start[h5.root.analysis.songs.cols.idx_tatums_start[songidx]:]
436
+ return h5.root.analysis.tatums_start[h5.root.analysis.songs.cols.idx_tatums_start[songidx]:
437
+ h5.root.analysis.songs.cols.idx_tatums_start[songidx+1]]
438
+
439
+ def get_tatums_confidence(h5,songidx=0):
440
+ """
441
+ Get tatums confidence array. Takes care of the proper indexing if we are in aggregate
442
+ file. By default, return the array for the first song in the h5 file.
443
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
444
+ """
445
+ if h5.root.analysis.songs.nrows == songidx + 1:
446
+ return h5.root.analysis.tatums_confidence[h5.root.analysis.songs.cols.idx_tatums_confidence[songidx]:]
447
+ return h5.root.analysis.tatums_confidence[h5.root.analysis.songs.cols.idx_tatums_confidence[songidx]:
448
+ h5.root.analysis.songs.cols.idx_tatums_confidence[songidx+1]]
449
+
450
+ def get_artist_mbtags(h5,songidx=0):
451
+ """
452
+ Get artist musicbrainz tag array. Takes care of the proper indexing if we are in aggregate
453
+ file. By default, return the array for the first song in the h5 file.
454
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
455
+ """
456
+ if h5.root.musicbrainz.songs.nrows == songidx + 1:
457
+ return h5.root.musicbrainz.artist_mbtags[h5.root.musicbrainz.songs.cols.idx_artist_mbtags[songidx]:]
458
+ return h5.root.musicbrainz.artist_mbtags[h5.root.metadata.songs.cols.idx_artist_mbtags[songidx]:
459
+ h5.root.metadata.songs.cols.idx_artist_mbtags[songidx+1]]
460
+
461
+ def get_artist_mbtags_count(h5,songidx=0):
462
+ """
463
+ Get artist musicbrainz tag count array. Takes care of the proper indexing if we are in aggregate
464
+ file. By default, return the array for the first song in the h5 file.
465
+ To get a regular numpy ndarray, cast the result to: numpy.array( )
466
+ """
467
+ if h5.root.musicbrainz.songs.nrows == songidx + 1:
468
+ return h5.root.musicbrainz.artist_mbtags_count[h5.root.musicbrainz.songs.cols.idx_artist_mbtags[songidx]:]
469
+ return h5.root.musicbrainz.artist_mbtags_count[h5.root.metadata.songs.cols.idx_artist_mbtags[songidx]:
470
+ h5.root.metadata.songs.cols.idx_artist_mbtags[songidx+1]]
471
+
472
+ def get_year(h5,songidx=0):
473
+ """
474
+ Get release year from a HDF5 song file, by default the first song in it
475
+ """
476
+ return h5.root.musicbrainz.songs.cols.year[songidx]
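A minimal usage sketch for the getters above, assuming ```msd_summary_file.h5``` has been downloaded into ```data_files``` as the readme describes (```run.py```, the next file, iterates over the same summary file in this way):

```python
# Minimal sketch: read a few fields from the MSD summary file with hdf5_getters.
# Assumes msd_summary_file.h5 sits at ../../data_files, the path used by run.py.
import hdf5_getters

h5 = hdf5_getters.open_h5_file_read("../../data_files/msd_summary_file.h5")
try:
    n_songs = hdf5_getters.get_num_songs(h5)
    for i in range(min(5, n_songs)):                       # peek at the first rows
        track_id = hdf5_getters.get_track_id(h5, i).decode("utf-8")
        title = hdf5_getters.get_title(h5, i).decode("utf-8")
        tempo = hdf5_getters.get_tempo(h5, i)
        print(track_id, title, tempo)
finally:
    h5.close()
```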
midi_emotion/src/create_dataset/run.py ADDED
@@ -0,0 +1,476 @@
1
+ import json
2
+ import pretty_midi
3
+ import pypianoroll
4
+ import hdf5_getters
5
+ from tqdm import tqdm
6
+ import os
7
+ import concurrent.futures
8
+ import collections
9
+ import utils
10
+ from glob import glob
11
+ import pandas as pd
12
+ import csv
13
+ from copy import deepcopy
14
+
15
+ """
16
+ Written by Serkan Sulun
17
+
18
+ Creates labels for Lakh MIDI (or pianoroll) dataset.
19
+ Labels include low-level MIDI features such as tempo, note density and number of instruments.
20
+ They also include high-level features obtained from Spotify Developer API, such as valence, energy, etc.
21
+
22
+ See utils.py and fill in the variables client_id and client_secret.
23
+
24
+ When the user quota is exceeded, Spotify blocks access and the script gets stuck.
25
+ In that case, you may need to re-run the script some time later,
26
+ or use a different account with different client ID and secret.
27
+ """
28
+
29
+ def run_parallel(func, my_iter):
30
+ # Parallel processing visualized with tqdm
31
+ with concurrent.futures.ProcessPoolExecutor() as executor:
32
+ results = list(tqdm(executor.map(func, my_iter), total=len(my_iter)))
33
+ return results
34
+
35
+ write = False
36
+ redo = True
37
+
38
+ main_output_dir = "../../data_files/features"
39
+ os.makedirs(main_output_dir, exist_ok=True)
40
+
41
+ match_scores_path = "../../data_files/match_scores.json"
42
+ msd_summary_path = "../../data_files/msd_summary_file.h5"
43
+ echonest_folder_path = "../../data_files/millionsongdataset_echonest"
44
+
45
+ use_pianoroll_dataset = True
46
+ if use_pianoroll_dataset:
47
+ midi_dataset_path = "../../data_files/lpd_full/lpd/lpd_full"
48
+ extension = ".npz"
49
+ output_dir = os.path.join(main_output_dir, "pianoroll")
50
+ else:
51
+ midi_dataset_path = "lmd_full"
52
+ extension = ".mid"
53
+ output_dir = os.path.join(main_output_dir, "midi")
54
+ os.makedirs(output_dir, exist_ok=True)
55
+
56
+ ### PART I: Map track_ids (in midi dataset) to Spotify features
57
+
58
+ ### 1- Create mappings track_id (in midi dataset) -> metadata (in Echonest)
59
+
60
+ output_path = os.path.join(output_dir, "trackid_to_songid.json")
61
+
62
+ with open(match_scores_path, "r") as f:
63
+ match_scores = json.load(f)
64
+
65
+ track_ids = sorted(list(match_scores.keys()))
66
+
67
+ if os.path.exists(output_path) and not redo:
68
+ with open(output_path, "r") as f:
69
+ trackid_to_songid = json.load(f)
70
+ else:
71
+ h5_msd = hdf5_getters.open_h5_file_read(msd_summary_path)
72
+ n_msd = hdf5_getters.get_num_songs(h5_msd)
73
+
74
+ trackid_to_songid = {}
75
+ print("Adding metadata to each track in Lakh dataset")
76
+
77
+ for i in tqdm(range(n_msd)):
78
+ track_id = hdf5_getters.get_track_id(h5_msd, i).decode("utf-8")
79
+ if track_id in track_ids:
80
+ # get data from MSD
81
+ song_id = hdf5_getters.get_song_id(h5_msd, i).decode("utf-8")
82
+ artist = hdf5_getters.get_artist_name(h5_msd, i).decode("utf-8")
83
+ title = hdf5_getters.get_title(h5_msd, i).decode("utf-8")
84
+ release = hdf5_getters.get_release(h5_msd, i).decode("utf-8")
85
+ trackid_to_songid[track_id] = {"song_id": song_id,"title": title,
86
+ "artist": artist, "release": release}
87
+
88
+ # sort
89
+ trackid_to_songid = collections.OrderedDict(sorted(trackid_to_songid.items()))
90
+ if write:
91
+ with open(output_path, "w") as f:
92
+ json.dump(trackid_to_songid, f, indent=4)
93
+ print(f"Output saved to {output_path}")
94
+
95
+ ### 2- Create mappings metadata (in Echonest) -> Spotify IDs
96
+ output_path = os.path.join(output_dir, "songid_to_spotify.json")
97
+ if os.path.exists(output_path) and not redo:
98
+ with open(output_path, "r") as f:
99
+ songid_to_spotify = json.load(f)
100
+ else:
101
+ song_ids = sorted([val["song_id"] for val in trackid_to_songid.values()])
102
+ songid_to_spotify = {}
103
+ print("Mapping Echonest song IDs to Spotify song IDs")
104
+ for song_id in tqdm(song_ids):
105
+ file_path = os.path.join(echonest_folder_path, song_id[2:4], song_id + ".json")
106
+ spotify_ids = utils.get_spotify_ids(file_path)
107
+ songid_to_spotify[song_id] = spotify_ids
108
+ if write:
109
+ with open(output_path, "w") as f:
110
+ json.dump(songid_to_spotify, f, indent=4)
111
+ print(f"Output saved to {output_path}")
112
+
113
+
114
+ ### 3- Merge and add Spotify features
115
+ output_path = os.path.join(output_dir, "trackid_to_spotify_features.json")
116
+ # When user quota is exceeded, Spotify blocks access and the script gets stuck.
117
+ # In that case, you may need to re-run the script some time later,
118
+ # or use a different account with different client ID and secret.
119
+ # So we keep an incomplete csv file, so that we can continue later from where we left off.
120
+ output_path_incomplete = os.path.join(output_dir, "incomplete_trackid_to_spotify_features.csv")
121
+
122
+ if os.path.exists(output_path) and not redo:
123
+ with open(output_path, "r") as f:
124
+ trackid_to_spotify_features = json.load(f)
125
+ else:
126
+ fieldnames = ["track_id", "song_id", "title", "artist", "release",
127
+ "spotify_id", "spotify_title", "spotify_artist", "spotify_album", "spotify_audio_features"]
128
+
129
+ data_to_process = deepcopy(trackid_to_songid)
130
+ write_header = True
131
+
132
+ if os.path.exists(output_path_incomplete):
133
+ # Continue from where we left off
134
+ data_already_processed = utils.read_csv(output_path_incomplete)
135
+ track_ids_already_processed = [entry["track_id"] for entry in data_already_processed]
136
+ data_to_process = {key: value for key, value in data_to_process.items() if key not in track_ids_already_processed}
137
+ write_header = False
138
+
139
+ with open(output_path_incomplete, "a") as f_out:
140
+ csv_writer = csv.DictWriter(f_out, fieldnames=fieldnames)
141
+ if write_header:
142
+ csv_writer.writeheader()
143
+
144
+ print("Adding Spotify features")
145
+ for track_id, data in tqdm(data_to_process.items()):
146
+ data["track_id"] = track_id
147
+ album = data["release"]
148
+ spotify_ids = songid_to_spotify[data["song_id"]]
149
+ if spotify_ids == []:
150
+ # use metadata to search spotify
151
+ best_spotify_track = utils.search_spotify_flexible(data["title"], data["artist"], data["release"])
152
+ else:
153
+ spotify_tracks = utils.get_spotify_tracks(spotify_ids)
154
+ if spotify_tracks is None:
155
+ for key in ["id", "title", "artist", "album", "audio_features"]:
156
+ data["spotify_" + key] = None
157
+ elif len(spotify_tracks) > 1:
158
+ # find best spotify id by comparing album names
159
+ best_match_score = 0
160
+ best_match_ind = 0
161
+ for i, track in enumerate(spotify_tracks):
162
+ if track is not None:
163
+ spotify_album = track["album"]["name"] if track is not None else ""
164
+ match_score = utils.matching_strings_flexible(album, spotify_album)
165
+
166
+ if match_score > best_match_score:
167
+ best_match_score = match_score
168
+ best_match_ind = i
169
+
170
+ best_spotify_track = spotify_tracks[best_match_ind]
171
+ else:
172
+ best_spotify_track = spotify_tracks[0]
173
+
174
+ if best_spotify_track is not None:
175
+ spotify_id = best_spotify_track["uri"].split(":")[-1]
176
+ spotify_audio_features = utils.get_spotify_features(spotify_id)[0]
177
+
178
+ # if spotify_audio_features["valence"] == 0.0:
179
+ # # A large portion of files have 0.0 valence, although they are NaNs
180
+ # spotify_audio_features["valence"] = float("nan")
181
+ spotify_artists = ", ".join([artist["name"] for artist in best_spotify_track["artists"]])
182
+
183
+ data["spotify_id"] = spotify_id
184
+ data["spotify_title"] = best_spotify_track['name']
185
+ data["spotify_artist"] = spotify_artists
186
+ data["spotify_album"] = best_spotify_track["album"]["name"]
187
+ data["spotify_audio_features"] = spotify_audio_features
188
+ else:
189
+ for key in ["id", "title", "artist", "album", "audio_features"]:
190
+ data["spotify_" + key] = None
191
+
192
+ csv_writer.writerow(data)
193
+
194
+ # Now write final data to json
195
+ trackid_to_spotify_features_list = utils.read_csv(output_path_incomplete)
196
+ trackid_to_spotify_features = {}
197
+ # unlike json, csv doesn't support a dict within a dict, so convert it back to a dict manually
198
+ for item in trackid_to_spotify_features_list:
199
+ spotify_audio_features = item["spotify_audio_features"]
200
+ if spotify_audio_features != "":
201
+ spotify_audio_features = eval(spotify_audio_features)
202
+ item["spotify_audio_features"] = spotify_audio_features
203
+ track_id = deepcopy(item["track_id"])
204
+ del item["track_id"]
205
+ trackid_to_spotify_features[track_id] = item
206
+
207
+ if write:
208
+ with open(output_path, "w") as f:
209
+ json.dump(trackid_to_spotify_features, f, indent=4)
210
+ print(f"Output saved to {output_path}")
211
+
212
+
213
+ ### PART II: Dealing with symbolic music data
214
+ ### 4- Revert matching scores
215
+ """ Matched data has the format: track_ID -> midi_file
216
+ where multiple tracks could be mapped to a single midi file.
217
+ We want to revert this mapping and then keep unique midi files
218
+ Revert match scores file to have mapping midi_file -> track_ID
219
+ """
220
+
221
+ output_path = os.path.join(output_dir, "match_scores_reverse.json")
222
+ if os.path.exists(output_path) and not redo:
223
+ with open(output_path, "r") as f:
224
+ match_scores_reversed = json.load(f)
225
+ else:
226
+ with open(match_scores_path, "r") as f:
227
+ in_data = json.load(f)
228
+ match_scores_reversed = {}
229
+ print("Reversing match scores.")
230
+ for track_id, matching in tqdm(in_data.items()):
231
+ for file_, score in matching.items():
232
+ if file_ not in match_scores_reversed.keys():
233
+ match_scores_reversed[file_] = {track_id: score}
234
+ else:
235
+ match_scores_reversed[file_][track_id] = score
236
+
237
+ # order match scores
238
+ for k in match_scores_reversed.keys():
239
+ match_scores_reversed[k] = collections.OrderedDict(sorted(match_scores_reversed[k].items(), reverse=True, key=lambda x: x[-1]))
240
+
241
+ # order filenames
242
+ match_scores_reversed = collections.OrderedDict(sorted(match_scores_reversed.items(), key=lambda x: x[0]))
243
+ if write:
244
+ with open(output_path, "w") as f:
245
+ json.dump(match_scores_reversed, f, indent=4)
246
+ print(f"Output saved to {output_path}")
247
+
248
+ # 5- Filter match scores to only keep best match
249
+ output_path = os.path.join(output_dir, "best_match_scores.json")
250
+ if os.path.exists(output_path) and not redo:
251
+ with open(output_path, "r") as f:
252
+ best_match_scores_reversed = json.load(f)
253
+ else:
254
+ best_match_scores_reversed = {}
255
+ print("Selecting best matching tracks.")
256
+ for midi_file, match in tqdm(match_scores_reversed.items()):
257
+ best_match_scores_reversed[midi_file] = list(match.items())[0]
258
+ if write:
259
+ with open(output_path, "w") as f:
260
+ json.dump(best_match_scores_reversed, f, indent=4)
261
+ print(f"Output saved to {output_path}")
262
+
263
+ ### 6- Filter unique midis
264
+ """LMD was created by creating hashes for the entire files
265
+ and then keeping files with unique hashes.
266
+ However, some files' musical content are the same, and only their metadata are different.
267
+ So we hash the content (pianoroll array), and further filter out the unique ones."""
268
+ # Create hashes for midis
269
+
270
+ output_path = os.path.join(output_dir, "hashes.json")
271
+
272
+ if os.path.exists(output_path) and not redo:
273
+ with open(output_path, "r") as f:
274
+ midi_file_to_hash = json.load(f)
275
+ else:
276
+ def get_hash_and_file(path):
277
+ hash_ = utils.get_hash(path)
278
+ file_ = os.path.basename(path)
279
+ file_ = file_[:-4]
280
+ return [file_, hash_]
281
+
282
+ file_paths = sorted(glob(midi_dataset_path + "/**/*" + extension, recursive=True))
283
+ assert len(file_paths) > 0, f"No MIDI files found at {midi_dataset_path}"
284
+ print("Getting hashes for MIDIs.")
285
+ midi_file_to_hash = run_parallel(get_hash_and_file, file_paths)
286
+ midi_file_to_hash = sorted(midi_file_to_hash, key=lambda x:x[0])
287
+ midi_file_to_hash = dict(midi_file_to_hash)
288
+ if write:
289
+ with open(output_path, "w") as f:
290
+ json.dump(midi_file_to_hash, f, indent=4)
291
+ print(f"Output saved to {output_path}")
292
+
293
+ # also do the reverse hash -> midi
294
+ output_path = os.path.join(output_dir, "unique_files.json")
295
+ if os.path.exists(output_path) and not redo:
296
+ with open(output_path, "r") as f:
297
+ midi_files_unique = json.load(f)
298
+ else:
299
+ hash_to_midi_file = {}
300
+ for midi_file, hash in midi_file_to_hash.items():
301
+ try:
302
+ best_match_score = best_match_scores_reversed[midi_file][1]
303
+ except:
304
+ best_match_score = 0
305
+ if hash in hash_to_midi_file.keys():
306
+ hash_to_midi_file[hash].append((midi_file, best_match_score))
307
+ else:
308
+ hash_to_midi_file[hash] = [(midi_file, best_match_score)]
309
+
310
+ midi_files_unique = []
311
+ # Get unique midis (with highest match score)
312
+ print("Getting unique MIDIs.")
313
+ for hash, midi_files_and_match_scores in hash_to_midi_file.items():
314
+ if hash != "empty_pianoroll":
315
+ midi_files_and_match_scores = sorted(midi_files_and_match_scores, key=lambda x: x[1], reverse=True)
316
+ midi_files_unique.append(midi_files_and_match_scores[0][0])
317
+ if write:
318
+ with open(output_path, "w") as f:
319
+ json.dump(midi_files_unique, f, indent=4)
320
+ print(f"Output saved to {output_path}")
321
+
322
+ # create unique matched midis list
323
+ midi_files_matched = list(match_scores_reversed.keys())
324
+
325
+ output_path = os.path.join(output_dir, "midis_matched_unique.json")
326
+ if os.path.exists(output_path) and not redo:
327
+ with open(output_path, "r") as f:
328
+ midi_files_matched_unique = json.load(f)
329
+ else:
330
+ print("Getting unique matched MIDIs.")
331
+ midi_files_matched_unique = sorted(list(set(midi_files_matched).intersection(midi_files_unique)))
332
+ if write:
333
+ with open(output_path, "w") as f:
334
+ json.dump(midi_files_matched_unique, f, indent=4)
335
+ print(f"Output saved to {output_path}")
336
+
337
+ # create unique unmatched midis list
338
+ output_path = os.path.join(output_dir, "midis_unmatched_unique.json")
339
+ if os.path.exists(output_path) and not redo:
340
+ with open(output_path, "r") as f:
341
+ midi_files_unmatched_unique = json.load(f)
342
+ else:
343
+ print("Getting unique unmatched MIDIs.")
344
+ midi_files_unmatched_unique = sorted(list(set(midi_files_unique) - set(midi_files_matched_unique)))
345
+ if write:
346
+ with open(output_path, "w") as f:
347
+ json.dump(midi_files_unmatched_unique, f, indent=4)
348
+ print(f"Output saved to {output_path}")
349
+
350
+ ### 7- Create mappings: midi -> best matching track ID, spotify features
351
+ output_path = os.path.join(output_dir, "spotify_features.json")
352
+ if os.path.exists(output_path) and not redo:
353
+ with open(output_path, "r") as f:
354
+ midi_file_to_spotify_features = json.load(f)
355
+ else:
356
+ midi_file_to_spotify_features = {}
357
+ print("Adding Spotify for matched unique MIDIs.")
358
+ for pr in tqdm(midi_files_matched_unique):
359
+ sample_data = {}
360
+ sample_data["track_id"], sample_data["match_score"] = best_match_scores_reversed[pr]
361
+ metadata_and_spotify = trackid_to_spotify_features[sample_data["track_id"]]
362
+ sample_data.update(metadata_and_spotify)
363
+ midi_file_to_spotify_features[pr] = sample_data
364
+ if write:
365
+ with open(output_path, "w") as f:
366
+ json.dump(midi_file_to_spotify_features, f, indent=4)
367
+ print(f"Output saved to {output_path}")
368
+
369
+ ### 8- For all midis, get low level features
370
+ # (tempo, note density, number of instruments)
371
+
372
+ output_path = os.path.join(output_dir, "midi_features.json")
373
+ if os.path.exists(output_path) and not redo:
374
+ with open(output_path, "r") as f:
375
+ midi_file_to_midi_features = json.load(f)
376
+ else:
377
+ def get_midi_features(midi_file):
378
+ midi_path = os.path.join(midi_dataset_path, midi_file[0], midi_file + extension)
379
+ if use_pianoroll_dataset:
380
+ mid = pypianoroll.load(midi_path).to_pretty_midi()
381
+ else:
382
+ mid = pretty_midi.PrettyMIDI(midi_path)
383
+ note_density = utils.get_note_density(mid)
384
+ tempo = utils.get_tempo(mid)
385
+ n_instruments = utils.get_n_instruments(mid)
386
+ duration = mid.get_end_time()
387
+ midi_features = {
388
+ "note_density": note_density,
389
+ "tempo": tempo,
390
+ "n_instruments": n_instruments,
391
+ "duration": duration,
392
+ }
393
+ return [midi_file, midi_features]
394
+ print("Getting low-level MIDI features")
395
+ midi_file_to_midi_features = run_parallel(get_midi_features, midi_files_unique)
396
+ midi_file_to_midi_features = dict(midi_file_to_midi_features)
397
+ if write:
398
+ with open(output_path, "w") as f:
399
+ json.dump(midi_file_to_midi_features, f, indent=4)
400
+ print(f"Output saved to {output_path}")
401
+
402
+ ### 9- Merge MIDI features and matched (Spotify) features
403
+ output_path = os.path.join(output_dir, "full_dataset_features.json")
404
+ if os.path.exists(output_path) and not redo:
405
+ with open(output_path, "r") as f:
406
+ midi_file_to_merged_features = json.load(f)
407
+ else:
408
+ midi_file_to_merged_features = {}
409
+ print("Merging MIDI features and Spotify features for full dataset.")
410
+ for midi_file in tqdm(midi_file_to_midi_features.keys()):
411
+ midi_file_to_merged_features[midi_file] = {}
412
+ midi_file_to_merged_features[midi_file]["midi_features"] = midi_file_to_midi_features[midi_file]
413
+ if midi_file in midi_file_to_spotify_features.keys():
414
+ matched_features = midi_file_to_spotify_features[midi_file]
415
+ else:
416
+ matched_features = {}
417
+ midi_file_to_merged_features[midi_file]["matched_features"] = matched_features
418
+ if write:
419
+ with open(output_path, "w") as f:
420
+ json.dump(midi_file_to_merged_features, f, indent=4)
421
+ print(f"Output saved to {output_path}")
422
+
423
+ ### Do the same for matched dataset
424
+ output_path = os.path.join(output_dir, "matched_dataset_features.json")
425
+ if os.path.exists(output_path) and not redo:
426
+ with open(output_path, "r") as f:
427
+ matched_midi_file_to_merged_features = json.load(f)
428
+ else:
429
+ print("Merging MIDI features and Spotify features for the matched dataset.")
430
+ matched_midi_file_to_merged_features = \
431
+ {file_: midi_file_to_merged_features[file_] for file_ in tqdm(midi_files_matched_unique)}
432
+ if write:
433
+ with open(output_path, "w") as f:
434
+ json.dump(matched_midi_file_to_merged_features, f, indent=4)
435
+ print(f"Output saved to {output_path}")
436
+
437
+ ### PART III: Constructing training dataset
438
+ ### 10- Summarize matched dataset features by only taking valence and note densities per instrument,
439
+ # number of instruments, durations, is_matched
440
+
441
+ output_path = os.path.join(output_dir, "full_dataset_features_summarized.csv")
442
+ if not os.path.exists(output_path) or redo:
443
+ print("Constructing training dataset (final file)")
444
+ dataset_summarized = []
445
+ for midi_file, features in tqdm(midi_file_to_merged_features.items()):
446
+ midi_features = features["midi_features"]
447
+ n_instruments = midi_features["n_instruments"]
448
+ note_density_per_instrument = midi_features["note_density"] / n_instruments
449
+ matched_features = features["matched_features"]
450
+ if matched_features == {}:
451
+ is_matched = False
452
+ valence = float("nan")
453
+ else:
454
+ is_matched = True
455
+ spotify_audio_features = matched_features["spotify_audio_features"]
456
+ if spotify_audio_features is None or spotify_audio_features == "":
457
+ valence = float("nan")
458
+ else:
459
+ if spotify_audio_features["valence"] == 0.0:
460
+ # An unusual number of samples have a valence of 0.0
461
+ # which is possibly due to an error. Feel free to comment out.
462
+ valence = float("nan")
463
+ else:
464
+ valence = spotify_audio_features["valence"]
465
+
466
+ dataset_summarized.append({
467
+ "file": midi_file,
468
+ "is_matched": is_matched,
469
+ "n_instruments": n_instruments,
470
+ "note_density_per_instrument": note_density_per_instrument,
471
+ "valence": valence
472
+ })
473
+ dataset_summarized = pd.DataFrame(dataset_summarized)
474
+ if write:
475
+ dataset_summarized.to_csv(output_path, index=False)
476
+ print(f"Output saved to {output_path}")
midi_emotion/src/create_dataset/utils.py ADDED
@@ -0,0 +1,216 @@
1
+ import spotipy
2
+ from spotipy.oauth2 import SpotifyClientCredentials
3
+ import re
4
+ import hashlib
5
+ import json
6
+ import pypianoroll
7
+ import numpy as np
8
+ import pretty_midi
9
+ import csv
10
+
11
+ """
12
+ You'll need a client ID and a client secret:
13
+ https://developer.spotify.com/dashboard/applications
14
+ Then, fill in the variables client_id and client_secret
15
+ """
16
+
17
+ client_id = 'c520641b167a4cd0872d48e5232a41e6'
18
+ client_secret = 'a455993eda164da2b67462c2e1382e91'
19
+ client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
20
+ sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
21
+
22
+ def get_drums_note_density(mid):
23
+ drum_mid = pretty_midi.PrettyMIDI()
24
+ for instrument in mid.instruments:
25
+ if instrument.is_drum:
26
+ drum_mid.instruments.append(instrument)
27
+ if len(drum_mid.instruments) != 1 or len(drum_mid.instruments[0].notes) == 0:
28
+ return float("nan")
29
+ else:
30
+ start_time = drum_mid.instruments[0].notes[0].start
31
+ end_time = drum_mid.instruments[0].notes[-1].end
32
+ duration = end_time - start_time
33
+ n_notes = len(drum_mid.instruments[0].notes)
34
+ density = n_notes / duration
35
+ return density
36
+
37
+ def get_md5(path):
38
+ with open(path, "rb") as f:
39
+ md5 = hashlib.md5(f.read()).hexdigest()
40
+ return md5
41
+
42
+ def get_hash(path):
43
+ if path[-4:] == ".mid":
44
+ try:
45
+ mid = pretty_midi.PrettyMIDI(path)
46
+ except:
47
+ return "empty_pianoroll"
48
+ try:
49
+ rolls = mid.get_piano_roll()
50
+ except:
51
+ return "empty_pianoroll"
52
+ if rolls.size == 0:
53
+ return "empty_pianoroll"
54
+ else:
55
+ pr = pypianoroll.load(path)
56
+ tracks = sorted(pr.tracks, key=lambda x: x.name)
57
+ rolls = [track.pianoroll for track in tracks if track.pianoroll.shape[0] > 0]
58
+ if rolls == []:
59
+ return "empty_pianoroll"
60
+ rolls = np.concatenate(rolls, axis=-1)
61
+ hash_ = hashlib.sha1(np.ascontiguousarray(rolls)).hexdigest()
62
+ return hash_
63
+
64
+ def get_note_density(mid):
65
+ duration = mid.get_end_time()
66
+ n_notes = sum([1 for instrument in mid.instruments for note in instrument.notes])
67
+ density = n_notes / duration
68
+ return density
69
+
70
+ def get_tempo(mid):
71
+ tick_scale = mid._tick_scales[-1][-1]
72
+ resolution = mid.resolution
73
+ beat_duration = tick_scale * resolution
74
+ mid_tempo = 60 / beat_duration
75
+ return mid_tempo
76
+
77
+ def get_n_instruments(mid):
78
+ n_instruments = sum([1 for instrument in mid.instruments if instrument.notes != []])
79
+ return n_instruments
80
+
81
+ def try_multiple(func, *args, **kwargs):
82
+ n_max = 29
83
+ n = 0
84
+ failed = True
85
+ while failed:
86
+ if n > n_max:
87
+ return None
88
+ try:
89
+ if args:
90
+ out = func(*args)
91
+ elif kwargs:
92
+ out = func(**kwargs)
93
+ failed = False
94
+ except Exception as e:
95
+ # print(e.error_description)
96
+ if e.args[0] == 404:
97
+ return None
98
+ else:
99
+ n += 1
100
+ return out
101
+
102
+ def search_spotify(title, artist, album=None):
103
+ query = '"{}"+artist:"{}"'.format(title, artist)
104
+ if album is not None:
105
+ query += '+album:"{}"'.format(album)
106
+ if len(query) <= 250:
107
+ result = try_multiple(sp.search, q=query, type='track')
108
+ items = result['tracks']['items']
109
+ else: # Spotify doesn't search with a query longer than 250 characters
110
+ items = []
111
+ return items
112
+
113
+
114
+ def search_spotify_flexible(title, artist, album):
115
+ # Find Spotify URI based on metadata
116
+ items = search_spotify(title, artist, album)
117
+ if items == []:
118
+ items = search_spotify(title, artist)
119
+ if items == []:
120
+ title = fix_string(title)
121
+ items = search_spotify(title, artist)
122
+ if items == []:
123
+ artist = fix_string(artist)
124
+ items = search_spotify(title, artist)
125
+ if items == []:
126
+ artist = strip_artist(artist)
127
+ items = search_spotify(title, artist)
128
+ if items == []:
129
+ return None
130
+
131
+ elif len(items) == 1:
132
+ item = items[0]
133
+ else:
134
+ # Return most popular
135
+ max_popularity = 0
136
+ best_ind = 0
137
+ for i, item in enumerate(items):
138
+ if item is not None:
139
+ if item["popularity"] > max_popularity:
140
+ max_popularity = item["popularity"]
141
+ best_ind = i
142
+ item = items[best_ind]
143
+ return item
144
+
145
+ def matching_strings_flexible(a, b):
146
+ if a == "" or b == "":
147
+ matches = 0.0
148
+ else:
149
+ a = fix_string(a)
150
+ b = fix_string(b)
151
+ a = a.replace("'", "")
152
+ b = b.replace("'", "")
153
+ min_len = min(len(a), len(b))
154
+ matches = 0
155
+ for i in range(min_len):
156
+ if a[i] == b[i]:
157
+ matches += 1
158
+ matches /= min_len
159
+ return matches
160
+
161
+ def get_spotify_features(uri_list):
162
+ features = try_multiple(sp.audio_features, uri_list)
163
+ return features
164
+
165
+ def get_spotify_tracks(uri_list):
166
+ if len(uri_list) > 50:
167
+ uri_list = uri_list[:50]
168
+ tracks = try_multiple(sp.tracks, uri_list)
169
+ if tracks == None:
170
+ return None
171
+ else:
172
+ return tracks["tracks"]
173
+
174
+
175
+ def strip_artist(s):
176
+ s = s.lower() # lowercase
177
+ s = s.replace("the ", "")
178
+ keys = [' - ', '/', ' ft', 'feat', 'featuring', ' and ', ' with ', '_', ' vs', '&', ';', '+']
179
+ for key in keys:
180
+ loc = s.find(key)
181
+ if loc != -1:
182
+ s = s[:loc]
183
+ return s
184
+
185
+ def fix_string(s):
186
+ if s != "":
187
+ s = s.lower() # lowercase
188
+ s = s.replace('\'s', '') # remove 's
189
+ s = s.replace('_', ' ') # remove _
190
+ s = re.sub("[\(\[].*?[\)\]]", "", s) # remove everything in parantheses
191
+ if s[-1] == " ": # remove space at the end
192
+ s = s[:-1]
193
+ return s
194
+
195
+ def logprint(s, f):
196
+ f.write(s + '\n')
197
+
198
+ def get_spotify_ids(json_path):
199
+ with open(json_path) as f_json:
200
+ json_data = json.load(f_json)
201
+ json_data = json_data["response"]["songs"]
202
+ if len(json_data) == 0:
203
+ spotify_ids = []
204
+ else:
205
+ json_data = json_data[0]
206
+ spotify_ids = []
207
+ for track in json_data["tracks"]:
208
+ if track["catalog"] == "spotify" and "foreign_id" in list(track.keys()):
209
+ spotify_ids.append(track["foreign_id"].split(":")[-1])
210
+ return spotify_ids
211
+
212
+ def read_csv(input_file_path, delimiter=","):
213
+ with open(input_file_path, "r") as f_in:
214
+ reader = csv.DictReader(f_in, delimiter=delimiter)
215
+ data = [{key: value for key, value in row.items()} for row in reader]
216
+ return data
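The Track-to-MIDI matching above relies on fix_string normalization plus the character-level agreement score in matching_strings_flexible. Below is a self-contained sketch of that scoring, reimplemented here so it runs without Spotify credentials; the helper names only mirror the ones above.

import re

def fix_string(s):
    # same normalization idea as utils.fix_string: lowercase, drop 's, _, parenthesized text
    if s != "":
        s = s.lower().replace("'s", "").replace("_", " ")
        s = re.sub(r"[\(\[].*?[\)\]]", "", s)
        s = s.rstrip(" ")
    return s

def match_score(a, b):
    # character-by-character agreement over the shorter string,
    # as computed by utils.matching_strings_flexible
    if a == "" or b == "":
        return 0.0
    a, b = fix_string(a).replace("'", ""), fix_string(b).replace("'", "")
    n = min(len(a), len(b))
    return sum(a[i] == b[i] for i in range(n)) / n

print(match_score("Bohemian Rhapsody (Remastered 2011)", "Bohemian Rhapsody"))  # 1.0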
midi_emotion/src/data/collate.py ADDED
@@ -0,0 +1,82 @@
1
+ import torch
2
+ import re
3
+ # from torch._six import container_abcs, string_classes, int_classes
4
+ from torch._six import string_classes
5
+ import collections
6
+ """
7
+ Modified by Serkan Sulun
8
+ Filters out None samples
9
+ """
10
+
11
+ """"Contains definitions of the methods used by the _DataLoaderIter workers to
12
+ collate samples fetched from dataset into Tensor(s).
13
+
14
+ These **need** to be in global scope since Py2 doesn't support serializing
15
+ static methods.
16
+ """
17
+
18
+ _use_shared_memory = False
19
+ r"""Whether to use shared memory in batch_collate"""
20
+
21
+ np_str_obj_array_pattern = re.compile(r'[SaUO]')
22
+
23
+ error_msg_fmt = "batch must contain tensors, numbers, dicts or lists; found {}"
24
+
25
+ numpy_type_map = {
26
+ 'float64': torch.DoubleTensor,
27
+ 'float32': torch.FloatTensor,
28
+ 'float16': torch.HalfTensor,
29
+ 'int64': torch.LongTensor,
30
+ 'int32': torch.IntTensor,
31
+ 'int16': torch.ShortTensor,
32
+ 'int8': torch.CharTensor,
33
+ 'uint8': torch.ByteTensor,
34
+ }
35
+
36
+
37
+ def filter_collate(batch):
38
+ r"""Puts each data field into a tensor with outer dimension batch size"""
39
+
40
+ if isinstance(batch, list) or isinstance(batch, tuple):
41
+ batch = [i for i in batch if i is not None] # filter out None s
42
+
43
+ if batch != []:
44
+ elem_type = type(batch[0])
45
+ if isinstance(batch[0], torch.Tensor):
46
+ out = None
47
+ if _use_shared_memory:
48
+ # If we're in a background process, concatenate directly into a
49
+ # shared memory tensor to avoid an extra copy
50
+ numel = sum([x.numel() for x in batch])
51
+ storage = batch[0].storage()._new_shared(numel)
52
+ out = batch[0].new(storage)
53
+ return torch.stack(batch, 0, out=out)
54
+ elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
55
+ and elem_type.__name__ != 'string_':
56
+ elem = batch[0]
57
+ if elem_type.__name__ == 'ndarray':
58
+ # array of string classes and object
59
+ if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
60
+ raise TypeError(error_msg_fmt.format(elem.dtype))
61
+
62
+ return filter_collate([torch.from_numpy(b) for b in batch])
63
+ if elem.shape == (): # scalars
64
+ py_type = float if elem.dtype.name.startswith('float') else int
65
+ return numpy_type_map[elem.dtype.name](list(map(py_type, batch)))
66
+ elif isinstance(batch[0], float):
67
+ return torch.tensor(batch, dtype=torch.float64)
68
+ elif isinstance(batch[0], int):
69
+ return torch.tensor(batch)
70
+ elif isinstance(batch[0], string_classes):
71
+ return batch
72
+ elif isinstance(batch[0], collections.abc.Mapping):
73
+ return {key: filter_collate([d[key] for d in batch]) for key in batch[0]}
74
+ elif isinstance(batch[0], tuple) and hasattr(batch[0], '_fields'): # namedtuple
75
+ return type(batch[0])(*(filter_collate(samples) for samples in zip(*batch)))
76
+ elif isinstance(batch[0], collections.abc.Sequence):
77
+ transposed = zip(*batch)
78
+ return [filter_collate(samples) for samples in transposed]
79
+
80
+ raise TypeError((error_msg_fmt.format(type(batch[0]))))
81
+ else:
82
+ return batch
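filter_collate is meant to be passed as collate_fn so that samples rejected by the loaders (returned as None) are silently dropped from the batch. A hypothetical usage sketch, assuming midi_emotion/src is on sys.path and a torch version that still ships torch._six.string_classes (which this module imports):

import torch
from torch.utils.data import DataLoader, Dataset
from data.collate import filter_collate

class ToyDataset(Dataset):
    # returns None for odd indices to mimic samples the Loader rejects
    def __len__(self):
        return 4
    def __getitem__(self, idx):
        if idx % 2 == 1:
            return None
        return torch.ones(3) * idx

loader = DataLoader(ToyDataset(), batch_size=4, collate_fn=filter_collate)
batch = next(iter(loader))
print(batch.shape)  # torch.Size([2, 3]) -- the two None samples were dropped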
midi_emotion/src/data/data_processing.py ADDED
@@ -0,0 +1,247 @@
1
+ import pypianoroll
2
+ from operator import attrgetter
3
+ import torch
4
+ from copy import deepcopy
5
+ import numpy as np
6
+
7
+ # Forward processing. (Midi to indices)
8
+
9
+ def read_pianoroll(fp, return_tempo=False):
10
+ # Reads pianoroll file and converts to PrettyMidi
11
+ pr = pypianoroll.load(fp)
12
+ mid = pr.to_pretty_midi()
13
+ if return_tempo:
14
+ tempo = np.mean(pr.tempo)
15
+ return mid, tempo
16
+ else:
17
+ return mid
18
+
19
+ def trim_midi(mid_orig, start, end, strict=True):
20
+ """Trims midi file
21
+
22
+ Args:
23
+ mid (PrettyMidi): input midi file
24
+ start (float): start time
25
+ end (float): end time
26
+ strict (bool, optional):
27
+ If false, includes notes that starts earlier than start time,
28
+ and ends later than start time. Or ends later than end time,
29
+ but starts earlier than end time. The start and end times
30
+ are readjusted so they fit into the given boundaries.
31
+ Defaults to True.
32
+
33
+ Returns:
34
+ (PrettyMidi): Trimmed output MIDI.
35
+ """
36
+ eps = 1e-3
37
+ mid = deepcopy(mid_orig)
38
+ for ins in mid.instruments:
39
+ if strict:
40
+ ins.notes = [note for note in ins.notes if note.start >= start and note.end <= end]
41
+ else:
42
+ ins.notes = [note for note in ins.notes \
43
+ if note.end > start + eps and note.start < end - eps]
44
+
45
+ for note in ins.notes:
46
+ if not strict:
47
+ # readjustment
48
+ note.start = max(start, note.start)
49
+ note.end = min(end, note.end)
50
+ # Make the excerpt start at time zero
51
+ note.start -= start
52
+ note.end -= start
53
+ # Filter out empty tracks
54
+ mid.instruments = [ins for ins in mid.instruments if ins.notes]
55
+ return mid
56
+
57
+
58
+ def mid_to_timed_tuples(music, event_sym2idx, min_pitch: int = 21, max_pitch: int = 108):
59
+ # for sorting (though not absolutely necessary)
60
+ on_off_priority = ["ON", "OFF"]
61
+ ins_priority = ["DRUMS", "BASS", "GUITAR", "PIANO", "STRINGS"]
62
+
63
+ on_off_priority = {val: i for i, val in enumerate(on_off_priority)}
64
+ ins_priority = {val: i for i, val in enumerate(ins_priority)}
65
+
66
+ # Add instrument info to notes
67
+ for i, track in enumerate(music.instruments):
68
+ for note in track.notes:
69
+ note.instrument = track.name
70
+
71
+ # Collect notes
72
+ notes = []
73
+ for track in music.instruments:
74
+ notes.extend(track.notes)
75
+
76
+ # Raise an error if no notes is found
77
+ if not notes:
78
+ raise RuntimeError("No notes found.")
79
+
80
+ # Sort the notes
81
+ notes.sort(key=attrgetter("start", "pitch", "duration", "velocity", "instrument"))
82
+
83
+ # Collect note-related events
84
+ note_events = []
85
+
86
+ for note in notes:
87
+ if note.pitch >= min_pitch and note.pitch <= max_pitch:
88
+
89
+ start = round(note.start, 6)
90
+ end = round(note.end, 6)
91
+
92
+ ins = note.instrument.upper()
93
+
94
+ note_events.append((start, on_off_priority["ON"],
95
+ ins_priority[ins], (event_sym2idx["_".join(["ON", ins])], note.pitch)))
96
+ note_events.append((end, on_off_priority["OFF"],
97
+ ins_priority[ins], (event_sym2idx["_".join(["OFF", ins])], note.pitch)))
98
+
99
+ # Sort events by time
100
+ note_events = sorted(note_events)
101
+ note_events = [(note[0], note[-1]) for note in note_events]
102
+ return note_events
103
+
104
+ def timed_tuples_to_tuples(note_events, event_sym2idx, max_timeshift: int = 1000,
105
+ timeshift_step: int = 8):
106
+
107
+ # Create a list for all events
108
+ events = []
109
+ # Initialize the time cursor
110
+ time_cursor = int(round(note_events[0][0] * 1000))
111
+ # Iterate over note events
112
+ for time, symbol in note_events:
113
+ time = int(round(time * 1000))
114
+ if time > time_cursor:
115
+ timeshift = time - time_cursor
116
+ # First split timeshifts longer than max
117
+ n_max = timeshift // max_timeshift
118
+ for _ in range(n_max):
119
+ events.append((event_sym2idx["TIMESHIFT"], max_timeshift))
120
+ # quantize and add remaining
121
+ rem = timeshift % max_timeshift
122
+ if rem > 0:
123
+ # do not round to zero
124
+ rem = int(timeshift_step * round(float(rem) / timeshift_step))
125
+ if rem == 0:
126
+ rem = timeshift_step # do not round to zero
127
+ events.append((event_sym2idx["TIMESHIFT"], rem))
128
+ time_cursor = time
129
+ if symbol[0] != "<": # if not special symbol
130
+ events.append(symbol)
131
+ return events
132
+
133
+
134
+ def list_to_tensor(list_, sym2idx):
135
+ indices = [sym2idx[sym] for sym in list_]
136
+ indices = torch.LongTensor(indices)
137
+ return indices
138
+
139
+
140
+ def mid_to_bars(mid, event_sym2idx):
141
+ """Takes MIDI, extracts bars
142
+ returns ndarray where each row is a token
143
+ each token has two elements,
144
+ first is an index of event, such as DRUMS_OFF, or TIMESHIFT
145
+ second is the value (pitch for note or time for timeshift)
146
+ """
147
+ try:
148
+ bar_times = [round(bar, 6) for bar in mid.get_downbeats()]
149
+ bar_times.append(bar_times[-1] + (bar_times[-1] - bar_times[-2])) # to end
150
+ bar_times.append(bar_times[-1] + (bar_times[-1] - bar_times[-2])) # to end
151
+
152
+ note_events = mid_to_timed_tuples(mid, event_sym2idx)
153
+ i_bar = -1
154
+ i_note = 0
155
+ bars = []
156
+ cur_bar_note_events = []
157
+
158
+ cur_bar_end = -float("inf")
159
+ while i_note < len(note_events):
160
+ time, note = note_events[i_note]
161
+ if time < cur_bar_end:
162
+ cur_bar_note_events.append((time, note))
163
+ i_note += 1
164
+ else:
165
+ cur_bar_note_events.append((cur_bar_end, "<BAR_END>"))
166
+ if len(cur_bar_note_events) > 2:
167
+ events = timed_tuples_to_tuples(cur_bar_note_events, event_sym2idx)
168
+ events = tuples_to_array(events)
169
+ bars.append(events)
170
+ i_bar += 1
171
+ cur_bar_start = bar_times[i_bar]
172
+ cur_bar_end = bar_times[i_bar+1]
173
+ cur_bar_note_events = [(cur_bar_start, "<BAR_START>")]
174
+ except:
175
+ bars = None
176
+ return bars
177
+
178
+ def tuples_to_array(x):
179
+ x = [list(el) for el in x]
180
+ x = np.asarray(x, dtype=np.int16)
181
+ return x
182
+
183
+ def get_maps(min_pitch=21,max_pitch=108,max_timeshift=1000,timeshift_step=8):
184
+ # Get mapping dictionary
185
+ instruments = ["DRUMS", "GUITAR", "BASS", "PIANO", "STRINGS"]
186
+ special_symbols = ["<PAD>", "<START>"]
187
+ on_offs = ["OFF", "ON"]
188
+
189
+ token_syms = deepcopy(special_symbols)
190
+ event_syms = []
191
+ transposable_event_syms = []
192
+
193
+ for ins in instruments:
194
+ for on_off in on_offs:
195
+ event_syms.append(f"{on_off}_{ins}")
196
+ if ins != "DRUMS":
197
+ transposable_event_syms.append(f"{on_off}_{ins}")
198
+ for pitch in range(min_pitch, max_pitch + 1):
199
+ token_syms.append((f"{on_off}_{ins}", pitch))
200
+
201
+ for timeshift in range(timeshift_step, max_timeshift + timeshift_step, timeshift_step):
202
+ token_syms.append(("TIMESHIFT", timeshift))
203
+ event_syms.append("TIMESHIFT")
204
+
205
+ map = {}
206
+
207
+ map["event2idx"] = {sym: idx for idx, sym in enumerate(event_syms)}
208
+ map["idx2event"] = {idx: sym for idx, sym in enumerate(event_syms)}
209
+
210
+ map["tuple2idx"] = {}
211
+ map["idx2tuple"] = {}
212
+ for idx, sym in enumerate(token_syms):
213
+ if isinstance(sym, tuple):
214
+ indexed_tuple = (map["event2idx"][sym[0]], sym[1])
215
+ else:
216
+ indexed_tuple = sym
217
+ map["tuple2idx"][indexed_tuple] = idx
218
+ map["idx2tuple"][idx] = indexed_tuple
219
+
220
+ transposable_event_inds = [map["event2idx"][sym] for sym in transposable_event_syms]
221
+ map["transposable_event_inds"] = transposable_event_inds
222
+ return map
223
+
224
+
225
+ def transpose(x, n, transposable_event_inds, min_pitch = 21, max_pitch = 108):
226
+ # Transpose melody
227
+ for i in range(x.size(0)):
228
+ if x[i, 0].item() in transposable_event_inds and \
229
+ x[i, 1].item() + n <= max_pitch and \
230
+ x[i, 1].item() + n >= min_pitch:
231
+ x[i, 1] += n
232
+ return x
233
+
234
+ def tuples_to_ind_tensor(x, tuple2idx):
235
+ # Tuples to indices
236
+ x = [tuple2idx[el] for el in x]
237
+ x = torch.tensor(x, dtype=torch.int16)
238
+ return x
239
+
240
+ def tensor_to_tuples(x):
241
+ x = [tuple(row.tolist()) for row in x]
242
+ return x
243
+
244
+ def tensor_to_ind_tensor(x, tuple2idx):
245
+ x = tensor_to_tuples(x)
246
+ x = tuples_to_ind_tensor(x, tuple2idx)
247
+ return x
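A hypothetical end-to-end sketch of the forward path above (get_maps → mid_to_bars → tensor_to_ind_tensor), assuming midi_emotion/src is on sys.path; the tiny in-memory MIDI is an illustration only.

import pretty_midi
import torch
from data.data_processing import get_maps, mid_to_bars, tensor_to_ind_tensor

maps = get_maps()

mid = pretty_midi.PrettyMIDI()
mid.time_signature_changes.append(pretty_midi.TimeSignature(4, 4, 0))
# track name must be one of drums/bass/guitar/piano/strings (looked up after upper-casing)
piano = pretty_midi.Instrument(program=0, name="piano")
piano.notes += [
    pretty_midi.Note(velocity=100, pitch=60, start=0.0, end=0.5),
    pretty_midi.Note(velocity=100, pitch=64, start=0.6, end=1.0),
    pretty_midi.Note(velocity=100, pitch=67, start=2.2, end=2.6),  # note in bar 2, so bar 1 gets flushed
]
mid.instruments.append(piano)

bars = mid_to_bars(mid, maps["event2idx"])                  # list of (n_tokens, 2) int16 arrays, one per bar
first_bar = torch.from_numpy(bars[0])                       # (event index, value) pairs
inds = tensor_to_ind_tensor(first_bar, maps["tuple2idx"])   # final integer token indices
print(len(bars), inds[:8])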
midi_emotion/src/data/data_processing_reverse.py ADDED
@@ -0,0 +1,81 @@
1
+ import pretty_midi
2
+ import csv
3
+
4
+ # For reverse processing (TOKENS TO MIDI)
5
+
6
+ def tensor_to_tuples(x):
7
+ x = x.tolist()
8
+ x = [tuple(el) for el in x]
9
+ return x
10
+
11
+
12
+ def tuples_to_mid(x, idx2event, verbose=False):
13
+ # Tuples to midi
14
+ instrument_to_program = {"DRUMS": (0, True), "PIANO": (0, False), "GUITAR": (24, False),
15
+ "BASS": (32, False), "STRINGS": (48, False)}
16
+ velocities = {
17
+ "BASS": 127,
18
+ "DRUMS": 120,
19
+ "GUITAR": 95,
20
+ "PIANO": 110,
21
+ "STRINGS": 85,
22
+ }
23
+
24
+ tracks = {}
25
+ for key, val in instrument_to_program.items():
26
+ track = pretty_midi.Instrument(program=val[0], is_drum=val[1], name=key.lower())
27
+ track.notes = []
28
+ tracks.update({key: track})
29
+
30
+ active_notes = {}
31
+
32
+ time_cursor = 0
33
+ for el in x:
34
+ if el[0] != "<": # if not special token
35
+ event = idx2event[el[0]]
36
+ if "TIMESHIFT" == event:
37
+ timeshift = float(el[1])
38
+ time_cursor += timeshift / 1000.0
39
+ else:
40
+ on_off, instrument = event.split("_")
41
+ pitch = int(el[1])
42
+ if on_off == "ON":
43
+ active_notes.update({(instrument, pitch): time_cursor})
44
+ elif (instrument, pitch) in active_notes:
45
+ start = active_notes[(instrument, pitch)]
46
+ end = time_cursor
47
+ tracks[instrument].notes.append(pretty_midi.Note(velocities[instrument], pitch, start, end))
48
+ elif verbose:
49
+ print("Ignoring {:>15s} {:4} because there was no previos ""ON"" event".format(event, pitch))
50
+
51
+ mid = pretty_midi.PrettyMIDI()
52
+ mid.instruments += tracks.values()
53
+ return mid
54
+
55
+
56
+ def ind_tensor_to_tuples(x, ind2tuple):
57
+ # Indices to tuples
58
+ x = [ind2tuple[el.item()] for el in x]
59
+ return x
60
+
61
+ def tuples_to_str(x, idx2event):
62
+ # Tuples to strings
63
+ str_list = []
64
+ for el in x:
65
+ if el[0] == "<": # special token
66
+ str_list.append(el)
67
+ else:
68
+ str_list.append(idx2event[el[0]] + "_" + str(el[1]))
69
+ return str_list
70
+
71
+ def ind_tensor_to_mid(x, idx2tuple, idx2event, verbose=False):
72
+ # Indices to midi
73
+ x = ind_tensor_to_tuples(x, idx2tuple)
74
+ x = tuples_to_mid(x, idx2event, verbose=verbose)
75
+ return x
76
+
77
+ def ind_tensor_to_str(x, idx2tuple, idx2event):
78
+ # Indices to string
79
+ x = ind_tensor_to_tuples(x, idx2tuple)
80
+ x = tuples_to_str(x, idx2event)
81
+ return x
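The reverse path can be sketched the same way: hand-written events are mapped to indices with the maps from get_maps, then decoded back to a PrettyMIDI with ind_tensor_to_mid (same import assumptions as the sketch above).

import torch
from data.data_processing import get_maps
from data.data_processing_reverse import ind_tensor_to_mid

maps = get_maps()
e2i, t2i = maps["event2idx"], maps["tuple2idx"]

events = [
    (e2i["ON_PIANO"], 60),     # note-on C4 on the piano track
    (e2i["TIMESHIFT"], 496),   # wait 496 ms (timeshifts are multiples of 8 ms, up to 1000 ms)
    (e2i["OFF_PIANO"], 60),    # note-off C4
]
inds = torch.LongTensor([t2i[ev] for ev in events])

mid = ind_tensor_to_mid(inds, maps["idx2tuple"], maps["idx2event"])
piano = [track for track in mid.instruments if track.name == "piano"][0]
print(piano.notes)  # one piano note of roughly half a second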
midi_emotion/src/data/loader.py ADDED
@@ -0,0 +1,206 @@
1
+ import numpy as np
2
+ import random
3
+ import torch
4
+ from data.data_processing import transpose, tensor_to_ind_tensor
5
+ from data.data_processing_reverse import tuples_to_str
6
+ import sys
7
+ sys.path.append("..")
8
+ from utils import get_n_instruments
9
+ import os
10
+
11
+ """
12
+ Main data loader
13
+ """
14
+
15
+ class Loader:
16
+
17
+ def __init__(self, data_folder, data, input_len, conditioning, save_input_dir=None, pad=True,
18
+ use_start_token=True, use_end_token=False, max_transpose=3, n_try=5,
19
+ bar_start_prob=0.5, debug=False, overfit=False, regression=False,
20
+ max_samples=None, min_n_instruments=3, use_cls_token=True,
21
+ always_use_discrete_condition=False):
22
+
23
+ self.data_folder = data_folder
24
+ self.bar_start_prob = bar_start_prob
25
+ self.save_input_dir = save_input_dir
26
+ self.input_len = input_len
27
+ self.n_try = n_try # max number of trials to find suitable sample
28
+ self.min_n_instruments = min_n_instruments
29
+ self.overfit = overfit
30
+ self.one_sample = None
31
+ self.transpose_options = list(range(-max_transpose, max_transpose + 1))
32
+ self.conditioning = conditioning
33
+ self.regression = regression
34
+ self.use_cls_token = use_cls_token
35
+ self.pad = pad
36
+ self.always_use_discrete_condition = always_use_discrete_condition
37
+
38
+ self.pad_token = '<PAD>' if pad else None
39
+ self.start_token = '<START>' if use_start_token else None
40
+ self.end_token = '<END>' if use_end_token else None
41
+ self.cls_token = "<CLS>"
42
+
43
+ if debug or overfit:
44
+ data_folder = data_folder + "_debug"
45
+
46
+ self.data = data
47
+
48
+ data_files = os.listdir(self.data_folder)
49
+ self.data = [sample for sample in self.data if sample["file"] + '.pt' in data_files]
50
+
51
+ maps_file = os.path.join(os.path.abspath(data_folder + "/.."), "maps.pt")
52
+ self.maps = torch.load(maps_file)
53
+
54
+ extra_tokens = []
55
+ if self.conditioning == "continuous_token":
56
+ # two condition tokens will be concatenated later
57
+ self.input_len -= 2
58
+ elif self.conditioning == "discrete_token":
59
+ # add emotion tokens to mappings
60
+ for sample in self.data:
61
+ for label in ["valence", "arousal"]:
62
+ token = sample[label]
63
+ if token not in extra_tokens:
64
+ extra_tokens.append(token)
65
+ extra_tokens = sorted(extra_tokens)
66
+
67
+ if self.regression and self.use_cls_token:
68
+ extra_tokens.append(self.cls_token)
69
+
70
+ if extra_tokens != []:
71
+ # add to maps
72
+ maps_list = list(self.maps["idx2tuple"].values())
73
+ maps_list += extra_tokens
74
+ self.maps["idx2tuple"] = {i: val for i, val in enumerate(maps_list)}
75
+ self.maps["tuple2idx"] = {val: i for i, val in enumerate(maps_list)}
76
+
77
+ if max_samples is not None and not debug and not overfit:
78
+ self.data = self.data[:max_samples]
79
+
80
+ # roughly / 256, but *4 for flexibility. it is later cut anyway
81
+ self.n_bars = max(round(input_len / 256 * 4), 1)
82
+
83
+
84
+ def get_vocab_len(self):
85
+ return len(self.maps["tuple2idx"])
86
+
87
+ def get_maps(self):
88
+ return self.maps
89
+
90
+ def get_pad_idx(self):
91
+ return self.maps["tuple2idx"][self.pad_token]
92
+
93
+ def __len__(self):
94
+ return len(self.data)
95
+
96
+ def __getitem__(self, idx):
97
+
98
+ if not self.overfit or self.one_sample is None:
99
+ data_path = os.path.join(self.data_folder, self.data[idx]["file"] + ".pt")
100
+ item = torch.load(data_path)
101
+ all_bars = item["bars"]
102
+
103
+ n_instruments = 0
104
+ j = 0
105
+ while j < self.n_try and n_instruments < self.min_n_instruments:
106
+ # make sure to have n many instruments
107
+ # choose random bar
108
+ max_bar_start_idx = max(0, len(all_bars) - self.n_bars - 1)
109
+ bar_start_idx = random.randint(0, max_bar_start_idx)
110
+ bar_end_idx = min(len(all_bars), bar_start_idx + self.n_bars)
111
+ bars = all_bars[bar_start_idx:bar_end_idx]
112
+ # flatten
113
+ if bars != []:
114
+ bars = torch.cat(bars, dim=0)
115
+ symbols = tuples_to_str(bars.cpu().numpy(), self.maps["idx2event"])
116
+ n_instruments = get_n_instruments(symbols)
117
+ else:
118
+ n_instruments = 0
119
+
120
+ j += 1
121
+ if n_instruments < self.min_n_instruments:
122
+ return None, None, None
123
+
124
+ # transpose
125
+ if self.transpose_options != []:
126
+ n_transpose = random.choice(self.transpose_options)
127
+ bars = transpose(bars, n_transpose,
128
+ self.maps["transposable_event_inds"])
129
+
130
+ # convert to indices (final input)
131
+ bars = tensor_to_ind_tensor(bars, self.maps["tuple2idx"])
132
+
133
+ # Decide taking the sample from the start of a bar or not
134
+ r = np.random.uniform()
135
+
136
+ start_at_beginning = not (r > self.bar_start_prob and bars.size(0) > self.input_len)
137
+
138
+ if start_at_beginning:
139
+ # starts exactly at bar location
140
+ if self.start_token is not None:
141
+ # add start token
142
+ start_idx = torch.ShortTensor(
143
+ [self.maps["tuple2idx"][self.start_token]])
144
+ bars = torch.cat((start_idx, bars), dim=0)
145
+ else:
146
+ # it doesn't have to start at bar location so shift arbitrarily
147
+ start = np.random.randint(0, bars.size(0)-self.input_len)
148
+ bars = bars[start:start+self.input_len+1]
149
+
150
+ if self.regression and self.use_cls_token:
151
+ # prepend <CLS> token
152
+ cls_idx = torch.ShortTensor(
153
+ [self.maps["tuple2idx"][self.cls_token]])
154
+ bars = torch.cat((cls_idx, bars), 0)
155
+
156
+ # for now, no auxiliary conditions
157
+ condition = torch.FloatTensor([np.nan, np.nan])
158
+ if self.conditioning == "discrete_token" and \
159
+ (start_at_beginning or self.always_use_discrete_condition):
160
+ # add emotion tokens
161
+ valence, arousal = self.data[idx]["valence"], self.data[idx]["arousal"]
162
+ valence = torch.ShortTensor([self.maps["tuple2idx"][valence]])
163
+ arousal = torch.ShortTensor([self.maps["tuple2idx"][arousal]])
164
+ bars = torch.cat((valence, arousal, bars), dim=0)
165
+ elif self.conditioning in ("continuous_token", "continuous_concat") or self.regression:
166
+ # continuous conditions
167
+ condition = torch.FloatTensor([self.data[idx]["valence"], self.data[idx]["arousal"]])
168
+
169
+ bars = bars[:self.input_len + 1] # trim to length, +1 to include target
170
+
171
+ if self.pad_token is not None:
172
+ n_pad = self.input_len + 1 - bars.shape[0]
173
+ if n_pad > 0:
174
+ # pad if necessary
175
+ bars = torch.nn.functional.pad(bars, (0, n_pad), value=self.get_pad_idx())
176
+
177
+ bars = bars.long() # to int32
178
+ input_ = bars[:-1]
179
+
180
+ if self.regression:
181
+ target = None # will use condition as target
182
+ else:
183
+ target = bars[1:]
184
+ if self.conditioning == "continuous_token":
185
+ # pad target from left, because input will get conditions concatenated
186
+ # their sizes should match
187
+ target = torch.nn.functional.pad(target, (condition.size(0), 0), value=self.get_pad_idx())
188
+
189
+ if self.overfit:
190
+ self.one_sample = [input_, condition, target]
191
+ else:
192
+ # sanity check, using one sample repeatedly
193
+ input_, condition, target = self.one_sample
194
+
195
+ return input_, condition, target
196
+
197
+
198
+
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
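A hypothetical wiring sketch for training-time use of Loader; every path below is an assumption and presumes the preprocessing scripts in this commit have already produced the summarized feature CSV, the per-song .pt files, and maps.pt one level above them.

from torch.utils.data import DataLoader
from data.loader import Loader
from data.collate import filter_collate
from data.preprocess_features import preprocess_features

train_data, test_data = preprocess_features(
    "../data_files/features/pianoroll/full_dataset_features_summarized.csv",
    n_bins=None)  # None keeps continuous valence/arousal values

train_set = Loader(
    data_folder="../datasets/lpd_5/lpd_5_full_transposable",  # folder of per-song .pt files; maps.pt sits one level up
    data=train_data,
    input_len=1024,
    conditioning="continuous_concat")

train_loader = DataLoader(train_set, batch_size=8, shuffle=True,
                          collate_fn=filter_collate)
input_, condition, target = next(iter(train_loader))
# input_: token indices, condition: (valence, arousal) pairs, target: inputs shifted by one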
midi_emotion/src/data/loader_exhaustive.py ADDED
@@ -0,0 +1,173 @@
1
+ import numpy as np
2
+ import torch
3
+ from tqdm import tqdm
4
+ from data.data_processing import tensor_to_ind_tensor
5
+ import sys
6
+ sys.path.append("..")
7
+
8
+ import os
9
+
10
+ """
11
+ Loads ALL data for exhaustive evaluation
12
+ """
13
+
14
+ class LoaderExhaustive:
15
+
16
+ def __init__(self, data_folder, data, input_len, conditioning, save_input_dir=None, pad=True,
17
+ use_start_token=True, use_end_token=False, always_use_discrete_condition=False,
18
+ debug=False, overfit=False, regression=False,
19
+ max_samples=None, use_cls_token=True):
20
+
21
+ self.data_folder = data_folder
22
+ self.save_input_dir = save_input_dir
23
+ self.input_len = input_len
24
+ self.overfit = overfit
25
+ self.one_sample = None
26
+ self.conditioning = conditioning
27
+ self.regression = regression
28
+
29
+
30
+ if debug or overfit:
31
+ data_folder = data_folder + "_debug"
32
+
33
+ self.data = data
34
+
35
+ maps_file = os.path.join(data_folder, "maps.pt")
36
+ self.maps = torch.load(maps_file)
37
+
38
+ self.pad_token = '<PAD>' if pad else None
39
+ self.start_token = '<START>' if use_start_token else None
40
+ self.end_token = '<END>' if use_end_token else None
41
+ self.cls_token = "<CLS>"
42
+
43
+
44
+ extra_tokens = []
45
+ if self.conditioning == "continuous_token":
46
+ # two condition tokens will be concatenated later
47
+ self.input_len -= 2
48
+ elif self.conditioning == "discrete_token":
49
+ # two condition tokens will be concatenated later
50
+ self.input_len -= 2
51
+ # add emotion tokens to mappings
52
+ for sample in self.data:
53
+ for label in ["valence", "arousal"]:
54
+ token = sample[label]
55
+ if token not in extra_tokens:
56
+ extra_tokens.append(token)
57
+ extra_tokens = sorted(extra_tokens)
58
+
59
+ if self.regression and use_cls_token:
60
+ extra_tokens.append(self.cls_token)
61
+ self.input_len -= 1 # cls token
62
+
63
+ if self.regression:
64
+ chunk_len = self.input_len
65
+ else:
66
+ # +1 for target
67
+ chunk_len = self.input_len + 1
68
+
69
+ if extra_tokens != []:
70
+ # add to maps
71
+ maps_list = list(self.maps["idx2tuple"].values())
72
+ maps_list += extra_tokens
73
+ self.maps["idx2tuple"] = {i: val for i, val in enumerate(maps_list)}
74
+ self.maps["tuple2idx"] = {val: i for i, val in enumerate(maps_list)}
75
+
76
+ if max_samples is not None and not debug and not overfit:
77
+ self.data = self.data[:max_samples]
78
+
79
+ # Chunk entire data
80
+ chunked_data = []
81
+ print('Constructing data loader...')
82
+ for i in tqdm(range(len(self.data))):
83
+
84
+ data_path = os.path.join(data_folder, "lpd_5_full_transposable", self.data[i]["file"] + ".pt")
85
+ item = torch.load(data_path)
86
+ song = item["bars"]
87
+
88
+ if self.conditioning != 'none' or self.regression:
89
+ valence = self.data[i]["valence"]
90
+ arousal = self.data[i]["arousal"]
91
+
92
+ if self.conditioning in ("continuous_token", "continuous_concat") or self.regression:
93
+ condition = torch.FloatTensor([valence, arousal])
94
+ else:
95
+ condition = torch.FloatTensor([np.nan, np.nan])
96
+
97
+ song = torch.cat(song, 0)
98
+ song = tensor_to_ind_tensor(song, self.maps["tuple2idx"])
99
+ if self.start_token is not None:
100
+ # add start token
101
+ start_idx = torch.ShortTensor(
102
+ [self.maps["tuple2idx"][self.start_token]])
103
+ song = torch.cat((start_idx, song), 0)
104
+
105
+ if self.conditioning == "discrete_token":
106
+ condition_tokens = torch.ShortTensor([
107
+ self.maps["tuple2idx"][valence],
108
+ self.maps["tuple2idx"][arousal]])
109
+ if not always_use_discrete_condition:
110
+ song = torch.cat((condition_tokens, song), 0)
111
+
112
+ # split song into chunks
113
+ song = list(torch.split(song, chunk_len)) # +1 for target
114
+ if song[-1].size(0) != chunk_len:
115
+ song.pop(-1)
116
+
117
+ if self.regression and use_cls_token:
118
+ # prepend <CLS> token
119
+ cls_idx = torch.ShortTensor(
120
+ [self.maps["tuple2idx"][self.cls_token]])
121
+
122
+ song = [torch.cat((cls_idx, x), 0) for x in song]
123
+
124
+ if self.conditioning == "discrete_token" and always_use_discrete_condition:
125
+ song = [torch.cat((condition_tokens, x), 0) for x in song]
126
+
127
+ song = [(x, condition) for x in song]
128
+
129
+ chunked_data += song
130
+
131
+ self.data = chunked_data
132
+ print('Data loader constructed.')
133
+
134
+ def get_vocab_len(self):
135
+ return len(self.maps["tuple2idx"])
136
+
137
+ def get_maps(self):
138
+ return self.maps
139
+
140
+ def get_pad_idx(self):
141
+ return self.maps["tuple2idx"][self.pad_token]
142
+
143
+ def __len__(self):
144
+ return len(self.data)
145
+
146
+ def __getitem__(self, idx):
147
+ chunk, condition = self.data[idx]
148
+ chunk = chunk.long()
149
+
150
+ if self.regression:
151
+ input_ = chunk
152
+ target = None # will use condition as target
153
+ else:
154
+ input_ = chunk[:-1]
155
+ target = chunk[1:]
156
+
157
+ if self.conditioning == "continuous_token":
158
+ # pad target from left, because input will get conditions concatenated
159
+ # their sizes should match
160
+ target = torch.nn.functional.pad(target, (condition.size(0), 0), value=self.get_pad_idx())
161
+
162
+ return input_, condition, target
163
+
164
+
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+
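LoaderExhaustive is wired the same way for evaluation, with one difference worth noting: it expects the dataset root (it appends lpd_5_full_transposable itself and reads maps.pt from that root), whereas Loader takes the token folder directly. A sketch under the same path assumptions as above:

from torch.utils.data import DataLoader
from data.loader_exhaustive import LoaderExhaustive
from data.collate import filter_collate
from data.preprocess_features import preprocess_features

_, test_data = preprocess_features(
    "../data_files/features/pianoroll/full_dataset_features_summarized.csv",
    n_bins=None)

test_set = LoaderExhaustive(
    data_folder="../datasets/lpd_5",   # dataset root, not the token folder
    data=test_data,
    input_len=1024,
    conditioning="continuous_concat")

test_loader = DataLoader(test_set, batch_size=8, shuffle=False,
                         collate_fn=filter_collate)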
midi_emotion/src/data/loader_generations.py ADDED
@@ -0,0 +1,107 @@
1
+ from glob import glob
2
+ import os
3
4
+ import torch
5
+ import sys
6
+ sys.path.append("..")
7
+
8
+ """
9
+ Data loader to perform regression on a folder with generations
10
+ """
11
+
12
+ class LoaderGenerations:
13
+
14
+ def __init__(self, gen_folder, seq_len, pad=True, use_start_token=True, use_end_token=False,
15
+ use_cls_token=True, overlap=0.5):
16
+
17
+ self.seq_len = seq_len
18
+ self.one_sample = None
19
+
20
+ self.pad = pad
21
+
22
+ self.pad_token = '<PAD>' if pad else None
23
+ self.start_token = '<START>' if use_start_token else None
24
+ self.end_token = '<END>' if use_end_token else None
25
+ self.cls_token = "<CLS>" if use_cls_token else None
26
+
27
+ data_paths = glob(os.path.join("../output", gen_folder, "*.pt"), recursive=True)
28
+
29
+ maps = torch.load("../datasets/lpd_5/w_emotion_transposable/maps.pt")
30
+ n_vocab = len(maps["tuple2idx"])
31
+
32
+ self.data = []
33
+
34
+ if self.cls_token is not None:
35
+ seq_len -= 1
36
+ if self.cls_token not in maps["tuple2idx"].keys():
37
+ # add <CLS> token to vocab
38
+ maps["tuple2idx"][self.cls_token] = len(maps["idx2tuple"])
39
+ maps["idx2tuple"][len(maps["idx2tuple"])] = self.cls_token
40
+ # prepend <CLS> token
41
+ cls_idx = torch.ShortTensor(
42
+ [maps["tuple2idx"][self.cls_token]])
43
+
44
+ for data_path in data_paths:
45
+ generation = torch.load(data_path)
46
+ inds = generation["inds"]
47
+ # remove special tokens
48
+ inds = inds[inds < n_vocab]
49
+ # split with overlap
50
+ inds = inds.unfold(0, seq_len, int(seq_len*(1-overlap)))
51
+ inds = list(torch.split(inds, 1, dim=0))
52
+ inds = [sample.squeeze() for sample in inds]
53
+
54
+ if self.cls_token is not None:
55
+ inds = [torch.cat((cls_idx, sample), dim=0) for sample in inds]
56
+
57
+ condition = generation["condition"]
58
+ if inds[-1].size(0) != seq_len:
59
+ inds.pop()
60
+ self.data += [(sample, condition) for sample in inds]
61
+
62
+
63
+ self.discrete2continuous = {
64
+ "-2": -0.8,
65
+ "-1": -0.4,
66
+ "0": 0,
67
+ "1": 0.4,
68
+ "2": 0.8
69
+ }
70
+
71
+
72
+ def get_vocab_len(self):
73
+ return None
74
+
75
+ def get_maps(self):
76
+ return None
77
+
78
+ def get_pad_idx(self):
79
+ return None
80
+
81
+ def __len__(self):
82
+ return len(self.data)
83
+
84
+ def __getitem__(self, idx):
85
+
86
+ input_, condition = self.data[idx]
87
+ if input_.size(0) != self.seq_len:
88
+ Warning(f"Input length is {input_.size(0)}")
89
+ return None, None, None
90
+ if isinstance(condition[0], str):
91
+ condition = condition[:2]
92
+ for i in range(len(condition)):
93
+ condition[i] = self.discrete2continuous[condition[i][2:-1]]
94
+ condition = torch.Tensor(condition)
95
+
96
+ input_ = input_.cpu()
97
+ condition = condition.cpu()
98
+ return input_, condition, None
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
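The __getitem__ above recovers continuous targets from saved discrete condition tokens by slicing off the token decoration and looking the value up in discrete2continuous; a tiny sketch of that parsing:

# A token such as "<V-2>" or "<A1>" is sliced with [2:-1] and mapped back
# to the continuous valence/arousal scale.
discrete2continuous = {"-2": -0.8, "-1": -0.4, "0": 0, "1": 0.4, "2": 0.8}
condition = ["<V-2>", "<A1>"]
values = [discrete2continuous[token[2:-1]] for token in condition]
print(values)  # [-0.8, 0.4]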
midi_emotion/src/data/preprocess_features.py ADDED
@@ -0,0 +1,107 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ def preprocess_features(feature_file, n_bins=None, min_n_instruments=3,
5
+ test_ratio=0.05, outlier_range=1.5, conditional=True,
6
+ use_labeled_only=True):
7
+
8
+ # Preprocess data
9
+ data = pd.read_csv(feature_file)
10
+ mapper = {"valence": "valence", "note_density_per_instrument": "arousal"}
11
+ data = data.rename(columns=mapper)
12
+ columns = data.columns.to_list()
13
+
14
+ # filter out ones with less instruments
15
+ data = data[data["n_instruments"] >= min_n_instruments]
16
+ # filter out ones with zero valence
17
+ data = data[data["valence"] != 0]
18
+
19
+ # filter out outliers
20
+ feature_labels = list(mapper.values())
21
+ outlier_indices = []
22
+ for label in feature_labels:
23
+ series = data[label]
24
+ q1 = series.quantile(0.25)
25
+ q3 = series.quantile(0.75)
26
+ iqr = q3 - q1
27
+ upper_limit = q3 + outlier_range * iqr
28
+ lower_limit = q1 - outlier_range * iqr
29
+
30
+ outlier_indices += series[series < lower_limit].index.to_list()
31
+ outlier_indices += series[series > upper_limit].index.to_list()
32
+ data.drop(outlier_indices, inplace=True)
33
+
34
+ # shift and scale features between -1 and 1
35
+ for label in feature_labels:
36
+ series = data[label]
37
+ min_ = series.min()
38
+ max_ = series.max()
39
+
40
+ data[label] = (data[label] - min_) / (max_ - min_) * 2 - 1
41
+
42
+ if n_bins is not None:
43
+ # digitize into bins using quantiles
44
+ quantile_indices = np.linspace(0, 1, n_bins+1)
45
+ for label in feature_labels:
46
+
47
+ # create token labels
48
+ if n_bins % 2 == 0:
49
+ bin_ids = list(range(-n_bins//2, 0)) + list(range(1, n_bins//2+1))
50
+ else:
51
+ bin_ids = list(range(-(n_bins-1)//2, (n_bins-1)//2 + 1))
52
+ token_labels = ["<{}{}>".format(label[0].upper(), bin_id) \
53
+ for bin_id in bin_ids]
54
+ # additional label for NaN (missing) values: <V>
55
+ token_labels.append(None) # to handle NaNs
56
+
57
+ series = data[label]
58
+ quantiles = [series.quantile(q) for q in quantile_indices]
59
+ quantiles[-1] += 1e-6
60
+ series = series.to_numpy()
61
+ series_digitized = np.digitize(series, quantiles)
62
+ series_tokenized = [token_labels[i-1] for i in series_digitized]
63
+
64
+ data[label] = series_tokenized
65
+ else:
66
+ # convert NaN into None
67
+ data = data.where(pd.notnull(data), None)
68
+
69
+ # Create train and test splits
70
+ matched = data[data["is_matched"]]
71
+ unmatched = data[~data["is_matched"]]
72
+
73
+ # reserve a portion of matched data for testing
74
+ matched = matched.sort_values("file")
75
+ matched = matched.reset_index(drop=True)
76
+ n_test_samples = round(len(matched) * test_ratio)
77
+
78
+ test_split = matched.loc[len(matched)-n_test_samples:len(matched)]
79
+
80
+ train_split = matched.loc[:len(matched)-n_test_samples]
81
+
82
+ if not use_labeled_only:
83
+ train_split = pd.concat([train_split, unmatched])
84
+ train_split = train_split.sort_values("file").reset_index(drop=True)
85
+
86
+ splits = [train_split, test_split]
87
+
88
+ # summarize
89
+ columns_to_drop = [col for col in columns if col not in ["file", "valence", "arousal"]]
90
+ if not conditional:
91
+ columns_to_drop += ["valence", "arousal"]
92
+
93
+ # filter data so all features are valid (not None = matched data)
94
+ for label in feature_labels:
95
+ # test split has to be identical across vanilla and conditional models
96
+ splits[1] = splits[1][~splits[1][label].isnull()]
97
+
98
+ # filter train split only for conditional models
99
+ if use_labeled_only:
100
+ splits[0] = splits[0][~splits[0][label].isnull()]
101
+
102
+ for i in range(len(splits)):
103
+ # summarize
104
+ splits[i] = splits[i].drop(columns=columns_to_drop, errors="ignore")
105
+ splits[i] = splits[i].to_dict("records")
106
+
107
+ return splits
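A toy illustration of the n_bins branch above: values are cut at their own quantiles and mapped to symmetric token labels such as <V-2> … <V2> (arousal gets <A…> labels the same way).

import numpy as np
import pandas as pd

n_bins = 5
values = pd.Series([-0.9, -0.3, 0.0, 0.2, 0.8])
bin_ids = list(range(-(n_bins - 1) // 2, (n_bins - 1) // 2 + 1))   # [-2, -1, 0, 1, 2]
token_labels = [f"<V{b}>" for b in bin_ids]

quantiles = [values.quantile(q) for q in np.linspace(0, 1, n_bins + 1)]
quantiles[-1] += 1e-6                                              # keep the max inside the last bin
digitized = np.digitize(values.to_numpy(), quantiles)              # bin indices 1..n_bins
print([token_labels[i - 1] for i in digitized])
# ['<V-2>', '<V-1>', '<V0>', '<V1>', '<V2>']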
midi_emotion/src/data/preprocess_pianorolls.py ADDED
@@ -0,0 +1,82 @@
1
+ import json
2
+ from data_processing import read_pianoroll, mid_to_bars, get_maps
3
+ import torch
4
+ import pandas as pd
5
+ from tqdm import tqdm
6
+ from concurrent.futures import ProcessPoolExecutor
7
+ import time
8
+ from functools import partial
9
+ import os
10
+
11
+ """ Preprocessing Lakh MIDI pianoroll dataset.
12
+ Divides into bars. Encodes into tuples. Makes transposing easier. """
13
+
14
+ def run(f, my_iter):
15
+ with ProcessPoolExecutor(max_workers=16) as executor:
16
+ results = list(tqdm(executor.map(f, my_iter), total=len(my_iter)))
17
+ return results
18
+
19
+ def get_emotion_dict(path):
20
+ table = pd.read_csv(path)
21
+ table = table.to_dict(orient="records")
22
+ table = {item["path"].split("/")[-2]: \
23
+ {"valence": item["valence"], "energy": item["energy"], "tempo": item["tempo"]} \
24
+ for item in table}
25
+ return table
26
+
27
+ def process(pr_path, event_sym2idx):
28
+ time.sleep(0.001)
29
+ mid = read_pianoroll(pr_path)
30
+
31
+ bars = mid_to_bars(mid, event_sym2idx)
32
+
33
+ file_ = pr_path.split("/")[-1]
34
+
35
+ item_data = {
36
+ "file": file_,
37
+ "bars": bars,
38
+ }
39
+
40
+ return item_data
41
+
42
+ def main():
43
+
44
+ main_dir = "../../data_files/lpd_5"
45
+ input_dir = "../../data_files/lpd_5/lpd_5_full"
46
+ unique_pr_list_file = "../../data_files/features/pianoroll/unique_files.json"
47
+
48
+ output_dir = os.path.join(main_dir, "lpd_5_full_transposable")
49
+
50
+ os.makedirs(output_dir, exist_ok=True)
51
+ output_maps_path = os.path.join(main_dir, "maps.pt")
52
+
53
+ with open(unique_pr_list_file, "r") as f:
54
+ pr_paths = json.load(f)
55
+
56
+ pr_paths = [os.path.join(input_dir, pr_path[0], pr_path + ".npz") for pr_path in pr_paths]
57
+
58
+ maps = get_maps()
59
+
60
+ func = partial(process, event_sym2idx=maps["event2idx"])
61
+
62
+ os.makedirs(output_dir, exist_ok=True)
63
+
64
+ x = run(func, pr_paths)
65
+ x = [item for item in x if item["bars"] is not None]
66
+ for i in tqdm(range(len(x))):
67
+ for j in range(len(x[i]["bars"])):
68
+ x[i]["bars"][j] = torch.from_numpy(x[i]["bars"][j])
69
+ fname = x[i]["file"]
70
+ output_path = os.path.join(output_dir, fname.replace(".npz", ".pt"))
71
+ torch.save(x[i], output_path)
72
+
73
+ torch.save(maps, output_maps_path)
74
+
75
+
76
+ if __name__ == "__main__":
77
+ main()
78
+
79
+
80
+
81
+
82
+
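A hypothetical single-file check for this preprocessing script, run from midi_emotion/src/data (the script uses bare `from data_processing import ...` imports); the .npz path is a placeholder for any LPD-5 pianoroll file.

from data_processing import get_maps
from preprocess_pianorolls import process

maps = get_maps()
# Placeholder path: LPD-5 files live in a subfolder named after their first character.
item = process("../../data_files/lpd_5/lpd_5_full/S/SOME_ID.npz",
               event_sym2idx=maps["event2idx"])
print(item["file"], None if item["bars"] is None else len(item["bars"]))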
midi_emotion/src/generate.py ADDED
@@ -0,0 +1,403 @@
1
+ from argparse import ArgumentParser
2
+ from copy import deepcopy
3
+ import os
4
+ import sys
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import datetime
9
+ from tqdm import tqdm
10
+ from .utils import get_n_instruments
11
+ from .models.build_model import build_model
12
+ from .data.data_processing_reverse import ind_tensor_to_mid, ind_tensor_to_str
13
+
14
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
15
+
16
+ def chunks(lst, n):
17
+ """Yield successive n-sized chunks from lst."""
18
+ for i in range(0, len(lst), n):
19
+ yield lst[i:i + n]
20
+
21
+ def generate(model, maps, device, out_dir, conditioning, short_filename=False,
22
+ penalty_coeff=0.5, discrete_conditions=None, continuous_conditions=None,
23
+ max_input_len=1024, amp=True, step=None,
24
+ gen_len=2048, temperatures=[1.2,1.2], top_k=-1,
25
+ top_p=0.7, debug=False, varying_condition=None, seed=-1,
26
+ verbose=False, primers=[["<START>"]], min_n_instruments=2):
27
+
28
+ if not debug:
29
+ os.makedirs(out_dir, exist_ok=True)
30
+
31
+ model = model.to(device)
32
+ model.eval()
33
+
34
+ assert len(temperatures) in (1, 2)
35
+
36
+ if varying_condition is not None:
37
+ batch_size = varying_condition[0].size(0)
38
+ else:
39
+ try:
40
+ continuous_conditions = torch.FloatTensor(continuous_conditions).to(device)
41
+ except:
42
+ continuous_conditions = None
43
+ if conditioning == "none":
44
+ batch_size = len(primers)
45
+ elif conditioning == "discrete_token":
46
+ assert discrete_conditions is not None
47
+ discrete_conditions_tensor = [[maps["tuple2idx"][symbol] for symbol in condition_sample] \
48
+ for condition_sample in discrete_conditions]
49
+ discrete_conditions_tensor = torch.LongTensor(discrete_conditions_tensor).t().to(device)
50
+ batch_size = discrete_conditions_tensor.size(1)
51
+
52
+ elif conditioning in ("continuous_token", "continuous_concat"):
53
+ batch_size = len(continuous_conditions)
54
+
55
+ # will be used to penalize repeats
56
+ repeat_counts = [0 for _ in range(batch_size)]
57
+
58
+ exclude_symbols = [symbol for symbol in maps["tuple2idx"].keys() if symbol[0] == "<"]
59
+
60
+ # will have generated symbols and indices
61
+ gen_song_tensor = torch.LongTensor([]).to(device)
62
+
63
+ if not isinstance(primers, list):
64
+ primers = [[primers]]
65
+ primer_inds = [[maps["tuple2idx"][symbol] for symbol in primer] \
66
+ for primer in primers]
67
+
68
+ gen_inds = torch.LongTensor(primer_inds)
69
+
70
+ null_conditions_tensor = torch.FloatTensor([np.nan, np.nan]).to(device)
71
+
72
+ if len(primers) == 1:
73
+ gen_inds = gen_inds.repeat(batch_size, 1)
74
+ null_conditions_tensor = null_conditions_tensor.repeat(batch_size, 1)
75
+
76
+ if conditioning == "continuous_token":
77
+ max_input_len -= 2
78
+ conditions_tensor = continuous_conditions
79
+ elif conditioning == "continuous_concat":
80
+ conditions_tensor = continuous_conditions
81
+ elif conditioning == "discrete_token":
82
+ max_input_len -= discrete_conditions_tensor.size(0)
83
+ conditions_tensor = null_conditions_tensor
84
+ else:
85
+ conditions_tensor = null_conditions_tensor
86
+
87
+ if varying_condition is not None:
88
+ varying_condition[0] = varying_condition[0].to(device)
89
+ varying_condition[1] = varying_condition[1].to(device)
90
+
91
+ gen_inds = gen_inds.t().to(device)
92
+
93
+ with torch.no_grad():
94
+ pbar = tqdm(total=gen_len, desc="Generating tokens", leave=True)
95
+ i = 0
96
+ while i < gen_len:
97
+ i += 1
98
+ pbar.update(1)
99
+
100
+ gen_song_tensor = torch.cat((gen_song_tensor, gen_inds), 0)
101
+
102
+ input_ = gen_song_tensor
103
+ if len(gen_song_tensor) > max_input_len:
104
+ input_ = input_[-max_input_len:, :]
105
+
106
+ if conditioning == "discrete_token":
107
+ # concat with conditions
108
+ input_ = torch.cat((discrete_conditions_tensor, input_), 0)
109
+
110
+ # INTERPOLATED CONDITIONS
111
+ if varying_condition is not None:
112
+ valences = varying_condition[0][:, i-1]
113
+ arousals = varying_condition[1][:, i-1]
114
+ conditions_tensor = torch.cat([valences[:, None], arousals[:, None]], dim=-1)
115
+
116
+ # Run model
117
+ with torch.cuda.amp.autocast(enabled=amp):
118
+ input_ = input_.t()
119
+ output = model(input_, conditions_tensor)
120
+ output = output.permute((1, 0, 2))
121
+
122
+ # Process output, get predicted token
123
+ output = output[-1, :, :] # Select last timestep
124
+ output[output != output] = 0 # zeroing nans
125
+
126
+ if torch.all(output == 0) and verbose:
127
+ # if everything becomes zero
128
+ print("All predictions were NaN during generation")
129
+ output = torch.ones(output.shape).to(device)
130
+
131
+ # exclude certain symbols
132
+ for symbol_exclude in exclude_symbols:
133
+ try:
134
+ idx_exclude = maps["tuple2idx"][symbol_exclude]
135
+ output[:, idx_exclude] = -float("inf")
136
+ except KeyError:
137
+ pass
138
+
139
+ effective_temps = []
140
+ for j in range(batch_size):
141
+ gen_idx = gen_inds[0, j].item()
142
+ gen_tuple = maps["idx2tuple"][gen_idx]
143
+ effective_temp = temperatures[1]
144
+ if isinstance(gen_tuple, tuple):
145
+ gen_event = maps["idx2event"][gen_tuple[0]]
146
+ if "TIMESHIFT" in gen_event:
147
+ # switch from rest temperature to note temperature
148
+ effective_temp = temperatures[0]
149
+ effective_temps.append(effective_temp)
150
+
151
+ temp_tensor = torch.Tensor([effective_temps]).to(device)
152
+
153
+ output = F.log_softmax(output, dim=-1)
154
+
155
+ # Add repeat penalty to temperature
156
+ if penalty_coeff > 0:
157
+ repeat_counts_array = torch.Tensor(repeat_counts).to(device)
158
+ temp_multiplier = torch.maximum(torch.zeros_like(repeat_counts_array, device=device),
159
+ torch.log((repeat_counts_array+1)/4)*penalty_coeff)
160
+ repeat_penalties = temp_multiplier * temp_tensor
161
+ temp_tensor += repeat_penalties
162
+
163
+ # Apply temperature
164
+ output /= temp_tensor.t()
165
+
166
+ # top-k
167
+ if top_k <= 0 or top_k > output.size(-1):
168
+ top_k_eff = output.size(-1)
169
+ else:
170
+ top_k_eff = top_k
171
+ output, top_inds = torch.topk(output, top_k_eff)
172
+
173
+ # top-p
174
+ if top_p > 0 and top_p < 1:
175
+ cumulative_probs = torch.cumsum(F.softmax(output, dim=-1), dim=-1)
176
+ remove_inds = cumulative_probs > top_p
177
+ remove_inds[:, 0] = False # at least keep top value
178
+ output[remove_inds] = -float("inf")
179
+
180
+ output = F.softmax(output, dim=-1)
181
+
182
+ # Sample from probabilities
183
+ inds_sampled = torch.multinomial(output, 1, replacement=True)
184
+ gen_inds = top_inds.gather(1, inds_sampled).t()
185
+
186
+ # Update repeat counts
187
+ num_choices = torch.sum((output > 0).int(), -1)
188
+ for j in range(batch_size):
189
+ if num_choices[j] <= 2: repeat_counts[j] += 1
190
+ else: repeat_counts[j] = repeat_counts[j] // 2
191
+
192
+ pbar.close()
193
+
194
+ # Convert to midi and save
195
+ print("\nConverting to MIDI...")
196
+
197
+ # If a sample has fewer than min_n_instruments, queue its condition so generation is repeated
198
+ redo_primers, redo_discrete_conditions, redo_continuous_conditions = [], [], []
199
+ for i in range(gen_song_tensor.size(-1)):
200
+ if short_filename:
201
+ out_file_path = f"{i}"
202
+ else:
203
+ if step is None:
204
+ now = datetime.datetime.now()
205
+ out_file_path = now.strftime("%Y_%m_%d_%H_%M_%S")
206
+ else:
207
+ out_file_path = step
208
+
209
+ out_file_path += f"_{i}"
210
+
211
+ if seed > 0:
212
+ out_file_path += f"_s{seed}"
213
+
214
+ if continuous_conditions is not None:
215
+ condition = continuous_conditions[i, :].tolist()
216
+ # convert to string
217
+ condition = [str(round(c, 2)).replace(".", "") for c in condition]
218
+ out_file_path += f"_V{condition[0]}_A{condition[1]}"
219
+
220
+ out_file_path += ".mid"
221
+ out_path_mid = os.path.join(out_dir, out_file_path)
222
+
223
+ symbols = ind_tensor_to_str(gen_song_tensor[:, i], maps["idx2tuple"], maps["idx2event"])
224
+ n_instruments = get_n_instruments(symbols)
225
+
226
+ if n_instruments >= min_n_instruments:
227
+ mid = ind_tensor_to_mid(gen_song_tensor[:, i], maps["idx2tuple"], maps["idx2event"], verbose=False)
228
+ out_path_txt = "txt_" + out_file_path.replace(".mid", ".txt")
229
+ out_path_txt = os.path.join(out_dir, out_path_txt)
230
+ out_path_inds = "inds_" + out_file_path.replace(".mid", ".pt")
231
+ out_path_inds = os.path.join(out_dir, out_path_inds)
232
+
233
+ if not debug:
234
+ mid.write(out_path_mid)
235
+ if verbose:
236
+ print(f"Saved to {out_path_mid}")
237
+ else:
238
+ print(f"Only has {n_instruments} instruments, not saving.")
239
+ if conditioning == "none":
240
+ redo_primers.append(primers[i])
241
+ redo_discrete_conditions = None
242
+ redo_continuous_conditions = None
243
+ elif conditioning == "discrete_token":
244
+ redo_discrete_conditions.append(discrete_conditions[i])
245
+ redo_continuous_conditions = None
246
+ redo_primers = primers
247
+ else:
248
+ redo_discrete_conditions = None
249
+ redo_continuous_conditions.append(continuous_conditions[i, :].tolist())
250
+ redo_primers = primers
251
+
252
+ return redo_primers, redo_discrete_conditions, redo_continuous_conditions
253
+
254
+
255
+ if __name__ == '__main__':
256
+ script_dir = os.path.dirname(os.path.abspath(__file__))
257
+ code_model_dir = os.path.abspath(os.path.join(script_dir, 'model'))
258
+ code_utils_dir = os.path.join(code_model_dir, 'utils')
259
+ sys.path.extend([code_model_dir, code_utils_dir])
260
+
261
+ parser = ArgumentParser()
262
+
263
+ parser.add_argument('--model_dir', type=str, help='Directory with model', required=True)
264
+ parser.add_argument('--no_cuda', action='store_true', help="Use CPU")
265
+ parser.add_argument('--num_runs', type=int, help='Number of runs', default=1)
266
+ parser.add_argument('--gen_len', type=int, help='Max generation len', default=4096)
267
+ parser.add_argument('--max_input_len', type=int, help='Max input len', default=1216)
268
+ parser.add_argument('--temp', type=float, nargs='+', help='Generation temperature', default=[1.2, 1.2])
269
+ parser.add_argument('--topk', type=int, help='Top-k sampling', default=-1)
270
+ parser.add_argument('--topp', type=float, help='Top-p sampling', default=0.7)
271
+ parser.add_argument('--debug', action='store_true', help="Do not save anything")
272
+ parser.add_argument('--seed', type=int, default=0, help="Random seed")
273
+ parser.add_argument('--no_amp', action='store_true', help="Disable automatic mixed precision")
274
+ parser.add_argument("--conditioning", type=str, required=True,
275
+ choices=["none", "discrete_token", "continuous_token",
276
+ "continuous_concat"], help='Conditioning type')
277
+ parser.add_argument('--penalty_coeff', type=float, default=0.5,
278
+ help="Coefficient for penalizing repeating notes")
279
+ parser.add_argument("--quiet", action='store_true', help="Not verbose")
280
+ parser.add_argument("--short_filename", action='store_true')
281
+ parser.add_argument('--batch_size', type=int, help='Batch size', default=4)
282
+ parser.add_argument('--min_n_instruments', type=int, help='Minimum number of instruments', default=1)
283
+ parser.add_argument('--valence', type=float, help='Conditioning valence value', default=[None], nargs='+')
284
+ parser.add_argument('--arousal', type=float, help='Conditioning arousal value', default=[None], nargs='+')
285
+ parser.add_argument("--batch_gen_dir", type=str, default="")
286
+
287
+ args = parser.parse_args()
288
+
289
+ assert len(args.valence) == len(args.arousal), "Lengths of valence and arousal must be equal"
290
+ assert (args.conditioning == "none") == (args.valence == [None] or args.arousal == [None]), \
291
+ "If conditioning is used, specify valence and arousal; if not, don't"
292
+
293
+ if args.seed > 0:
294
+ torch.manual_seed(args.seed)
295
+ torch.cuda.manual_seed(args.seed)
296
+
297
+ main_output_dir = "../output"
298
+ assert os.path.exists(os.path.join(main_output_dir, args.model_dir))
299
+ midi_output_dir = os.path.join(main_output_dir, args.model_dir, "generations", "inference")
300
+
301
+ new_dir = ""
302
+ if args.batch_gen_dir != "":
303
+ new_dir = new_dir + "_" + args.batch_gen_dir
304
+ if new_dir != "":
305
+ midi_output_dir = os.path.join(midi_output_dir, new_dir)
306
+ if not args.debug:
307
+ os.makedirs(midi_output_dir, exist_ok=True)
308
+
309
+ model_fp = os.path.join(main_output_dir, args.model_dir, 'model.pt')
310
+ mappings_fp = os.path.join(main_output_dir, args.model_dir, 'mappings.pt')
311
+ config_fp = os.path.join(main_output_dir, args.model_dir, 'model_config.pt')
312
+
313
+ if os.path.exists(mappings_fp):
314
+ maps = torch.load(mappings_fp)
315
+ else:
316
+ raise ValueError("Mapping file not found.")
317
+
318
+ start_symbol = "<START>"
319
+ n_emotion_bins = 5
320
+ valence_symbols, arousal_symbols = [], []
321
+
322
+ emotion_bins = np.linspace(-1-1e-12, 1+1e-12, num=n_emotion_bins+1)
323
+ if n_emotion_bins % 2 == 0:
324
+ bin_ids = list(range(-n_emotion_bins//2, 0)) + list(range(1, n_emotion_bins//2+1))
325
+ else:
326
+ bin_ids = list(range(-(n_emotion_bins-1)//2, (n_emotion_bins-1)//2 + 1))
327
+
328
+ for bin_id in bin_ids:
329
+ valence_symbols.append(f"<V{bin_id}>")
330
+ arousal_symbols.append(f"<A{bin_id}>")
331
+
332
+ device = torch.device('cuda' if not args.no_cuda and torch.cuda.is_available() else 'cpu')
333
+
334
+ verbose = not args.quiet
335
+ if verbose:
336
+ if device == torch.device("cuda"):
337
+ print("Using GPU")
338
+ else:
339
+ print("Using CPU")
340
+
341
+ # Load model
342
+ config = torch.load(config_fp)
343
+ model, _ = build_model(None, load_config_dict=config)
344
+ model = model.to(device)
345
+ if os.path.exists(model_fp):
346
+ model.load_state_dict(torch.load(model_fp, map_location=device))
347
+ elif os.path.exists(model_fp.replace("best_", "")):
348
+ model.load_state_dict(torch.load(model_fp.replace("best_", ""), map_location=device))
349
+ else:
350
+ raise ValueError("Model not found")
351
+
352
+ # Process conditions
353
+ null_condition = torch.FloatTensor([np.nan, np.nan]).to(device)
354
+
355
+ varying_condition = None
356
+ label_conditions = None
357
+
358
+ conditions = []
359
+ if args.valence == [None]:
360
+ conditions = None
361
+ elif len(args.valence) == 1:
362
+ for _ in range(args.batch_size):
363
+ conditions.append([args.valence[0], args.arousal[0]])
364
+ else:
365
+ for i in range(len(args.valence)):
366
+ conditions.append([args.valence[i], args.arousal[i]])
367
+
368
+ primers = [["<START>"]]
369
+ continuous_conditions = conditions
370
+ if args.conditioning == "discrete_token":
371
+
372
+ discrete_conditions = []
373
+ for condition in conditions:
374
+ valence_val, arousal_val = condition
375
+ valence_symbol = valence_symbols[np.searchsorted(
376
+ emotion_bins, valence_val, side="right") - 1]
377
+ arousal_symbol = arousal_symbols[np.searchsorted(
378
+ emotion_bins, arousal_val, side="right") - 1]
379
+ discrete_conditions.append([valence_symbol, arousal_symbol])
380
+
381
+ conditions = null_condition
382
+
383
+ elif args.conditioning == "none":
384
+ discrete_conditions = None
385
+ primers = [["<START>"] for _ in range(args.batch_size)]
386
+
387
+ elif args.conditioning in ["continuous_token", "continuous_concat"]:
388
+ primers = [["<START>"]]
389
+ discrete_conditions = None
390
+
391
+ for i in range(args.num_runs):
392
+ primers_run = deepcopy(primers)
393
+ discrete_conditions_run = deepcopy(discrete_conditions)
394
+ continuous_conditions_run = deepcopy(continuous_conditions)
395
+ while not (primers_run == [] or discrete_conditions_run == [] or continuous_conditions_run == []):
396
+ primers_run, discrete_conditions_run, continuous_conditions_run = generate(
397
+ model, maps, device,
398
+ midi_output_dir, args.conditioning, discrete_conditions=discrete_conditions_run,
399
+ min_n_instruments=args.min_n_instruments,continuous_conditions=continuous_conditions_run,
400
+ penalty_coeff=args.penalty_coeff, short_filename=args.short_filename, top_p=args.topp,
401
+ gen_len=args.gen_len, max_input_len=args.max_input_len,
402
+ amp=not args.no_amp, primers=primers_run, temperatures=args.temp, top_k=args.topk,
403
+ debug=args.debug, verbose=not args.quiet, seed=args.seed)
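Aside (not part of the committed file): the inference script above buckets continuous valence/arousal values into discrete emotion tokens with np.searchsorted over evenly spaced bin edges. A minimal, self-contained sketch of that binning, assuming the same 5-bin setup:

import numpy as np

n_emotion_bins = 5
emotion_bins = np.linspace(-1 - 1e-12, 1 + 1e-12, num=n_emotion_bins + 1)
bin_ids = list(range(-(n_emotion_bins - 1) // 2, (n_emotion_bins - 1) // 2 + 1))
valence_symbols = [f"<V{b}>" for b in bin_ids]
arousal_symbols = [f"<A{b}>" for b in bin_ids]

def to_symbols(valence, arousal):
    # searchsorted(..., side="right") - 1 gives the bin each value falls into
    v_idx = np.searchsorted(emotion_bins, valence, side="right") - 1
    a_idx = np.searchsorted(emotion_bins, arousal, side="right") - 1
    return valence_symbols[v_idx], arousal_symbols[a_idx]

print(to_symbols(-0.8, 0.8))  # ('<V-2>', '<A2>')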
midi_emotion/src/models/build_model.py ADDED
@@ -0,0 +1,48 @@
1
+ import torch.nn as nn
2
+ def set_dropout(model, rate):
3
+ for name, child in model.named_children():
4
+ if isinstance(child, nn.Dropout):
5
+ child.p = rate
6
+ set_dropout(child, rate)
7
+ return model
8
+
9
+ def build_model(args, load_config_dict=None):
10
+
11
+ if load_config_dict is not None:
12
+ args = load_config_dict
13
+
14
+ config = {
15
+ "vocab_size": args["vocab_size"],
16
+ "num_layer": args["n_layer"],
17
+ "num_head": args["n_head"],
18
+ "embedding_dim": args["d_model"],
19
+ "d_inner": args["d_inner"],
20
+ "dropout": args["dropout"],
21
+ "d_condition": args["d_condition"],
22
+ "max_seq": 2048,
23
+ "pad_token": 0,
24
+ }
25
+
26
+ if "regression" not in args:
27
+ args["regression"] = False
28
+
29
+ if args["regression"]:
30
+ config["output_size"] = 2
31
+ from models.music_regression \
32
+ import MusicRegression as MusicTransformer
33
+
34
+ elif args["conditioning"] == "continuous_token":
35
+ from models.music_continuous_token \
36
+ import MusicTransformerContinuousToken as MusicTransformer
37
+ del config["d_condition"]
38
+ else:
39
+ from .music_multi \
40
+ import MusicTransformerMulti as MusicTransformer
41
+
42
+ model = MusicTransformer(**config)
43
+ if load_config_dict is not None and args is not None:
44
+ if args["overwrite_dropout"]:
45
+ model = set_dropout(model, args["dropout"])
46
+ rate = args["dropout"]
47
+ print(f"Dropout rate changed to {rate}")
48
+ return model, args
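Aside (not part of the committed file): build_model selects the model class from a saved config dict. A hypothetical usage sketch; the key names mirror those read above, but the concrete values are invented for illustration:

example_config = {
    "vocab_size": 400,          # size of the event vocabulary (made up here)
    "n_layer": 8,
    "n_head": 8,
    "d_model": 512,
    "d_inner": 2048,
    "dropout": 0.1,
    "d_condition": 192,         # > 0 selects the continuous_concat model
    "conditioning": "continuous_concat",
    "regression": False,
    "overwrite_dropout": False,
}
# model, args = build_model(None, load_config_dict=example_config)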
midi_emotion/src/models/music_continuous_token.py ADDED
@@ -0,0 +1,275 @@
1
+ import torch
2
+ import math as m
3
+ import numpy as np
4
+ import math
5
+ import torch.nn.functional as F
6
+
7
+ """
8
+ MUSIC TRANSFORMER
9
+
10
+ CONTINUOUS TOKEN
11
+ Takes continuous conditions separately, embeds them and
12
+ then inserts them before the embedded sequence
13
+ Hence, they are like continuous tokens
14
+ """
15
+
16
+ def generate_mask(x, pad_token=None, batch_first=True):
17
+
18
+ batch_size = x.size(0)
19
+ seq_len = x.size(1)
20
+
21
+ subsequent_mask = torch.logical_not(torch.triu(torch.ones(seq_len, seq_len, device=x.device)).t()).unsqueeze(
22
+ -1).repeat(1, 1, batch_size)
23
+ pad_mask = x == pad_token
24
+ if batch_first:
25
+ pad_mask = pad_mask.t()
26
+ mask = torch.logical_or(subsequent_mask, pad_mask)
27
+ if batch_first:
28
+ mask = mask.permute(2, 0, 1)
29
+ return mask
30
+
31
+
32
+ class MusicTransformerContinuousToken(torch.nn.Module):
33
+ def __init__(self, embedding_dim=None, d_inner=None, vocab_size=None, num_layer=None, num_head=None,
34
+ max_seq=None, dropout=None, pad_token=None, has_start_token=True, n_conditions=2,
35
+ ):
36
+ super().__init__()
37
+
38
+ self.max_seq = max_seq
39
+ self.num_layer = num_layer
40
+ self.embedding_dim = embedding_dim
41
+ self.vocab_size = vocab_size
42
+
43
+ self.pad_token = pad_token
44
+ self.has_start_token = has_start_token
45
+ self.n_conditions = n_conditions
46
+
47
+
48
+ self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
49
+ embedding_dim=self.embedding_dim,
50
+ padding_idx=pad_token)
51
+
52
+ # two vectors for two types of emotion (valence, energy/tempo)
53
+ # just like token embedding
54
+ self.fc_condition = torch.nn.ModuleList([torch.nn.Linear(1, self.embedding_dim) \
55
+ for _ in range(self.n_conditions)])
56
+
57
+ self.pos_encoding = DynamicPositionEmbedding(self.embedding_dim, max_seq=max_seq)
58
+
59
+ self.enc_layers = torch.nn.ModuleList(
60
+ [EncoderLayer(embedding_dim, d_inner, dropout, h=num_head, additional=False, max_seq=max_seq)
61
+ for _ in range(num_layer)])
62
+ self.dropout = torch.nn.Dropout(dropout)
63
+
64
+ self.fc = torch.nn.Linear(self.embedding_dim, self.vocab_size)
65
+
66
+ self.init_weights()
67
+
68
+ def init_weights(self):
69
+ initrange = 0.1
70
+ self.embedding.weight.data.uniform_(-initrange, initrange)
71
+ self.fc.bias.data.zero_()
72
+ self.fc.weight.data.uniform_(-initrange, initrange)
73
+ for i in range(len(self.fc_condition)):
74
+ self.fc_condition[i].weight.data.uniform_(-initrange, initrange)
75
+ self.fc_condition[i].bias.data.zero_()
76
+
77
+ def forward(self, x_tokens, condition):
78
+ # takes batch first
79
+ # x.shape = [batch_size, sequence_length]
80
+
81
+ # embed input
82
+ x = self.embedding(x_tokens) # (batch_size, input_seq_len, d_model)
83
+ x *= math.sqrt(self.embedding_dim)
84
+
85
+ # pad input sequence to represent continuous emotion vectors
86
+ x_tokens_padded = torch.nn.functional.pad(x_tokens, (condition.size(-1), 0), value=-1)
87
+ mask = generate_mask(x_tokens_padded, self.pad_token)
88
+
89
+ # embed conditions one by one, using different linear layers,
90
+ # just like token embedding
91
+ c = []
92
+ for i in range(self.n_conditions):
93
+ c.append(self.fc_condition[i](condition[:, i, None]))
94
+ c = torch.stack(c, dim=1)
95
+
96
+ # concatenate with conditions
97
+ x = torch.cat((c, x), dim=1)
98
+
99
+ x = self.pos_encoding(x)
100
+ x = self.dropout(x)
101
+ for i in range(len(self.enc_layers)):
102
+ x = self.enc_layers[i](x, mask)
103
+
104
+ x = self.fc(x)
105
+ return x
106
+
107
+ class EncoderLayer(torch.nn.Module):
108
+ def __init__(self, d_model, d_inner, rate=0.1, h=16, additional=False, max_seq=2048):
109
+ super(EncoderLayer, self).__init__()
110
+
111
+ self.d_model = d_model
112
+ self.rga = RelativeGlobalAttention(h=h, d=d_model, max_seq=max_seq, add_emb=additional)
113
+
114
+ self.FFN_pre = torch.nn.Linear(self.d_model, d_inner)
115
+ self.FFN_suf = torch.nn.Linear(d_inner, self.d_model)
116
+
117
+ self.layernorm1 = torch.nn.LayerNorm(self.d_model, eps=1e-6)
118
+ self.layernorm2 = torch.nn.LayerNorm(self.d_model, eps=1e-6)
119
+
120
+ self.dropout1 = torch.nn.Dropout(rate)
121
+ self.dropout2 = torch.nn.Dropout(rate)
122
+
123
+ def forward(self, x, mask=None, **kwargs):
124
+ attn_out = self.rga([x,x,x], mask)
125
+ attn_out = self.dropout1(attn_out)
126
+ out1 = self.layernorm1(attn_out+x)
127
+
128
+ ffn_out = F.relu(self.FFN_pre(out1))
129
+ ffn_out = self.FFN_suf(ffn_out)
130
+ ffn_out = self.dropout2(ffn_out)
131
+ out2 = self.layernorm2(out1+ffn_out)
132
+ return out2
133
+
134
+ def sinusoid(max_seq, embedding_dim):
135
+ return np.array([[
136
+ [
137
+ m.sin(
138
+ pos * m.exp(-m.log(10000) * i / embedding_dim) * m.exp(
139
+ m.log(10000) / embedding_dim * (i % 2)) + 0.5 * m.pi * (i % 2)
140
+ )
141
+ for i in range(embedding_dim)
142
+ ]
143
+ for pos in range(max_seq)
144
+ ]])
145
+
146
+ def sinusoid2(max_seq, embedding_dim):
147
+ pos_emb = np.zeros((1, max_seq, embedding_dim))
148
+ for index in range(0, embedding_dim, 2):
149
+ pos_emb[0, :, index] = np.array([m.sin(pos/10000**(index/embedding_dim))
150
+ for pos in range(max_seq)])
151
+ pos_emb[0, :, index+1] = np.array([m.cos(pos/10000**(index/embedding_dim))
152
+ for pos in range(max_seq)])
153
+ return pos_emb
154
+
155
+
156
+ class DynamicPositionEmbedding(torch.nn.Module):
157
+ def __init__(self, embedding_dim, max_seq=2048):
158
+ super().__init__()
159
+ self.device = torch.device("cpu")
160
+ self.dtype = torch.float32
161
+ embed_sinusoid_list = sinusoid(max_seq, embedding_dim)
162
+
163
+ self.positional_embedding = torch.from_numpy(embed_sinusoid_list).to(
164
+ self.device, dtype=self.dtype)
165
+
166
+ def forward(self, x):
167
+ if x.device != self.device or x.dtype != self.dtype:
168
+ self.positional_embedding = self.positional_embedding.to(x.device, dtype=x.dtype)
169
+ x += self.positional_embedding[:, :x.size(1), :]
170
+ return x
171
+
172
+
173
+ class RelativeGlobalAttention(torch.nn.Module):
174
+ """
175
+ from Music Transformer ( Huang et al, 2018 )
176
+ [paper link](https://arxiv.org/pdf/1809.04281.pdf)
177
+ """
178
+ def __init__(self, h=4, d=256, add_emb=False, max_seq=2048, **kwargs):
179
+ super().__init__()
180
+ self.len_k = None
181
+ self.max_seq = max_seq
182
+ self.E = None
183
+ self.h = h
184
+ self.d = d
185
+ self.dh = d // h
186
+ self.Wq = torch.nn.Linear(self.d, self.d)
187
+ self.Wk = torch.nn.Linear(self.d, self.d)
188
+ self.Wv = torch.nn.Linear(self.d, self.d)
189
+ self.fc = torch.nn.Linear(d, d)
190
+ self.additional = add_emb
191
+ self.E = torch.nn.Parameter(torch.randn([self.max_seq, int(self.dh)]))
192
+ if self.additional:
193
+ self.Radd = None
194
+
195
+ def forward(self, inputs, mask=None, **kwargs):
196
+ """
197
+ :param inputs: a list of tensors. i.e) [Q, K, V]
198
+ :param mask: mask tensor
199
+ :param kwargs:
200
+ :return: final tensor ( output of attention )
201
+ """
202
+ q = inputs[0]
203
+ q = self.Wq(q)
204
+ q = torch.reshape(q, (q.size(0), q.size(1), self.h, -1))
205
+ q = q.permute(0, 2, 1, 3) # batch, h, seq, dh
206
+
207
+ k = inputs[1]
208
+ k = self.Wk(k)
209
+ k = torch.reshape(k, (k.size(0), k.size(1), self.h, -1))
210
+ k = k.permute(0, 2, 1, 3)
211
+
212
+ v = inputs[2]
213
+ v = self.Wv(v)
214
+ v = torch.reshape(v, (v.size(0), v.size(1), self.h, -1))
215
+ v = v.permute(0, 2, 1, 3)
216
+
217
+ self.len_k = k.size(2)
218
+ self.len_q = q.size(2)
219
+
220
+ E = self._get_left_embedding(self.len_q, self.len_k).to(q.device)
221
+ QE = torch.einsum('bhld,md->bhlm', [q, E])
222
+ QE = self._qe_masking(QE)
223
+ Srel = self._skewing(QE)
224
+
225
+ Kt = k.permute(0, 1, 3, 2)
226
+ QKt = torch.matmul(q, Kt)
227
+ logits = QKt + Srel
228
+ logits = logits / math.sqrt(self.dh)
229
+
230
+ if mask is not None:
231
+ mask = mask.unsqueeze(1)
232
+ new_mask = torch.zeros_like(mask, dtype=torch.float)
233
+ new_mask.masked_fill_(mask, float("-inf"))
234
+ mask = new_mask
235
+ logits += mask
236
+
237
+ attention_weights = F.softmax(logits, -1)
238
+ attention = torch.matmul(attention_weights, v)
239
+
240
+ out = attention.permute(0, 2, 1, 3)
241
+ out = torch.reshape(out, (out.size(0), -1, self.d))
242
+
243
+ out = self.fc(out)
244
+ return out
245
+
246
+ def _get_left_embedding(self, len_q, len_k):
247
+ starting_point = max(0,self.max_seq-len_q)
248
+ e = self.E[starting_point:,:]
249
+ return e
250
+
251
+ def _skewing(self, tensor: torch.Tensor):
252
+ padded = F.pad(tensor, [1, 0, 0, 0, 0, 0, 0, 0])
253
+ reshaped = torch.reshape(padded, shape=[padded.size(0), padded.size(1), padded.size(-1), padded.size(-2)])
254
+ Srel = reshaped[:, :, 1:, :]
255
+ if self.len_k > self.len_q:
256
+ Srel = F.pad(Srel, [0, 0, 0, 0, 0, 0, 0, self.len_k-self.len_q])
257
+ elif self.len_k < self.len_q:
258
+ Srel = Srel[:, :, :, :self.len_k]
259
+
260
+ return Srel
261
+
262
+ @staticmethod
263
+ def _qe_masking(qe):
264
+ mask = sequence_mask(
265
+ torch.arange(qe.size()[-1] - 1, qe.size()[-1] - qe.size()[-2] - 1, -1).to(qe.device),
266
+ qe.size()[-1])
267
+ mask = ~mask.to(mask.device)
268
+ return mask.to(qe.dtype) * qe
269
+
270
+ def sequence_mask(length, max_length=None):
271
+ """Implements TensorFlow's sequence_mask."""
272
+ if max_length is None:
273
+ max_length = length.max()
274
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
275
+ return x.unsqueeze(0) < length.unsqueeze(1)
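Aside (not part of the committed file): a shape sketch of the continuous-token conditioning above. Each scalar condition is projected to the embedding dimension and prepended to the token embeddings, so the sequence grows by n_conditions; the dimensions below are arbitrary.

import torch

batch, seq_len, d_model = 4, 10, 16
tok_emb = torch.randn(batch, seq_len, d_model)          # embedded token sequence
condition = torch.rand(batch, 2) * 2 - 1                # (valence, arousal) in [-1, 1]
fc_valence = torch.nn.Linear(1, d_model)                # one projection per condition
fc_arousal = torch.nn.Linear(1, d_model)
c = torch.stack([fc_valence(condition[:, 0, None]),
                 fc_arousal(condition[:, 1, None])], dim=1)   # (batch, 2, d_model)
x = torch.cat((c, tok_emb), dim=1)
print(x.shape)  # torch.Size([4, 12, 16]), two extra "continuous tokens"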
midi_emotion/src/models/music_multi.py ADDED
@@ -0,0 +1,269 @@
1
+ import torch
2
+ import math as m
3
+ import numpy as np
4
+ import math
5
+ import torch.nn.functional as F
6
+ import sys
7
+
8
+ sys.path.append("..")
9
+
10
+
11
+ """
12
+ MUSIC TRANSFORMER
13
+ Multi use, can handle following conditioning methods:
14
+ none (vanilla), continuous_concat, discrete_token
15
+
16
+ CONTINUOUS CONCAT
17
+ Takes continuous conditions as a vector of length 2, embeds it and
18
+ then concatenates it with every input token
19
+
20
+ If d_condition <= 0, it becomes the VANILLA music transformer
21
+ If d_condition <= 0 and discrete condition tokens are fed,
22
+ it becomes the "DISCRETE TOKEN" music transformer
23
+ """
24
+
25
+ def generate_mask(x, pad_token=None, batch_first=True):
26
+
27
+ batch_size = x.size(0)
28
+ seq_len = x.size(1)
29
+
30
+ subsequent_mask = torch.logical_not(torch.triu(torch.ones(seq_len, seq_len, device=x.device)).t()).unsqueeze(
31
+ -1).repeat(1, 1, batch_size)
32
+ pad_mask = x == pad_token
33
+ if batch_first:
34
+ pad_mask = pad_mask.t()
35
+ mask = torch.logical_or(subsequent_mask, pad_mask)
36
+ if batch_first:
37
+ mask = mask.permute(2, 0, 1)
38
+ return mask
39
+
40
+
41
+ class MusicTransformerMulti(torch.nn.Module):
42
+ def __init__(self, embedding_dim=None, d_inner=None, d_condition=None, vocab_size=None, num_layer=None, num_head=None,
43
+ max_seq=None, dropout=None, pad_token=None,
44
+ ):
45
+ super().__init__()
46
+
47
+ self.max_seq = max_seq
48
+ self.num_layer = num_layer
49
+ self.embedding_dim = embedding_dim
50
+ self.vocab_size = vocab_size
51
+
52
+ self.pad_token = pad_token
53
+
54
+ d_condition = 0 if d_condition < 0 else d_condition
55
+ self.d_condition = d_condition
56
+
57
+ self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
58
+ embedding_dim=self.embedding_dim-self.d_condition,
59
+ padding_idx=pad_token)
60
+
61
+ if self.d_condition > 0:
62
+ self.fc_condition = torch.nn.Linear(2, self.d_condition)
63
+
64
+ self.pos_encoding = DynamicPositionEmbedding(self.embedding_dim, max_seq=max_seq)
65
+
66
+ self.enc_layers = torch.nn.ModuleList(
67
+ [EncoderLayer(embedding_dim, d_inner, dropout, h=num_head, additional=False, max_seq=max_seq)
68
+ for _ in range(num_layer)])
69
+ self.dropout = torch.nn.Dropout(dropout)
70
+
71
+ self.fc = torch.nn.Linear(self.embedding_dim, self.vocab_size)
72
+
73
+ self.init_weights()
74
+
75
+ def init_weights(self):
76
+ initrange = 0.1
77
+ self.embedding.weight.data.uniform_(-initrange, initrange)
78
+ self.fc.bias.data.zero_()
79
+ self.fc.weight.data.uniform_(-initrange, initrange)
80
+ if self.d_condition > 0:
81
+ self.fc_condition.bias.data.zero_()
82
+ self.fc_condition.weight.data.uniform_(-initrange, initrange)
83
+
84
+ def forward(self, x, condition):
85
+ # no_conditioning = not torch.equal(condition, condition)
86
+ # assert (self.d_condition > 0) != no_conditioning
87
+ # takes batch first
88
+ # x.shape = [batch_size, sequence_length]
89
+ mask = generate_mask(x, self.pad_token)
90
+ # embed input
91
+ x = self.embedding(x) # (batch_size, input_seq_len, d_model)
92
+ x *= math.sqrt(self.embedding_dim-self.d_condition)
93
+
94
+ if self.d_condition > 0:
95
+ # embed condition using fully connected layer
96
+ condition = self.fc_condition(condition)
97
+ # tile to match input
98
+ condition = condition.unsqueeze(1).expand(-1, x.size(1), -1)
99
+ x = torch.cat([x, condition], dim=-1) # concatenate
100
+
101
+ x = self.pos_encoding(x)
102
+ x = self.dropout(x)
103
+ for i in range(len(self.enc_layers)):
104
+ x = self.enc_layers[i](x, mask)
105
+
106
+ x = self.fc(x)
107
+
108
+ return x
109
+
110
+ class EncoderLayer(torch.nn.Module):
111
+ def __init__(self, d_model, d_inner, rate=0.1, h=16, additional=False, max_seq=2048):
112
+ super(EncoderLayer, self).__init__()
113
+
114
+ self.d_model = d_model
115
+ self.rga = RelativeGlobalAttention(h=h, d=d_model, max_seq=max_seq, add_emb=additional)
116
+
117
+ self.FFN_pre = torch.nn.Linear(self.d_model, d_inner)
118
+ self.FFN_suf = torch.nn.Linear(d_inner, self.d_model)
119
+
120
+ self.layernorm1 = torch.nn.LayerNorm(self.d_model, eps=1e-6)
121
+ self.layernorm2 = torch.nn.LayerNorm(self.d_model, eps=1e-6)
122
+
123
+ self.dropout1 = torch.nn.Dropout(rate)
124
+ self.dropout2 = torch.nn.Dropout(rate)
125
+
126
+ def forward(self, x, mask=None):
127
+ attn_out = self.rga([x,x,x], mask)
128
+ attn_out = self.dropout1(attn_out)
129
+ out1 = self.layernorm1(attn_out+x)
130
+
131
+ ffn_out = F.relu(self.FFN_pre(out1))
132
+ ffn_out = self.FFN_suf(ffn_out)
133
+ ffn_out = self.dropout2(ffn_out)
134
+ out2 = self.layernorm2(out1+ffn_out)
135
+ return out2
136
+
137
+ def sinusoid(max_seq, embedding_dim):
138
+ return np.array([[
139
+ [
140
+ m.sin(
141
+ pos * m.exp(-m.log(10000) * i / embedding_dim) * m.exp(
142
+ m.log(10000) / embedding_dim * (i % 2)) + 0.5 * m.pi * (i % 2)
143
+ )
144
+ for i in range(embedding_dim)
145
+ ]
146
+ for pos in range(max_seq)
147
+ ]])
148
+
149
+
150
+ class DynamicPositionEmbedding(torch.nn.Module):
151
+ def __init__(self, embedding_dim, max_seq=2048):
152
+ super().__init__()
153
+ self.device = torch.device("cpu")
154
+ self.dtype = torch.float32
155
+ embed_sinusoid_list = sinusoid(max_seq, embedding_dim)
156
+
157
+ self.positional_embedding = torch.from_numpy(embed_sinusoid_list).to(
158
+ self.device, dtype=self.dtype)
159
+
160
+ def forward(self, x):
161
+ if x.device != self.device or x.dtype != self.dtype:
162
+ self.positional_embedding = self.positional_embedding.to(x.device, dtype=x.dtype)
163
+ x += self.positional_embedding[:, :x.size(1), :]
164
+ return x
165
+
166
+
167
+ class RelativeGlobalAttention(torch.nn.Module):
168
+ """
169
+ from Music Transformer ( Huang et al, 2018 )
170
+ [paper link](https://arxiv.org/pdf/1809.04281.pdf)
171
+ """
172
+ def __init__(self, h=4, d=256, add_emb=False, max_seq=2048):
173
+ super().__init__()
174
+ self.len_k = None
175
+ self.max_seq = max_seq
176
+ self.E = None
177
+ self.h = h
178
+ self.d = d
179
+ self.dh = d // h
180
+ self.Wq = torch.nn.Linear(self.d, self.d)
181
+ self.Wk = torch.nn.Linear(self.d, self.d)
182
+ self.Wv = torch.nn.Linear(self.d, self.d)
183
+ self.fc = torch.nn.Linear(d, d)
184
+ self.additional = add_emb
185
+ self.E = torch.nn.Parameter(torch.randn([self.max_seq, int(self.dh)]))
186
+ if self.additional:
187
+ self.Radd = None
188
+
189
+ def forward(self, inputs, mask=None):
190
+ """
191
+ :param inputs: a list of tensors. i.e) [Q, K, V]
192
+ :param mask: mask tensor
193
+ :param kwargs:
194
+ :return: final tensor ( output of attention )
195
+ """
196
+ q = inputs[0]
197
+ q = self.Wq(q)
198
+ q = torch.reshape(q, (q.size(0), q.size(1), self.h, -1))
199
+ q = q.permute(0, 2, 1, 3) # batch, h, seq, dh
200
+
201
+ k = inputs[1]
202
+ k = self.Wk(k)
203
+ k = torch.reshape(k, (k.size(0), k.size(1), self.h, -1))
204
+ k = k.permute(0, 2, 1, 3)
205
+
206
+ v = inputs[2]
207
+ v = self.Wv(v)
208
+ v = torch.reshape(v, (v.size(0), v.size(1), self.h, -1))
209
+ v = v.permute(0, 2, 1, 3)
210
+
211
+ self.len_k = k.size(2)
212
+ self.len_q = q.size(2)
213
+
214
+ E = self._get_left_embedding(self.len_q, self.len_k).to(q.device)
215
+ QE = torch.einsum('bhld,md->bhlm', [q, E])
216
+ QE = self._qe_masking(QE)
217
+ Srel = self._skewing(QE)
218
+
219
+ Kt = k.permute(0, 1, 3, 2)
220
+ QKt = torch.matmul(q, Kt)
221
+ logits = QKt + Srel
222
+ logits = logits / math.sqrt(self.dh)
223
+
224
+ if mask is not None:
225
+ mask = mask.unsqueeze(1)
226
+ new_mask = torch.zeros_like(mask, dtype=torch.float)
227
+ new_mask.masked_fill_(mask, float("-inf"))
228
+ mask = new_mask
229
+ logits += mask
230
+
231
+ attention_weights = F.softmax(logits, -1)
232
+ attention = torch.matmul(attention_weights, v)
233
+
234
+ out = attention.permute(0, 2, 1, 3)
235
+ out = torch.reshape(out, (out.size(0), -1, self.d))
236
+
237
+ out = self.fc(out)
238
+ return out
239
+
240
+ def _get_left_embedding(self, len_q, len_k):
241
+ starting_point = max(0,self.max_seq-len_q)
242
+ e = self.E[starting_point:,:]
243
+ return e
244
+
245
+ def _skewing(self, tensor: torch.Tensor):
246
+ padded = F.pad(tensor, [1, 0, 0, 0, 0, 0, 0, 0])
247
+ reshaped = torch.reshape(padded, shape=[padded.size(0), padded.size(1), padded.size(-1), padded.size(-2)])
248
+ Srel = reshaped[:, :, 1:, :]
249
+ if self.len_k > self.len_q:
250
+ Srel = F.pad(Srel, [0, 0, 0, 0, 0, 0, 0, self.len_k-self.len_q])
251
+ elif self.len_k < self.len_q:
252
+ Srel = Srel[:, :, :, :self.len_k]
253
+
254
+ return Srel
255
+
256
+ @staticmethod
257
+ def _qe_masking(qe):
258
+ mask = sequence_mask(
259
+ torch.arange(qe.size()[-1] - 1, qe.size()[-1] - qe.size()[-2] - 1, -1).to(qe.device),
260
+ qe.size()[-1])
261
+ mask = ~mask.to(mask.device)
262
+ return mask.to(qe.dtype) * qe
263
+
264
+ def sequence_mask(length, max_length=None):
265
+ """Implements TensorFlow's sequence_mask."""
266
+ if max_length is None:
267
+ max_length = length.max()
268
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
269
+ return x.unsqueeze(0) < length.unsqueeze(1)
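Aside (not part of the committed file): a shape sketch of the continuous_concat path above. The 2-dimensional condition is embedded once, tiled along time, and concatenated to every token embedding so the total width stays d_model; the dimensions below are arbitrary.

import torch

batch, seq_len, d_model, d_condition = 4, 10, 512, 192
tok_emb = torch.randn(batch, seq_len, d_model - d_condition)  # token embedding is narrower
condition = torch.rand(batch, 2) * 2 - 1                      # (valence, arousal)
fc_condition = torch.nn.Linear(2, d_condition)
c = fc_condition(condition).unsqueeze(1).expand(-1, seq_len, -1)
x = torch.cat([tok_emb, c], dim=-1)                           # every step carries the condition
print(x.shape)  # torch.Size([4, 10, 512])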
midi_emotion/src/models/music_regression.py ADDED
@@ -0,0 +1,250 @@
1
+ import torch
2
+ import math as m
3
+ import numpy as np
4
+ import math
5
+ import torch.nn.functional as F
6
+ import sys
7
+
8
+ # from torch.nn.modules.activation import ReLU
9
+
10
+ sys.path.append("..")
11
+ # from utils import memory
12
+
13
+
14
+ """
15
+ MUSIC TRANSFORMER REGRESSION (to output emotion)
16
+ """
17
+
18
+ def generate_mask(x, pad_token=None, batch_first=True):
19
+
20
+ batch_size = x.size(0)
21
+ seq_len = x.size(1)
22
+
23
+ subsequent_mask = torch.logical_not(torch.triu(torch.ones(seq_len, seq_len, device=x.device)).t()).unsqueeze(
24
+ -1).repeat(1, 1, batch_size)
25
+ pad_mask = x == pad_token
26
+ if batch_first:
27
+ pad_mask = pad_mask.t()
28
+ mask = torch.logical_or(subsequent_mask, pad_mask)
29
+ if batch_first:
30
+ mask = mask.permute(2, 0, 1)
31
+ return mask
32
+
33
+
34
+ class MusicRegression(torch.nn.Module):
35
+ def __init__(self, embedding_dim=None, d_inner=None, vocab_size=None, num_layer=None, num_head=None,
36
+ max_seq=None, dropout=None, pad_token=None, output_size=None,
37
+ d_condition=-1, no_mask=True
38
+ ):
39
+ super().__init__()
40
+
41
+ assert d_condition <= 0
42
+
43
+ self.max_seq = max_seq
44
+ self.num_layer = num_layer
45
+ self.embedding_dim = embedding_dim
46
+ self.vocab_size = vocab_size
47
+
48
+ self.pad_token = pad_token
49
+
50
+ self.no_mask = no_mask
51
+
52
+ self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
53
+ embedding_dim=self.embedding_dim,
54
+ padding_idx=pad_token)
55
+
56
+
57
+ self.pos_encoding = DynamicPositionEmbedding(self.embedding_dim, max_seq=max_seq)
58
+
59
+ self.enc_layers = torch.nn.ModuleList(
60
+ [EncoderLayer(embedding_dim, d_inner, dropout, h=num_head, additional=False, max_seq=max_seq)
61
+ for _ in range(num_layer)])
62
+ self.dropout = torch.nn.Dropout(dropout)
63
+
64
+ self.fc = torch.nn.Sequential(
65
+ torch.nn.Linear(self.embedding_dim, output_size),
66
+ torch.nn.Tanh()
67
+ )
68
+
69
+ self.init_weights()
70
+
71
+ def init_weights(self):
72
+ initrange = 0.1
73
+ self.embedding.weight.data.uniform_(-initrange, initrange)
74
+
75
+ def forward(self, x):
76
+
77
+ mask = None if self.no_mask else generate_mask(x, self.pad_token)
78
+ # embed input
79
+ x = self.embedding(x) # (batch_size, input_seq_len, d_model)
80
+ x *= math.sqrt(self.embedding_dim)
81
+
82
+ x = self.pos_encoding(x)
83
+ x = self.dropout(x)
84
+ for i in range(len(self.enc_layers)):
85
+ x = self.enc_layers[i](x, mask)
86
+
87
+ x = self.fc(x[:, 0, :])
88
+
89
+ return x
90
+
91
+ class EncoderLayer(torch.nn.Module):
92
+ def __init__(self, d_model, d_inner, rate=0.1, h=16, additional=False, max_seq=2048):
93
+ super(EncoderLayer, self).__init__()
94
+
95
+ self.d_model = d_model
96
+ self.rga = RelativeGlobalAttention(h=h, d=d_model, max_seq=max_seq, add_emb=additional)
97
+
98
+ self.FFN_pre = torch.nn.Linear(self.d_model, d_inner)
99
+ self.FFN_suf = torch.nn.Linear(d_inner, self.d_model)
100
+
101
+ self.layernorm1 = torch.nn.LayerNorm(self.d_model, eps=1e-6)
102
+ self.layernorm2 = torch.nn.LayerNorm(self.d_model, eps=1e-6)
103
+
104
+ self.dropout1 = torch.nn.Dropout(rate)
105
+ self.dropout2 = torch.nn.Dropout(rate)
106
+
107
+ def forward(self, x, mask=None):
108
+ attn_out = self.rga([x,x,x], mask)
109
+ attn_out = self.dropout1(attn_out)
110
+ out1 = self.layernorm1(attn_out+x)
111
+
112
+ ffn_out = F.relu(self.FFN_pre(out1))
113
+ ffn_out = self.FFN_suf(ffn_out)
114
+ ffn_out = self.dropout2(ffn_out)
115
+ out2 = self.layernorm2(out1+ffn_out)
116
+ return out2
117
+
118
+ def sinusoid(max_seq, embedding_dim):
119
+ return np.array([[
120
+ [
121
+ m.sin(
122
+ pos * m.exp(-m.log(10000) * i / embedding_dim) * m.exp(
123
+ m.log(10000) / embedding_dim * (i % 2)) + 0.5 * m.pi * (i % 2)
124
+ )
125
+ for i in range(embedding_dim)
126
+ ]
127
+ for pos in range(max_seq)
128
+ ]])
129
+
130
+
131
+ class DynamicPositionEmbedding(torch.nn.Module):
132
+ def __init__(self, embedding_dim, max_seq=2048):
133
+ super().__init__()
134
+ self.device = torch.device("cpu")
135
+ self.dtype = torch.float32
136
+ embed_sinusoid_list = sinusoid(max_seq, embedding_dim)
137
+
138
+ self.positional_embedding = torch.from_numpy(embed_sinusoid_list).to(
139
+ self.device, dtype=self.dtype)
140
+
141
+ def forward(self, x):
142
+ if x.device != self.device or x.dtype != self.dtype:
143
+ self.positional_embedding = self.positional_embedding.to(x.device, dtype=x.dtype)
144
+ x += self.positional_embedding[:, :x.size(1), :]
145
+ return x
146
+
147
+
148
+ class RelativeGlobalAttention(torch.nn.Module):
149
+ """
150
+ from Music Transformer ( Huang et al, 2018 )
151
+ [paper link](https://arxiv.org/pdf/1809.04281.pdf)
152
+ """
153
+ def __init__(self, h=4, d=256, add_emb=False, max_seq=2048):
154
+ super().__init__()
155
+ self.len_k = None
156
+ self.max_seq = max_seq
157
+ self.E = None
158
+ self.h = h
159
+ self.d = d
160
+ self.dh = d // h
161
+ self.Wq = torch.nn.Linear(self.d, self.d)
162
+ self.Wk = torch.nn.Linear(self.d, self.d)
163
+ self.Wv = torch.nn.Linear(self.d, self.d)
164
+ self.fc = torch.nn.Linear(d, d)
165
+ self.additional = add_emb
166
+ self.E = torch.nn.Parameter(torch.randn([self.max_seq, int(self.dh)]))
167
+ if self.additional:
168
+ self.Radd = None
169
+
170
+ def forward(self, inputs, mask=None):
171
+ """
172
+ :param inputs: a list of tensors. i.e) [Q, K, V]
173
+ :param mask: mask tensor
174
+ :param kwargs:
175
+ :return: final tensor ( output of attention )
176
+ """
177
+ q = inputs[0]
178
+ q = self.Wq(q)
179
+ q = torch.reshape(q, (q.size(0), q.size(1), self.h, -1))
180
+ q = q.permute(0, 2, 1, 3) # batch, h, seq, dh
181
+
182
+ k = inputs[1]
183
+ k = self.Wk(k)
184
+ k = torch.reshape(k, (k.size(0), k.size(1), self.h, -1))
185
+ k = k.permute(0, 2, 1, 3)
186
+
187
+ v = inputs[2]
188
+ v = self.Wv(v)
189
+ v = torch.reshape(v, (v.size(0), v.size(1), self.h, -1))
190
+ v = v.permute(0, 2, 1, 3)
191
+
192
+ self.len_k = k.size(2)
193
+ self.len_q = q.size(2)
194
+
195
+ E = self._get_left_embedding(self.len_q, self.len_k).to(q.device)
196
+ QE = torch.einsum('bhld,md->bhlm', [q, E])
197
+ QE = self._qe_masking(QE)
198
+ Srel = self._skewing(QE)
199
+
200
+ Kt = k.permute(0, 1, 3, 2)
201
+ QKt = torch.matmul(q, Kt)
202
+ logits = QKt + Srel
203
+ logits = logits / math.sqrt(self.dh)
204
+
205
+ if mask is not None:
206
+ mask = mask.unsqueeze(1)
207
+ new_mask = torch.zeros_like(mask, dtype=torch.float)
208
+ new_mask.masked_fill_(mask, float("-inf"))
209
+ mask = new_mask
210
+ logits += mask
211
+
212
+ attention_weights = F.softmax(logits, -1)
213
+ attention = torch.matmul(attention_weights, v)
214
+
215
+ out = attention.permute(0, 2, 1, 3)
216
+ out = torch.reshape(out, (out.size(0), -1, self.d))
217
+
218
+ out = self.fc(out)
219
+ return out
220
+
221
+ def _get_left_embedding(self, len_q, len_k):
222
+ starting_point = max(0,self.max_seq-len_q)
223
+ e = self.E[starting_point:,:]
224
+ return e
225
+
226
+ def _skewing(self, tensor: torch.Tensor):
227
+ padded = F.pad(tensor, [1, 0, 0, 0, 0, 0, 0, 0])
228
+ reshaped = torch.reshape(padded, shape=[padded.size(0), padded.size(1), padded.size(-1), padded.size(-2)])
229
+ Srel = reshaped[:, :, 1:, :]
230
+ if self.len_k > self.len_q:
231
+ Srel = F.pad(Srel, [0, 0, 0, 0, 0, 0, 0, self.len_k-self.len_q])
232
+ elif self.len_k < self.len_q:
233
+ Srel = Srel[:, :, :, :self.len_k]
234
+
235
+ return Srel
236
+
237
+ @staticmethod
238
+ def _qe_masking(qe):
239
+ mask = sequence_mask(
240
+ torch.arange(qe.size()[-1] - 1, qe.size()[-1] - qe.size()[-2] - 1, -1).to(qe.device),
241
+ qe.size()[-1])
242
+ mask = ~mask.to(mask.device)
243
+ return mask.to(qe.dtype) * qe
244
+
245
+ def sequence_mask(length, max_length=None):
246
+ """Implements TensorFlow's sequence_mask."""
247
+ if max_length is None:
248
+ max_length = length.max()
249
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
250
+ return x.unsqueeze(0) < length.unsqueeze(1)
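Aside (not part of the committed file): the regression model above reads the encoder state at the first position and maps it through Linear + Tanh to a (valence, arousal) pair in [-1, 1]. A minimal sketch of that head with arbitrary sizes:

import torch

embedding_dim, output_size = 512, 2
head = torch.nn.Sequential(torch.nn.Linear(embedding_dim, output_size), torch.nn.Tanh())
encoder_states = torch.randn(4, 10, embedding_dim)  # (batch, seq, d_model)
pred = head(encoder_states[:, 0, :])                # only the first timestep is used
print(pred.shape)                                   # torch.Size([4, 2]), values in (-1, 1)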
midi_emotion/src/models/transfer_model.py ADDED
@@ -0,0 +1,49 @@
1
+ import os
2
+ import torch
3
+ import sys
4
+ sys.path.append("..")
5
+ from models.build_model import build_model
6
+
7
+ """
8
+ Transfers model weights.
9
+ You can create an untrained target model by running:
10
+ python train.py --log_step 1 --max_step 1 ...
11
+ """
12
+
13
+ trained_model_dir = "20220803-130921"
14
+ new_model_dir = "20220803-131016"
15
+
16
+ device = "cuda" if torch.cuda.is_available() else 'cpu'
17
+
18
+ main_dir = "../../output"
19
+
20
+ trained_config = torch.load(os.path.join(main_dir, trained_model_dir, "model_config.pt"))
21
+
22
+ trained_model, _ = build_model(None, load_config_dict=trained_config)
23
+ trained_model = trained_model.to(device)
24
+ trained_model.load_state_dict(torch.load(os.path.join(main_dir, trained_model_dir, 'model.pt'), map_location=device))
25
+
26
+ new_config = torch.load(os.path.join(main_dir, new_model_dir, "model_config.pt"))
27
+ new_model, _ = build_model(None, load_config_dict=new_config)
28
+ new_model = new_model.to(device)
29
+
30
+ trained_params = trained_model.named_parameters()
31
+ new_params = new_model.named_parameters()
32
+ dict_new_params = dict(new_params)
33
+ for name1, param1 in trained_params:
34
+ if name1 in dict_new_params:
35
+
36
+ if name1 == 'embedding.weight':
37
+ # continuous_concat may have different sized embedding
38
+ size1 = dict_new_params[name1].data.shape[1]
39
+ size2 = param1.data.shape[1]
40
+ size_transfer = min((size1, size2))
41
+ dict_new_params[name1].data[:, :size_transfer] = param1.data[:, :size_transfer]
42
+ else:
43
+ dict_new_params[name1].data.copy_(param1.data)
44
+
45
+
46
+ output_path = os.path.join(main_dir, new_model_dir, 'model.pt')
47
+ torch.save(new_model.state_dict(), output_path)
48
+
49
+ print(f"Saved to {output_path}")
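Aside (not part of the committed file): the partial copy above handles embeddings whose widths differ between source and target (for example, continuous_concat reserves d_condition columns). A minimal sketch of that slice copy with made-up sizes:

import torch

old_emb = torch.nn.Embedding(100, 512)   # source model embedding
new_emb = torch.nn.Embedding(100, 320)   # target embedding, e.g. d_model - d_condition
size_transfer = min(old_emb.weight.shape[1], new_emb.weight.shape[1])
with torch.no_grad():
    new_emb.weight[:, :size_transfer] = old_emb.weight[:, :size_transfer]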
midi_emotion/src/models/transformer.py ADDED
@@ -0,0 +1,56 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import torch.utils.checkpoint
6
+
7
+ class Transformer(nn.Module):
8
+
9
+ def __init__(self, n_tokens=None, n_layer=None, n_head=None, d_model=None, d_ff=None,
10
+ dropout=0.0, pad_idx=0):
11
+ super(Transformer, self).__init__()
12
+ from torch.nn import TransformerEncoder, TransformerEncoderLayer
13
+ # self.name = 'Transformer'
14
+ self.pos_encoder = PositionalEncoding(d_model, dropout)
15
+ encoder_layers = TransformerEncoderLayer(d_model, n_head, dim_feedforward=d_ff, dropout=dropout)
16
+ norm = nn.LayerNorm(d_model)
17
+ self.transformer_encoder = TransformerEncoder(encoder_layers, n_layer, norm=norm)
18
+ self.encoder = nn.Embedding(n_tokens, d_model, padding_idx=pad_idx)
19
+ self.d_model = d_model
20
+ self.decoder = nn.Linear(d_model, n_tokens)
21
+
22
+ self.init_weights()
23
+
24
+ def init_weights(self):
25
+ initrange = 0.1
26
+ self.encoder.weight.data.uniform_(-initrange, initrange)
27
+ self.decoder.bias.data.zero_()
28
+ self.decoder.weight.data.uniform_(-initrange, initrange)
29
+
30
+ def forward(self, src, src_mask, src_key_padding_mask=None):
31
+
32
+ src = self.encoder(src) * math.sqrt(self.d_model)
33
+ src = self.pos_encoder(src)
34
+ output = self.transformer_encoder(src, src_mask,
35
+ src_key_padding_mask=src_key_padding_mask)
36
+ output = self.decoder(output)
37
+ return output
38
+
39
+
40
+ class PositionalEncoding(nn.Module):
41
+
42
+ def __init__(self, d_model, dropout=0.1, max_len=5000):
43
+ super(PositionalEncoding, self).__init__()
44
+ self.dropout = nn.Dropout(p=dropout)
45
+
46
+ pe = torch.zeros(max_len, d_model)
47
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
48
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
49
+ pe[:, 0::2] = torch.sin(position * div_term)
50
+ pe[:, 1::2] = torch.cos(position * div_term)
51
+ pe = pe.unsqueeze(0).transpose(0, 1)
52
+ self.register_buffer('pe', pe)
53
+
54
+ def forward(self, x):
55
+ x = x + self.pe[:x.size(0), :]
56
+ return self.dropout(x)
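Aside (not part of the committed file): the vanilla Transformer above expects sequence-first token indices plus an additive attention mask. A hypothetical call sketch; the constructor arguments are illustrative and the import refers to this repo's own module:

import torch
# from models.transformer import Transformer   # repo-internal import

seq_len, batch, n_tokens = 16, 2, 100
src = torch.randint(1, n_tokens, (seq_len, batch))                 # sequence-first layout
causal_mask = torch.triu(torch.full((seq_len, seq_len), float("-inf")), diagonal=1)
# model = Transformer(n_tokens=n_tokens, n_layer=2, n_head=4, d_model=64, d_ff=256)
# logits = model(src, causal_mask)                                 # (seq_len, batch, n_tokens)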
midi_emotion/src/train.py ADDED
@@ -0,0 +1,477 @@
1
+ import time
2
+ import math
3
+ import datetime
4
+ import os
5
+ import random
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.optim as optim
10
+ from tqdm import tqdm
11
+ from models.build_model import build_model
12
+ from generate import generate
13
+ from data.preprocess_features import preprocess_features
14
+ from data.loader import Loader
15
+ from data.loader_exhaustive import LoaderExhaustive
16
+ from data.loader_generations import LoaderGenerations
17
+ from data.collate import filter_collate
18
+ from utils import CsvWriter, create_exp_dir, accuracy
19
+ from config import args
20
+
21
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
22
+
23
+ # Set the random seed manually for reproducibility.
24
+ if args.seed > 0:
25
+ np.random.seed(args.seed)
26
+ torch.manual_seed(args.seed)
27
+ torch.cuda.manual_seed(args.seed)
28
+ random.seed(args.seed)
29
+
30
+ class Runner:
31
+ def __init__(self):
32
+ self.logging = create_exp_dir(args.work_dir, debug=args.debug)
33
+ use_cuda = torch.cuda.is_available() and not args.no_cuda
34
+ self.device = torch.device('cuda' if use_cuda else 'cpu')
35
+ if self.device == torch.device("cuda"):
36
+ self.logging("Using GPU")
37
+ else:
38
+ self.logging("Using CPU")
39
+
40
+ self.train_step = 0
41
+ self.n_sequences_total = 0
42
+ self.init_hours = 0
43
+ self.epoch = 0
44
+ self.init_time = time.time()
45
+
46
+ # Load data
47
+ n_bins = args.n_emotion_bins if args.conditioning == "discrete_token" and \
48
+ not args.regression else None
49
+
50
+ conditional = args.conditioning != "none" or args.regression
51
+
52
+ # Preprocessing
53
+ train_feats, test_feats = preprocess_features(
54
+ "../data_files/features/pianoroll/full_dataset_features_summarized.csv",
55
+ n_bins=n_bins, conditional=conditional,
56
+ use_labeled_only=not args.full_dataset)
57
+
58
+ if args.exhaustive_eval:
59
+ # Evaluate using ENTIRE test set
60
+ train_dataset = []
61
+ test_dataset = LoaderExhaustive(args.data_folder, test_feats, args.tgt_len, args.conditioning,
62
+ max_samples=args.n_samples, regression=args.regression,
63
+ always_use_discrete_condition=args.always_use_discrete_condition)
64
+ else:
65
+ train_dataset = Loader(args.data_folder, train_feats, args.tgt_len, args.conditioning,
66
+ regression=args.regression, always_use_discrete_condition=args.always_use_discrete_condition)
67
+ test_dataset = Loader(args.data_folder, test_feats, args.tgt_len, args.conditioning,
68
+ regression=args.regression, always_use_discrete_condition=args.always_use_discrete_condition)
69
+
70
+ if args.regression_dir is not None:
71
+ # Perform emotion regression on generated samples
72
+ train_dataset = []
73
+ test_dataset = LoaderGenerations(args.regression_dir, args.tgt_len)
74
+
75
+ self.null_condition = torch.FloatTensor([np.nan, np.nan]).to(self.device)
76
+
77
+ self.maps = test_dataset.get_maps()
78
+ self.pad_idx = test_dataset.get_pad_idx()
79
+
80
+ self.vocab_size = test_dataset.get_vocab_len()
81
+ args.vocab_size = self.vocab_size
82
+ self.logging(f"Number of tokens: {self.vocab_size}")
83
+
84
+ if args.exhaustive_eval or args.regression_dir is not None:
85
+ self.train_loader = []
86
+ else:
87
+ self.train_loader = torch.utils.data.DataLoader(train_dataset, args.batch_size, shuffle=not args.debug,
88
+ num_workers=args.num_workers, collate_fn=filter_collate,
89
+ pin_memory=not args.no_cuda, drop_last=True)
90
+ self.test_loader = torch.utils.data.DataLoader(test_dataset, args.batch_size, shuffle=False,
91
+ num_workers=args.num_workers, collate_fn=filter_collate,
92
+ pin_memory=not args.no_cuda and args.regression_dir is None,
93
+ drop_last=True)
94
+ print(f"Data loader lengths\nTrain: {len(train_dataset)}")
95
+ if not args.overfit:
96
+ print(f"Test: {len(test_dataset)}")
97
+
98
+ self.gen_dir = os.path.join(args.work_dir, "generations", "training")
99
+
100
+ # Automatic mixed precision
101
+ self.amp = not args.no_amp and self.device == torch.device('cuda')
102
+
103
+ if self.amp:
104
+ self.logging("Using automatic mixed precision")
105
+ else:
106
+ self.logging("Using float32")
107
+
108
+ self.scaler = torch.cuda.amp.GradScaler(enabled=self.amp)
109
+ self.init_model() # Build the model
110
+
111
+ if not args.debug:
112
+ # Save mappings
113
+ os.makedirs(self.gen_dir, exist_ok=True)
114
+ torch.save(self.maps, os.path.join(args.work_dir, "mappings.pt"))
115
+
116
+ self.csv_writer = CsvWriter(os.path.join(args.work_dir, "performance.csv"),
117
+ ["epoch", "step", "hour", "lr", "trn_loss", "val_loss", "val_l1_v", "val_l1_a"],
118
+ in_path=self.csv_in, debug=args.debug)
119
+
120
+ args.n_all_param = sum([p.nelement() for p in self.model.parameters()])
121
+
122
+ self.model = self.model.to(self.device)
123
+
124
+ self.ce_loss = nn.CrossEntropyLoss(ignore_index=self.pad_idx).to(self.device)
125
+ self.mse_loss = nn.MSELoss()
126
+ self.l1_loss = nn.L1Loss()
127
+
128
+ #### scheduler
129
+ if args.scheduler == '--':
130
+ self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer,
131
+ args.max_step, eta_min=args.eta_min)
132
+ elif args.scheduler == 'dev_perf':
133
+ self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer,
134
+ factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min)
135
+ elif args.scheduler == 'constant':
136
+ pass
137
+ elif args.scheduler == 'cyclic':
138
+ self.scheduler = optim.lr_scheduler.CyclicLR(self.optimizer,
139
+ args.lr_min, args.lr_max, verbose=False, cycle_momentum=False)
140
+
141
+ # Print log
142
+ if not args.debug:
143
+ self.logging('=' * 120)
144
+ for k, v in args.__dict__.items():
145
+ self.logging(' - {} : {}'.format(k, v))
146
+ self.logging('=' * 120)
147
+ self.logging('#params = {}'.format(args.n_all_param))
148
+
149
+ now = datetime.datetime.now()
150
+ now = now.strftime("%d-%m-%Y %H:%M")
151
+ self.logging(f"Run started at {now}")
152
+ self.once = True
153
+
154
+ def init_model(self):
155
+ # Initialize model
156
+ if args.restart_dir:
157
+ # Load existing model
158
+ config = torch.load(os.path.join(args.restart_dir, "model_config.pt"))
159
+ self.model, config = build_model(None, load_config_dict=config)
160
+ self.model = self.model.to(self.device)
161
+
162
+ model_fp = os.path.join(args.restart_dir, 'model.pt')
163
+ optimizer_fp = os.path.join(args.restart_dir, 'optimizer.pt')
164
+ stats_fp = os.path.join(args.restart_dir, 'stats.pt')
165
+ scaler_fp = os.path.join(args.restart_dir, 'scaler.pt')
166
+
167
+ self.model.load_state_dict(
168
+ torch.load(model_fp, map_location=lambda storage, loc: storage))
169
+ self.logging(f"Model loaded from {model_fp}")
170
+
171
+ self.csv_in = os.path.join(args.restart_dir, 'performance.csv')
172
+ else:
173
+ # Build model from scratch
174
+ self.csv_in = None
175
+ self.model, config = build_model(vars(args))
176
+ self.model = self.model.to(self.device)
177
+
178
+ # save model configuration for later load
179
+ if not args.debug:
180
+ torch.save(config, os.path.join(args.work_dir, "model_config.pt"))
181
+
182
+ self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
183
+
184
+ # Load self.optimizer if necessary
185
+ if args.restart_dir:
186
+ if os.path.exists(optimizer_fp):
187
+ try:
188
+ self.optimizer.load_state_dict(
189
+ torch.load(optimizer_fp, map_location=lambda storage, loc: storage))
190
+ except Exception:  # keep the freshly initialized optimizer if loading fails
191
+ pass
192
+ else:
193
+ print('Optimizer was not saved. Start from scratch.')
194
+
195
+ try:
196
+ stats = torch.load(stats_fp)
197
+ self.train_step = stats["step"]
198
+ self.init_hours = stats["hour"]
199
+ self.epoch = stats["epoch"]
200
+ self.n_sequences_total = stats["sample"]
201
+ except Exception:  # stats missing or unreadable; restart counters from zero
202
+ self.train_step = 0
203
+ self.init_hours = 0
204
+ self.epoch = 0
205
+ self.n_sequences_total = 0
206
+
207
+ if os.path.exists(scaler_fp) and not args.reset_scaler:
208
+ try:
209
+ self.scaler.load_state_dict(torch.load(scaler_fp))
210
+ except Exception:  # keep the default GradScaler state if loading fails
211
+ pass
212
+
213
+ if args.overwrite_lr:
214
+ # New learning rate
215
+ for p in self.optimizer.param_groups:
216
+ p['lr'] = args.lr
217
+
218
+ ###############################################################################
219
+ # EVALUATION
220
+ ###############################################################################
221
+
222
+ def evaluate(self):
223
+
224
+ # Turn on evaluation mode which disables dropout.
225
+ self.model.eval()
226
+
227
+ # Evaluation
228
+ topk = (1, 5) # find accuracy for top-1 and top-5
229
+ n_elements_total, n_sequences_total, total_loss = 0, 0, 0.
230
+ total_accs = {"l1_v": 0., "l1_a": 0., "l1_mean": 0., "l1_mean_normal":0
231
+ } if args.regression else {k: 0. for k in topk}
232
+ with torch.no_grad():
233
+ n_batches = len(self.test_loader)
234
+ loader = enumerate(self.test_loader)
235
+ if args.exhaustive_eval or args.regression:
236
+ loader = tqdm(loader, total=n_batches)
237
+ for i, (input_, condition, target) in loader:
238
+ if args.max_eval_step > 0 and i >= args.max_eval_step:
239
+ break
240
+ if input_ != []:
241
+ input_ = input_.to(self.device)
242
+ condition = condition.to(self.device)
243
+ if not args.regression:
244
+ target = target.to(self.device)
245
+ loss, pred = self.forward_pass(input_, condition, target)
246
+ if args.regression:
247
+ pred = torch.clamp(pred, min=-1.0, max=1.0)
248
+ loss = self.l1_loss(pred, condition)
249
+ l1_v = self.l1_loss(pred[:, 0], condition[:, 0]).item()
250
+ l1_a = self.l1_loss(pred[:, 1], condition[:, 1]).item()
251
+ accuracies = {"l1_v": l1_v, "l1_a": l1_a,
252
+ "l1_mean": (l1_v + l1_a) / 2,
253
+ "l1_mean_normal": (l1_v + l1_a) / 2 / 2}
254
+ n_elements = pred[:, 0].numel()
255
+ else:
256
+ accuracies = accuracy(pred, target, topk=topk, ignore_index=self.pad_idx)
257
+ n_elements = input_.numel()
258
+ n_sequences = input_.size(0)
259
+ total_loss += n_elements * loss.item()
260
+ for key, value in accuracies.items():
261
+ total_accs[key] += n_elements * value
262
+ n_elements_total += n_elements
263
+ n_sequences_total += n_sequences
264
+
265
+ if n_elements_total == 0:
266
+ avg_loss = float('nan')
267
+ avg_accs = float('nan')
268
+ else:
269
+ avg_loss = total_loss / n_elements_total
270
+ avg_accs = {k: v/n_elements_total for k, v in total_accs.items()}
271
+ if args.exhaustive_eval:
272
+ print(f"Total number of sequences: {n_sequences_total}")
273
+
274
+ return avg_loss, avg_accs
275
+
276
+ def forward_pass(self, input_, condition, target):
277
+
278
+ input_ = input_.to(self.device)
279
+ condition = condition.to(self.device)
280
+
281
+ with torch.cuda.amp.autocast(enabled=self.amp):
282
+ if args.regression:
283
+ output = self.model(input_)
284
+ loss = self.l1_loss(output, condition)
285
+ else:
286
+ target = target.to(self.device)
287
+ output = self.model(input_, condition)
288
+ output_flat = output.reshape(-1, output.size(-1))
289
+ target = target.reshape(-1)
290
+ loss = self.ce_loss(output_flat, target)
291
+
292
+ return loss, output
293
+
294
+ def train(self):
295
+ # Turn on training mode which enables dropout.
296
+ self.model.train()
297
+
298
+ train_loss = 0
299
+ n_elements_total = 0
300
+ train_interval_start = time.time()
301
+
302
+ while True:
303
+ for input_, condition, target in self.train_loader:
304
+ self.model.train()
305
+ if input_ != []:
306
+
307
+ loss, _ = self.forward_pass(input_, condition, target)
308
+ loss_val = loss.item()
309
+ loss /= args.accumulate_step
310
+
311
+ n_elements = input_.numel()
312
+ if not math.isnan(loss_val):
313
+ train_loss += n_elements * loss_val
314
+ n_elements_total += n_elements
315
+ self.n_sequences_total += input_.size(0)
316
+
317
+ self.scaler.scale(loss).backward()
318
+
319
+ if self.train_step % args.accumulate_step == 0:
320
+ self.scaler.unscale_(self.optimizer)
321
+ if args.clip > 0:
322
+ torch.nn.utils.clip_grad_norm_(self.model.parameters(), args.clip)
323
+ self.scaler.step(self.optimizer)
324
+ self.scaler.update()
325
+ self.model.zero_grad()
326
+
327
+ if args.scheduler != "constant":
328
+ # linear warmup stage
329
+ if self.train_step <= args.warmup_step:
330
+ curr_lr = args.lr * self.train_step / args.warmup_step
331
+ self.optimizer.param_groups[0]['lr'] = curr_lr
332
+ else:
333
+ self.scheduler.step()
334
+
335
+ if (self.train_step % args.gen_step == 0) and self.train_step > 0 and not args.regression:
336
+ # Generate and save samples
337
+ with torch.no_grad():
338
+ self.model.eval()
339
+ if args.max_gen_input_len > 0:
340
+ max_input_len = args.max_gen_input_len
341
+ else:
342
+ max_input_len = args.tgt_len
343
+
344
+ primers = [["<START>"]]
345
+ # Use fixed set of conditions
346
+ if args.conditioning == "none":
347
+ discrete_conditions = None
348
+ continuous_conditions = None
349
+ primers = [["<START>"] for _ in range(4)]
350
+
351
+ elif args.conditioning == "discrete_token":
352
+ discrete_conditions = [
353
+ ["<V-2>", "<A-2>"],
354
+ ["<V-2>", "<A2>"],
355
+ ["<V2>", "<A-2>"],
356
+ ["<V2>", "<A2>"],
357
+ ]
358
+ continuous_conditions = None
359
+ elif args.conditioning in ["continuous_token", "continuous_concat"]:
360
+ discrete_conditions = None
361
+ continuous_conditions = [
362
+ [-0.8, -0.8],
363
+ [-0.8, 0.8],
364
+ [0.8, -0.8],
365
+ [0.8, 0.8]
366
+ ]
367
+
368
+ generate(self.model, self.maps, self.device, self.gen_dir, args.conditioning,
369
+ debug=args.debug, verbose=False, amp=self.amp, discrete_conditions=discrete_conditions,
370
+ continuous_conditions=continuous_conditions, min_n_instruments=1,
371
+ gen_len=args.gen_len, max_input_len=max_input_len,
372
+ step=str(self.train_step), primers=primers,
373
+ temperatures=[args.temp_note, args.temp_rest])
374
+
375
+ if (self.train_step % args.log_step == 0):
376
+ # Print log
377
+ if n_elements_total > 0:
378
+ cur_loss = train_loss / n_elements_total
379
+ elapsed_total = time.time() - self.init_time
380
+ elapsed_interval = time.time() - train_interval_start
381
+ hours_elapsed = elapsed_total / 3600.0
382
+ hours_total = self.init_hours + hours_elapsed
383
+ lr = self.optimizer.param_groups[0]['lr']
384
+ log_str = '| Epoch {:3d} step {:>8d} | {:>6d} sequences | {:>3.1f} h | lr {:.2e} ' \
385
+ '| ms/batch {:4.0f} | loss {:7.4f}'.format(
386
+ self.epoch, self.train_step, self.n_sequences_total, hours_total, lr,
387
+ elapsed_interval * 1000 / args.log_step, cur_loss)
388
+ self.logging(log_str)
389
+ self.csv_writer.update({"epoch": self.epoch, "step": self.train_step, "hour": hours_total,
390
+ "lr": lr, "trn_loss": cur_loss, "val_loss": np.nan,
391
+ "val_l1_v": np.nan, "val_l1_a": np.nan})
392
+ train_loss = 0
393
+ n_elements_total = 0
394
+ self.n_good_output, self.n_nan_output = 0, 0
395
+ train_interval_start = time.time()
396
+
397
+ if not args.debug:
398
+ # Save model
399
+ model_fp = os.path.join(args.work_dir, 'model.pt')
400
+ torch.save(self.model.state_dict(), model_fp)
401
+ optimizer_fp = os.path.join(args.work_dir, 'optimizer.pt')
402
+ torch.save(self.optimizer.state_dict(), optimizer_fp)
403
+ scaler_fp = os.path.join(args.work_dir, 'scaler.pt')
404
+ torch.save(self.scaler.state_dict(), scaler_fp)
405
+ torch.save({"step": self.train_step, "hour": hours_total, "epoch": self.epoch,
406
+ "sample": self.n_sequences_total},
407
+ os.path.join(args.work_dir, 'stats.pt'))
408
+
409
+ if (self.train_step % args.eval_step == 0):
410
+ # Evaluate model
411
+ val_loss, val_acc = self.evaluate()
412
+ elapsed_total = time.time() - self.init_time
413
+ hours_elapsed = elapsed_total / 3600.0
414
+ hours_total = self.init_hours + hours_elapsed
415
+ lr = self.optimizer.param_groups[0]['lr']
416
+ self.logging('-' * 120)
417
+ log_str = '| Eval {:3d} step {:>8d} | now: {} | {:>3.1f} h' \
418
+ '| valid loss {:7.4f} | ppl {:5.3f}'.format(
419
+ self.train_step // args.eval_step, self.train_step,
420
+ time.strftime("%d-%m - %H:%M"), hours_total,
421
+ val_loss, math.exp(val_loss))
422
+ if args.regression:
423
+ log_str += " | l1_v: {:5.3f} | l1_a: {:5.3f}".format(
424
+ val_acc["l1_v"], val_acc["l1_a"])
425
+
426
+ self.csv_writer.update({"epoch": self.epoch, "step": self.train_step, "hour": hours_total,
427
+ "lr": lr, "trn_loss": np.nan, "val_loss": val_loss})
428
+
429
+ self.logging(log_str)
430
+ self.logging('-' * 120)
431
+
432
+ # dev-performance based learning rate annealing
433
+ if args.scheduler == 'dev_perf':
434
+ self.scheduler.step(val_loss)
435
+
436
+ if self.train_step >= args.max_step:
437
+ break
438
+ self.train_step += 1
439
+ self.epoch += 1
440
+ if self.train_step >= args.max_step:
441
+ break
442
+
443
+ def run(self):
444
+
445
+ # Loop over epochs.
446
+ # At any point you can hit Ctrl + C to break out of training early.
447
+ try:
448
+ if args.exhaustive_eval or args.regression_dir is not None:
449
+ self.logging("Exhaustive evaluation")
450
+ if args.regression_dir is not None:
451
+ self.logging(f"For regression on folder {args.regression_dir}")
452
+ loss, accuracies = self.evaluate()
453
+ perplexity = math.exp(loss)
454
+ elapsed_total = time.time() - self.init_time
455
+ hours_elapsed = elapsed_total / 3600.0
456
+ msg = f"Loss: {loss:7.4f}, ppl: {perplexity:5.2f}"
457
+ for k, v in accuracies.items():
458
+ if args.regression:
459
+ msg += f", {k}: {v:7.4f}"
460
+ else:
461
+ msg += f", top{k:1.0f}: {v:7.4f}"
462
+ msg += f", hours: {hours_elapsed:3.1f}"
463
+ self.logging(msg)
464
+ else:
465
+ while True:
466
+ self.train()
467
+ if self.train_step >= args.max_step:
468
+ self.logging('-' * 120)
469
+ self.logging('End of training')
470
+ break
471
+ except KeyboardInterrupt:
472
+ self.logging('-' * 120)
473
+ self.logging('Exiting from training early')
474
+
475
+ if __name__ == "__main__":
476
+ runner = Runner()
477
+ runner.run()
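
Runner.train() above interleaves mixed-precision loss scaling, gradient accumulation, gradient clipping, and a linear learning-rate warmup inside a single update step. The sketch below is a minimal, self-contained illustration of that update pattern; the toy model, synthetic batches, and hyperparameter values are stand-ins for illustration only, not the project's actual configuration.

```python
import torch
from torch import nn, optim

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Toy stand-ins (assumptions for illustration only).
model = nn.Linear(16, 4).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)
criterion = nn.CrossEntropyLoss()
accumulate_step, warmup_step, base_lr, clip = 4, 100, 1e-4, 1.0

step = 1
for _ in range(8):  # stand-in for iterating over a DataLoader
    x = torch.randn(32, 16, device=device)
    y = torch.randint(0, 4, (32,), device=device)

    # Forward/backward under autocast; divide the loss so accumulated gradients average out.
    with torch.cuda.amp.autocast(enabled=use_cuda):
        loss = criterion(model(x), y) / accumulate_step
    scaler.scale(loss).backward()

    if step % accumulate_step == 0:
        scaler.unscale_(optimizer)  # unscale first so clipping sees true gradient magnitudes
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        scaler.step(optimizer)
        scaler.update()
        model.zero_grad()

    # Linear warmup: ramp the learning rate over the first warmup_step updates.
    if step <= warmup_step:
        optimizer.param_groups[0]["lr"] = base_lr * step / warmup_step

    step += 1
```

The ordering matters: `scaler.unscale_(optimizer)` must precede `clip_grad_norm_` so clipping operates on unscaled gradients, which is the same call sequence used in Runner.train().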
midi_emotion/src/utils.py ADDED
@@ -0,0 +1,148 @@
1
+ import torch
2
+ import csv
3
+ import shutil
4
+ import functools
5
+ import os
6
+
7
+
8
+ def split_list(alist, n_parts):
9
+ if n_parts == 0:
10
+ n_parts = 1
11
+ length = len(alist)
12
+ return [ alist[i*length // n_parts: (i+1)*length // n_parts]
13
+ for i in range(n_parts)]
14
+
15
+ def accuracy(output: torch.Tensor, target: torch.Tensor, topk=(1, 5), ignore_index=None):
16
+ """
17
+ Computes the accuracy over the k top predictions for the specified values of k
18
+ In top-5 accuracy you give yourself credit for having the right answer
19
+ if the right answer appears in your top five guesses.
20
+
21
+ ref:
22
+ - https://discuss.pytorch.org/t/top-k-error-calculation/48815/3
23
+
24
+ - https://pytorch.org/docs/stable/generated/torch.topk.html
25
+ - https://discuss.pytorch.org/t/imagenet-example-accuracy-calculation/7840
26
+ - https://gist.github.com/weiaicunzai/2a5ae6eac6712c70bde0630f3e76b77b
27
+ - https://discuss.pytorch.org/t/top-k-error-calculation/48815/2
28
+ - https://stackoverflow.com/questions/59474987/how-to-get-top-k-accuracy-in-semantic-segmentation-using-pytorch
29
+
30
+ :param output: output is the prediction of the model e.g. scores, logits, raw y_pred before normalization or getting classes
31
+ :param target: target is the truth
32
+ :param topk: tuple of topk's to compute e.g. (1, 2, 5) computes top 1, top 2 and top 5.
33
+ e.g. in top 2 you get a +1 if the model's top 2 predictions include the right label.
34
+ So if your model predicts cat, dog (0, 1) and the true label was bird (3) you get zero
35
+ but if it were either cat or dog you'd accumulate +1 for that example.
36
+ :return: dict mapping each k in topk to its top-k accuracy, e.g. {1: acc@1, 5: acc@5}
37
+ """
38
+ with torch.no_grad():
39
+ # ---- get the topk most likely labels according to your model
40
+ # get the largest k \in [n_classes] (i.e. the number of most likely probabilities we will use)
41
+
42
+ maxk = max(topk) # max number of labels we will consider in the right choices for our model
43
+
44
+ output = output.reshape(-1, output.size(-1))
45
+ target = target.reshape(-1)
46
+
47
+ valid_inds = torch.where(target != ignore_index)[0]
48
+ target = target[valid_inds]
49
+ output = output[valid_inds, :]
50
+
51
+ sample_size = target.size(0)
52
+
53
+ # get top maxk indices that correspond to the most likely probability scores
54
+ # (note _ means we don't care about the actual top maxk scores, just their corresponding indices/labels)
55
+ _, y_pred = output.topk(k=maxk, dim=-1) # _, [B, n_classes] -> [B, maxk]
56
+ y_pred = y_pred.t() # [B, maxk] -> [maxk, B] Expects input to be <= 2-D tensor and transposes dimensions 0 and 1.
57
+
58
+ # - give credit for each example if the model's prediction is in the top maxk values (main crux of code)
59
+ # for any example, the model will get credit if its prediction matches the ground truth
60
+ # for each example we compare if the model's best prediction matches the truth. If yes we get an entry of 1.
61
+ # if the k'th top answer of the model matches the truth we get 1.
62
+ # Note: for any example in the batch we can only ever get 1 match (so we never overestimate accuracy)
63
+ target_reshaped = target.view(1, -1).expand_as(y_pred) # [B] -> [B, 1] -> [maxk, B]
64
+ # compare every topk's model prediction with the ground truth & give credit if any matches the ground truth
65
+ correct = (y_pred == target_reshaped) # [maxk, B] where for each example we know which topk prediction matched the truth
66
+ # original: correct = pred.eq(target.view(1, -1).expand_as(pred))
67
+
68
+ # -- get topk accuracy
69
+ list_topk_accs = {}
70
+ for k in topk:
71
+ # get tensor of which topk answer was right
72
+ ind_which_topk_matched_truth = correct[:k] # [maxk, B] -> [k, B]
73
+ # flatten it to help compute if we got it correct for each example in batch
74
+ flattened_indicator_which_topk_matched_truth = ind_which_topk_matched_truth.reshape(-1).float() # [k, B] -> [kB]
75
+ # get if we got it right for any of our top k prediction for each example in batch
76
+ tot_correct_topk = flattened_indicator_which_topk_matched_truth.float().sum(dim=0, keepdim=True) # [kB] -> [1]
77
+ # compute topk accuracy - the model's ability to get it right within its top k guesses/preds
78
+ topk_acc = tot_correct_topk / sample_size # topk accuracy for entire batch
79
+ list_topk_accs[k] = topk_acc.item()
80
+ return list_topk_accs # dict of topk accuracies for the entire batch, keyed by k
81
+
82
+ class CsvWriter:
83
+ # Save performance as a csv file
84
+ def __init__(self, out_path, fieldnames, in_path=None, debug=False):
85
+
86
+ self.out_path = out_path
87
+ self.fieldnames = fieldnames
88
+ self.debug = debug
89
+
90
+ if not debug:
91
+ if in_path is None:
92
+ with open(out_path, "w") as f:
93
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
94
+ writer.writeheader()
95
+ else:
96
+ try:
97
+ shutil.copy(in_path, out_path)
98
+ except Exception:
99
+ with open(out_path, "w") as f:
100
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
101
+ writer.writeheader()
102
+
103
+
104
+ def update(self, performance_dict):
105
+ if not self.debug:
106
+ with open(self.out_path, "a") as f:
107
+ writer = csv.DictWriter(f, fieldnames=self.fieldnames)
108
+ writer.writerow(performance_dict)
109
+ a = 0
110
+
111
+ def generate_square_subsequent_mask(sz):
112
+ # Triangular mask to avoid looking at future tokens
113
+ mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
114
+ mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
115
+ return mask
116
+
117
+
118
+ def logging(s, log_path, print_=True, log_=True):
119
+ # Prints log
120
+ if print_:
121
+ print(s)
122
+ if log_:
123
+ with open(log_path, 'a+') as f_log:
124
+ f_log.write(s + '\n')
125
+
126
+ def get_logger(log_path, **kwargs):
127
+ return functools.partial(logging, log_path=log_path, **kwargs)
128
+
129
+ def create_exp_dir(dir_path, debug=False):
130
+ # Create experiment directory
131
+ if debug:
132
+ print('Debug Mode : no experiment dir created')
133
+ return functools.partial(logging, log_path=None, log_=False)
134
+ else:
135
+ if not os.path.exists(dir_path):
136
+ os.makedirs(dir_path)
137
+
138
+ print('Experiment dir : {}'.format(dir_path))
139
+
140
+ return get_logger(log_path=os.path.join(dir_path, 'log.txt'))
141
+
142
+
143
+ def get_n_instruments(symbols):
144
+ # Find number of instruments
145
+ symbols_split = [s.split("_") for s in symbols]
146
+ symbols_split = [s[1] for s in symbols_split if len(s) == 3]
147
+ events = list(set(symbols_split))
148
+ return len(events)
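
For reference, a short sketch of how the two most reusable helpers in utils.py might be exercised. The import path, tensor shapes, and the -100 padding index are assumptions for illustration.

```python
import torch
from midi_emotion.src.utils import accuracy, generate_square_subsequent_mask

# Fake logits for a batch of 3 sequences, 7 time steps, 10 classes (illustrative shapes).
logits = torch.randn(3, 7, 10)
targets = torch.randint(0, 10, (3, 7))
targets[:, -2:] = -100  # pretend the last two positions of each sequence are padding

accs = accuracy(logits, targets, topk=(1, 5), ignore_index=-100)
print(accs)  # e.g. {1: 0.13, 5: 0.53} -- padded positions are excluded

# Causal mask for a transformer decoder: position i may only attend to positions <= i.
mask = generate_square_subsequent_mask(5)
print(mask)  # 0.0 on and below the diagonal, -inf above it
```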
packages.txt ADDED
@@ -0,0 +1,2 @@
1
+ fluidsynth
2
+ fluid-soundfont-gm
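
packages.txt lists the system (apt) packages a Hugging Face Space installs at build time: fluidsynth provides the synthesizer and fluid-soundfont-gm a General MIDI soundfont, which together allow generated .mid files to be rendered to audio. A minimal sketch of that rendering step, assuming the usual Debian soundfont path and hypothetical file names:

```python
from midi2audio import FluidSynth

# Assumed locations for illustration; fluid-soundfont-gm normally installs its
# soundfont at this path on Debian/Ubuntu-based images.
SOUNDFONT = "/usr/share/sounds/sf2/FluidR3_GM.sf2"

fs = FluidSynth(sound_font=SOUNDFONT, sample_rate=44100)
fs.midi_to_audio("output/generated.mid", "output/generated.wav")
```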
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ gradio>=4.0.0
2
+ torch>=2.0.0
3
+ numpy>=1.24.0
4
+ matplotlib>=3.7.0
5
+ Pillow>=10.0.0
6
+ huggingface-hub>=0.19.0
7
+ pretty-midi>=0.2.10
8
+ librosa>=0.10.0
9
+ soundfile>=0.12.0