import gradio as gr
import torch
import torch.nn as nn
import librosa
import numpy as np
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor

# Define emotions
emotion_list = ['anger', 'disgust', 'fear', 'happy', 'neutral', 'sad']

# Define the model: a pretrained wav2vec2 backbone followed by a small
# Transformer encoder and a linear classification head
class EmotionClassifier(nn.Module):
    def __init__(self, num_classes):
        super(EmotionClassifier, self).__init__()
        self.wav2vec2 = Wav2Vec2Model.from_pretrained('facebook/wav2vec2-base')
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.wav2vec2.config.hidden_size, nhead=8, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.classifier = nn.Linear(self.wav2vec2.config.hidden_size, num_classes)

    def forward(self, input_values):
        outputs = self.wav2vec2(input_values).last_hidden_state
        encoded = self.transformer_encoder(outputs)
        # Classify from the first time step's representation
        logits = self.classifier(encoded[:, 0, :])
        return logits

# Load the trained model
model_path = "best_model_state_dict.pth"
num_classes = len(emotion_list)
model = EmotionClassifier(num_classes)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()

# Define the feature extractor; 'facebook/wav2vec2-base' ships no tokenizer,
# so Wav2Vec2FeatureExtractor is used rather than Wav2Vec2Processor
processor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base')

def predict_emotion(audio):
    # Load and resample the audio to 16 kHz, the rate wav2vec2 expects
    audio, sr = librosa.load(audio, sr=16000)
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True).input_values
    if inputs.ndimension() == 2:  # Ensure the input shape is (time,) before re-batching
        inputs = inputs.squeeze(0)
    with torch.no_grad():
        logits = model(inputs.unsqueeze(0)).squeeze()

    # Convert logits to a per-emotion probability distribution
    probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()
    predictions = {emotion: float(prob) for emotion, prob in zip(emotion_list, probabilities)}
    return predictions

# Create Gradio interface
interface = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Label(num_top_classes=3),
    title="Speech Emotion Recognition",
    description="Upload an audio file (.wav or .mp3) or record your voice to predict the emotion."
)

# Launch the app
if __name__ == "__main__":
    interface.launch()