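# app.py — Streamlit demo: classify the emotion of an uploaded audio clip
# with the pretrained superb/wav2vec2-base-superb-er Wav2Vec2 checkpoint.
# Note: pydub requires ffmpeg on the system PATH to decode MP3 uploads.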
import streamlit as st
import torch
import torchaudio
import tempfile
from pydub import AudioSegment
from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification

@st.cache_resource
def load_model():
    """Load the feature extractor and model once; st.cache_resource reuses them across reruns."""
    extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
    model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
    model.eval()
    return extractor, model

def convert_to_wav(uploaded_file):
    """Convert the upload to 16 kHz mono WAV, the input format the model expects."""
    uploaded_file.seek(0)  # rewind in case the buffer was already read (e.g. by st.audio)
    audio = AudioSegment.from_file(uploaded_file)
    audio = audio.set_frame_rate(16000).set_channels(1)
    # NamedTemporaryFile avoids the race condition in the deprecated tempfile.mktemp
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        temp_path = tmp.name
    audio.export(temp_path, format="wav")
    return temp_path

def get_emotion_label(logits, id2label):
    """Map logits to the label names stored in the model config, rather than a
    hardcoded list whose order may not match the checkpoint's output indices."""
    scores = torch.softmax(logits, dim=0).tolist()
    labels = [id2label[i] for i in range(len(scores))]
    top_idx = scores.index(max(scores))
    return labels[top_idx], labels, scores

def analyze_emotion(audio_path):
    extractor, model = load_model()
    waveform, sr = torchaudio.load(audio_path)
    # 💡 Trim audio to 30 seconds max to avoid slowdowns
    max_duration_sec = 30
    max_samples = sr * max_duration_sec
    if waveform.size(1) > max_samples:
        waveform = waveform[:, :max_samples]
    duration_sec = waveform.size(1) / sr
    # Run the model without gradient tracking (inference only)
    inputs = extractor(waveform[0].numpy(), sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits[0]
    # The checkpoint's id2label mapping defines the class order
    emotion, labels, scores = get_emotion_label(logits, model.config.id2label)
    return emotion.capitalize(), labels, scores, duration_sec

# Streamlit UI
st.set_page_config(page_title="🎧 Audio Emotion Detector", layout="centered")
st.title("🎧 Audio Emotion Analysis (Wav2Vec2)")

uploaded_file = st.file_uploader("Upload an MP3 or WAV audio file", type=["mp3", "wav"])

if uploaded_file:
    st.audio(uploaded_file, format=uploaded_file.type)
    with st.spinner("Analyzing emotion..."):
        wav_path = convert_to_wav(uploaded_file)
        emotion, labels, scores, duration_sec = analyze_emotion(wav_path)
    st.subheader("⏱ Audio Info:")
    st.write(f"Duration analyzed: **{duration_sec:.2f} seconds**")
    st.subheader("🧠 Detected Emotion:")
    st.markdown(f"**{emotion}**")
    st.subheader("🎯 Confidence Scores:")
    # Report every class with its softmax probability, using the model's own labels
    for label, score in zip(labels, scores):
        st.write(f"- **{label.capitalize()}**: {score * 100:.2f}%")