Kevin King committed
Commit 555549c · 1 Parent(s): 4b260d9

Refactor Streamlit app to support video uploads for emotion analysis and enhance audio processing capabilities

Files changed (2)
  1. requirements.txt +18 -4
  2. src/streamlit_app.py +145 -31
requirements.txt CHANGED
@@ -1,12 +1,26 @@
  --extra-index-url https://download.pytorch.org/whl/cpu

- # Core app library
+ # Pin the main UI components to recent, stable versions
  streamlit==1.35.0

- # AI libraries for this test
+ # Library for video/audio file handling
+ moviepy==1.0.3
+
+ # Pin ML/AI libraries to modern, known-good versions
+ transformers==4.40.1
+ deepface==0.0.94
  openai-whisper==20231117
+
+ # Pin frameworks to ensure CPU versions and prevent build timeouts
+ tensorflow-cpu==2.16.1
+ tf-keras==2.16.0
  torch==2.7.0
  torchaudio==2.7.0

- # Utility for handling audio files
- soundfile
+ # Pin data/audio libraries for stability
+ pandas==2.2.2
+ numpy==1.26.4
+ soundfile==0.12.1
+ librosa==0.10.1
+ scipy==1.13.0
+ Pillow==10.3.0
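
Since several of these pins exist specifically to force CPU-only builds and avoid timeouts, a quick way to confirm that a freshly built environment actually resolved to these versions is to query the installed distributions. A minimal sketch, not part of this commit, assuming Python 3.8+ so importlib.metadata is available:

# Sanity-check the pinned environment: print the installed version of each
# distribution named in requirements.txt. Assumes Python 3.8+.
import importlib.metadata as md

PINNED = [
    "streamlit", "moviepy", "transformers", "deepface", "openai-whisper",
    "tensorflow-cpu", "tf-keras", "torch", "torchaudio",
    "pandas", "numpy", "soundfile", "librosa", "scipy", "Pillow",
]

for dist in PINNED:
    try:
        print(f"{dist}=={md.version(dist)}")
    except md.PackageNotFoundError:
        print(f"{dist}: not installed")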
src/streamlit_app.py CHANGED
@@ -1,60 +1,174 @@
  import os
  import streamlit as st
+ import numpy as np
+ import torch
  import whisper
- import tempfile
+ from transformers import pipeline, AutoModelForAudioClassification, AutoFeatureExtractor
+ from deepface import DeepFace
  import logging
+ import soundfile as sf
+ import tempfile
+ from PIL import Image
+ import cv2
+ from moviepy.editor import VideoFileClip

- # --- Set cache directory for models ---
- # Point the cache to a writable directory in the cloud environment
+ # Set home directories for model caching to the writable /tmp folder
+ os.environ['DEEPFACE_HOME'] = '/tmp/.deepface'
  os.environ['HF_HOME'] = '/tmp/huggingface'

  # --- Page Configuration ---
  st.set_page_config(
-     page_title="Audio Transcription Test",
-     page_icon="🎤",
-     layout="centered"
+     page_title="AffectLink Demo",
+     page_icon="😊",
+     layout="wide"
  )

- st.title("Step 2: Audio Transcription (Whisper) Test")
- st.write("Upload a short audio file (.wav, .mp3, .m4a) to test the Whisper model.")
+ st.title("AffectLink: Post-Hoc Emotion Analysis")
+ st.write("Upload a short video clip (under 30 seconds) to analyze facial expressions, speech-to-text, and the emotional tone of the audio.")

  # --- Logger Configuration ---
  logging.basicConfig(level=logging.INFO)
+ logging.getLogger('deepface').setLevel(logging.ERROR)
  logging.getLogger('huggingface_hub').setLevel(logging.WARNING)
+ logging.getLogger('moviepy').setLevel(logging.ERROR)
+
+ # --- Emotion Mappings ---
+ UNIFIED_EMOTIONS = ['neutral', 'happy', 'sad', 'angry']
+ TEXT_TO_UNIFIED = {
+     'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry',
+     'fear': None, 'surprise': None, 'disgust': None
+ }
+ SER_TO_UNIFIED = {
+     'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'
+ }
+ AUDIO_SAMPLE_RATE = 16000

  # --- Model Loading ---
  @st.cache_resource
- def load_whisper_model():
-     with st.spinner("Loading Whisper model... This may take a moment."):
-         # Load the base model and specify a writable cache directory
-         model = whisper.load_model("base", download_root="/tmp/whisper_cache")
-     return model
-
- whisper_model = load_whisper_model()
+ def load_models():
+     with st.spinner("Loading AI models, this may take a moment..."):
+         whisper_model = whisper.load_model("base", download_root="/tmp/whisper_cache")
+         text_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)
+         ser_model_name = "superb/hubert-large-superb-er"
+         ser_feature_extractor = AutoFeatureExtractor.from_pretrained(ser_model_name)
+         ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)
+         # DeepFace loads its own models on first use, no need to preload here.
+     return whisper_model, text_classifier, ser_model, ser_feature_extractor

+ whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()

  # --- UI and Processing Logic ---
- uploaded_file = st.file_uploader("Choose an audio file...", type=["wav", "mp3", "m4a"])
+ uploaded_file = st.file_uploader("Choose a video file...", type=["mp4", "mov", "avi", "mkv"])

  if uploaded_file is not None:
      # Save the uploaded file to a temporary location
-     with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tfile:
+     with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tfile:
          tfile.write(uploaded_file.read())
-         temp_audio_path = tfile.name
+         temp_video_path = tfile.name
+
+     st.video(temp_video_path)
+
+     if st.button("Analyze Video"):
+         facial_analysis_results = []
+         audio_analysis_results = {}
+
+         # --- Video Processing for Facial Emotion ---
+         with st.spinner("Analyzing video for facial expressions... (1 frame per second)"):
+             try:
+                 cap = cv2.VideoCapture(temp_video_path)
+                 fps = cap.get(cv2.CAP_PROP_FPS)
+                 if fps == 0:
+                     fps = 30 # Default to 30 fps if not available
+                 frame_count = 0
+                 while cap.isOpened():
+                     ret, frame = cap.read()
+                     if not ret:
+                         break
+
+                     if frame_count % int(fps) == 0:
+                         timestamp = frame_count / fps
+                         analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
+                         if isinstance(analysis, list) and len(analysis) > 0:
+                             dominant_emotion = analysis[0]['dominant_emotion']
+                             facial_analysis_results.append((timestamp, dominant_emotion.capitalize()))
+                     frame_count += 1
+                 cap.release()
+             except Exception as e:
+                 st.error(f"An error occurred during facial analysis: {e}")
+
+         # --- Audio Extraction and Processing ---
+         with st.spinner("Extracting and analyzing audio..."):
+             temp_audio_path = None
+             video_clip = None
+             try:
+                 video_clip = VideoFileClip(temp_video_path)
+                 if video_clip.audio is not None:
+                     with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as taudio:
+                         video_clip.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
+                         temp_audio_path = taudio.name
+
+                     # 1. Speech-to-Text (Whisper)
+                     result = whisper_model.transcribe(temp_audio_path, fp16=False)
+                     transcribed_text = result['text'] if result['text'] else "No speech detected."
+                     audio_analysis_results['Transcription'] = transcribed_text
+
+                     # 2. Text-based Emotion
+                     if "No speech detected" not in transcribed_text:
+                         text_emotions = text_classifier(transcribed_text)[0]
+                         unified_text_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
+                         for emo in text_emotions:
+                             unified_emo = TEXT_TO_UNIFIED.get(emo['label'])
+                             if unified_emo:
+                                 unified_text_scores[unified_emo] += emo['score']
+                         dominant_text_emotion = max(unified_text_scores, key=unified_text_scores.get)
+                         audio_analysis_results['Text Emotion'] = dominant_text_emotion.capitalize()
+
+                     # 3. Speech Emotion Recognition (SER)
+                     audio_array, _ = sf.read(temp_audio_path)
+                     inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
+                     with torch.no_grad():
+                         logits = ser_model(**inputs).logits
+                     scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
+                     unified_ser_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
+                     for i, score in enumerate(scores):
+                         raw_emo = ser_model.config.id2label[i]
+                         unified_emo = SER_TO_UNIFIED.get(raw_emo)
+                         if unified_emo:
+                             unified_ser_scores[unified_emo] += score.item()
+                     dominant_ser_emotion = max(unified_ser_scores, key=unified_ser_scores.get)
+                     audio_analysis_results['Speech Emotion'] = dominant_ser_emotion.capitalize()
+                 else:
+                     audio_analysis_results['Transcription'] = "No audio track found in the video."
+
+             except Exception as e:
+                 st.error(f"An error occurred during audio analysis: {e}")
+             finally:
+                 if video_clip:
+                     video_clip.close()
+                 if temp_audio_path and os.path.exists(temp_audio_path):
+                     os.unlink(temp_audio_path)

-     st.audio(temp_audio_path)
+         # --- Display Results ---
+         st.header("Analysis Results")
+         col1, col2 = st.columns(2)

-     with st.spinner("Transcribing audio..."):
-         try:
-             # Transcribe the audio file using Whisper
-             result = whisper_model.transcribe(temp_audio_path, fp16=False)
-             transcribed_text = result.get("text", "No text transcribed.")
+         with col1:
+             st.subheader("Audio Analysis")
+             if audio_analysis_results:
+                 st.write(f"**Transcription:** \"{audio_analysis_results.get('Transcription', 'N/A')}\"")
+                 st.metric("Emotion from Text", audio_analysis_results.get('Text Emotion', 'N/A'))
+                 st.metric("Emotion from Speech", audio_analysis_results.get('Speech Emotion', 'N/A'))
+             else:
+                 st.write("No audio results to display.")

-             st.success("Transcription Complete!")
-             st.markdown(f"> {transcribed_text}")
+         with col2:
+             st.subheader("Facial Expression Timeline")
+             if facial_analysis_results:
+                 for timestamp, emotion in facial_analysis_results:
+                     st.write(f"**Time {int(timestamp // 60):02d}:{int(timestamp % 60):02d}:** {emotion}")
+             else:
+                 st.write("No faces detected or video processing failed.")

-         except Exception as e:
-             st.error(f"An error occurred during transcription: {e}")
-         finally:
-             # Clean up the temporary file
-             os.unlink(temp_audio_path)
+         # Clean up temp video file after analysis is done
+         if os.path.exists(temp_video_path):
+             os.unlink(temp_video_path)
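
The text classifier and the speech-emotion model emit different label sets (the seven text-emotion labels of j-hartmann/emotion-english-distilroberta-base versus the neu/hap/sad/ang labels of superb/hubert-large-superb-er), and the app folds both into the shared UNIFIED_EMOTIONS vocabulary before reporting a dominant emotion. A standalone sketch of that aggregation step, with invented example scores, to make the mapping logic easier to follow in isolation:

# Fold model-specific emotion scores into the unified four-emotion set used
# by the app, then pick the dominant unified emotion. The example scores
# below are made up for illustration only.
UNIFIED_EMOTIONS = ['neutral', 'happy', 'sad', 'angry']
TEXT_TO_UNIFIED = {
    'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry',
    'fear': None, 'surprise': None, 'disgust': None
}

def unify_scores(raw_scores, mapping):
    """Aggregate per-label scores into the unified emotion vocabulary."""
    unified = {e: 0.0 for e in UNIFIED_EMOTIONS}
    for label, score in raw_scores.items():
        target = mapping.get(label)
        if target:  # labels mapped to None (fear/surprise/disgust) are dropped
            unified[target] += score
    return unified

example_text_scores = {'joy': 0.55, 'neutral': 0.20, 'surprise': 0.15, 'anger': 0.10}
unified = unify_scores(example_text_scores, TEXT_TO_UNIFIED)
print(max(unified, key=unified.get))  # -> 'happy'

Because unmapped labels are dropped rather than redistributed, the dominant unified emotion can differ from the classifier's own top label when fear, surprise, or disgust score highly.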