chinmayc3 committed
Commit 25e924b · 1 parent: 48732e0

Added ability to upload audio and select model options

Files changed (5):
  1. app.py +132 -6
  2. enums.py +3 -0
  3. logger.py +3 -0
  4. requirements.txt +2 -1
  5. utils.py +3 -0
app.py CHANGED
@@ -15,15 +15,18 @@ import requests
 import streamlit as st
 from audio_recorder_streamlit import audio_recorder
 import torchaudio
+from dotenv import load_dotenv
 
 from logger import logger
 from utils import fs
 from enums import SAVE_PATH, ELO_JSON_PATH, ELO_CSV_PATH, EMAIL_PATH, TEMP_DIR, NEW_TASK_URL,ARENA_PATH
 
+load_dotenv()
 result_queue = Queue()
 random_df = pd.read_csv("random_audios.csv")
 random_paths = random_df["path"].tolist()
 
+
 def result_writer_thread():
     result_writer = ResultWriter(SAVE_PATH)
     while True:
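This commit threads `python-dotenv` through every module that reads configuration (`app.py` here, plus `enums.py`, `logger.py`, and `utils.py` below), so a local `.env` file can supply the same variables the deployment reads from the environment. A minimal sketch of the pattern; the variable name and value are placeholders:

```python
# .env (placeholder contents):
#   AWS_BUCKET_NAME=my-test-bucket
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory into os.environ (existing env vars win)
print(os.getenv("AWS_BUCKET_NAME"))  # -> "my-test-bucket"
```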
 
@@ -140,9 +143,9 @@ def send_task(payload):
         "Authorization": f"Bearer {os.getenv('CREATE_TASK_API_KEY')}"
     }
     if payload["task"] in ["fetch_audio","write_result"]:
-        response = requests.post(NEW_TASK_URL,json=payload,headers=header,timeout=300)
+        response = requests.post(NEW_TASK_URL,json=payload,headers=header,timeout=600)
     else:
-        response = requests.post(NEW_TASK_URL,json=payload,headers=header,timeout=300,stream=True)
+        response = requests.post(NEW_TASK_URL,json=payload,headers=header,timeout=600,stream=True)
     try:
         response = response.json()
     except Exception as e:
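Both branches double the request timeout from 300 s to 600 s. For reference, a single `timeout` value in `requests` bounds the connection and each read between bytes, not the total transfer, and `stream=True` defers downloading the body until it is consumed. A small sketch against a placeholder URL:

```python
import requests

# timeout=600 applies to connecting and to each read, not to total duration
resp = requests.post("https://example.com/task", json={"task": "demo"},
                     timeout=600, stream=True)
for chunk in resp.iter_content(chunk_size=8192):  # with stream=True the body arrives lazily
    pass
```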
 
@@ -172,6 +175,35 @@ def encode_audio_array(audio_array):
 
     return base64_string
 
+def validate_uploaded_audio(uploaded_file):
+    """
+    Validate uploaded audio file format and duration
+    Returns: (is_valid, error_message, audio_data, sample_rate)
+    """
+    allowed_extensions = ['.wav', '.mp3', '.flac']
+    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
+
+    if file_extension not in allowed_extensions:
+        return False, f"Unsupported file format. Please upload {', '.join(allowed_extensions)} files only.", None, None
+
+    try:
+        audio_bytes = uploaded_file.read()
+
+        with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as tmp_file:
+            tmp_file.write(audio_bytes)
+            temp_path = tmp_file.name
+
+            audio_data, sample_rate = librosa.load(temp_path, sr=None)
+            duration = len(audio_data) / sample_rate
+
+            if duration > 30:
+                return False, f"Audio duration ({duration:.1f}s) exceeds the 30-second limit. Please upload shorter audio.", None, None
+
+            return True, None, audio_data, sample_rate
+
+    except Exception as e:
+        return False, f"Error processing audio file: {str(e)}", None, None
+
 def call_function(model_name):
     if st.session_state.current_audio_type == "recorded":
         y,_ = librosa.load(st.session_state.audio_path,sr=22050,mono=True)
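`validate_uploaded_audio` only touches `uploaded_file.name` and `uploaded_file.read()`, so it can be exercised without Streamlit. A sketch with a stand-in upload object, assuming the function is importable and `soundfile` is installed (librosa depends on it):

```python
import io
import numpy as np
import soundfile as sf

class FakeUpload(io.BytesIO):
    """Stand-in for Streamlit's UploadedFile: BytesIO plus a .name attribute."""
    name = "sample.wav"

buf = io.BytesIO()
sf.write(buf, np.zeros(22050, dtype="float32"), 22050, format="WAV")  # 1 s of silence
ok, err, data, sr = validate_uploaded_audio(FakeUpload(buf.getvalue()))
print(ok, err, sr)  # expected: True None 22050 (loaded with sr=None, so the rate is preserved)
```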
 
@@ -183,6 +215,20 @@ def call_function(model_name):
             "model_name":model_name,
             "audio_b64":True
         }}
+    elif st.session_state.current_audio_type == "uploaded":
+        # For uploaded files, use the processed audio data
+        array = st.session_state.audio['data']
+        sr = st.session_state.audio['sample_rate']
+        if sr != 22050:
+            array = librosa.resample(y=array, orig_sr=sr, target_sr=22050)
+        encoded_array = encode_audio_array(array)
+        payload = {
+            "task":"transcribe_with_fastapi",
+            "payload":{
+                "file_path":encoded_array,
+                "model_name":model_name,
+                "audio_b64":True
+            }}
     else:
         sr = st.session_state.audio['sample_rate']
         array = st.session_state.audio['data']
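The uploaded branch resamples to the 22,050 Hz the backend expects, matching the `sr=22050` used for recorded audio. `librosa.resample` in isolation:

```python
import numpy as np
import librosa

sr_in = 44100
t = np.arange(sr_in) / sr_in
y = np.sin(2 * np.pi * 440 * t).astype("float32")  # one second of a 440 Hz tone
y_22k = librosa.resample(y=y, orig_sr=sr_in, target_sr=22050)
print(len(y), "->", len(y_22k))  # 44100 -> 22050
```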
 
@@ -202,7 +248,20 @@ def call_function(model_name):
 
 def transcribe_audio():
     models_list = ["Ori Apex", "Ori Apex XT", "deepgram", "Ori Swift", "Ori Prime","azure"]
-    model1_name, model2_name = random.sample(models_list, 2)
+
+    if st.session_state.model_1_selection == "Random":
+        model1_name = random.choice(models_list)
+    else:
+        model1_name = st.session_state.model_1_selection
+
+    if st.session_state.model_2_selection == "Random":
+        if st.session_state.model_1_selection == "Random":
+            available_models = [m for m in models_list if m != model1_name]
+            model2_name = random.choice(available_models)
+        else:
+            model2_name = random.choice(models_list)
+    else:
+        model2_name = st.session_state.model_2_selection
 
     st.session_state.option_1_model_name = model1_name
     st.session_state.option_2_model_name = model2_name
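One consequence of this logic worth noting: two distinct models are only guaranteed when both dropdowns are on Random; if Model 1 is fixed and Model 2 is left on Random, the draw can repeat Model 1. A pure-function sketch of the same selection rules (a hypothetical helper, not part of the commit):

```python
import random

def pick_models(sel1, sel2, models):
    """Mirror of the rules above: 'Random' draws from the list, anything else is fixed."""
    m1 = random.choice(models) if sel1 == "Random" else sel1
    if sel2 == "Random":
        # deduplicate only when both selections were Random
        pool = [m for m in models if m != m1] if sel1 == "Random" else models
        m2 = random.choice(pool)
    else:
        m2 = sel2
    return m1, m2

print(pick_models("Random", "Random", ["a", "b", "c"]))  # always two distinct models
```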
 
@@ -345,7 +404,7 @@ writer_thread = threading.Thread(target=result_writer_thread)
 writer_thread.start()
 
 def main():
-
+    st.set_page_config(layout="wide",initial_sidebar_state="collapsed")
     st.title("⚔️ Ori Speech-To-Text Arena ⚔️")
 
     if "has_audio" not in st.session_state:
 
@@ -374,7 +433,12 @@ def main():
         st.session_state.recording = True
     if "disable_voting" not in st.session_state:
         st.session_state.disable_voting = True
-    col1, col2 = st.columns([1, 1])
+    if "model_1_selection" not in st.session_state:
+        st.session_state.model_1_selection = "Random"
+    if "model_2_selection" not in st.session_state:
+        st.session_state.model_2_selection = "Random"
+
+    col1, col2, col3 = st.columns([1, 1, 1])
 
     with col1:
         st.markdown("### Record Audio")
 
@@ -406,9 +470,69 @@ def main():
             st.button("🎲 Select Random Audio",on_click=on_random_click,key="random_btn")
             st.session_state.recording = False
 
+    with col3:
+        st.markdown("### Upload Audio File")
+        with st.container():
+            uploaded_file = st.file_uploader(
+                "Choose an audio file",
+                type=['wav', 'mp3', 'flac'],
+                key="audio_uploader",
+                help="Upload .wav, .mp3, or .flac files (max 30 seconds)"
+            )
+
+            if uploaded_file is not None:
+                if uploaded_file != st.session_state.get('last_uploaded_file'):
+                    st.session_state.last_uploaded_file = uploaded_file
+
+                    with st.spinner("Processing uploaded audio..."):
+                        is_valid, error_msg, audio_data, sample_rate = validate_uploaded_audio(uploaded_file)
+
+                    if is_valid:
+                        reset_state()
+
+                        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
+                            tmp_file.write(uploaded_file.getvalue())
+                            temp_path = tmp_file.name
+
+                        st.session_state.audio = {
+                            "data": audio_data,
+                            "sample_rate": sample_rate,
+                            "format": "audio/wav"
+                        }
+                        st.session_state.current_audio_type = "uploaded"
+                        st.session_state.has_audio = True
+                        st.session_state.audio_path = temp_path
+                        st.session_state.option_selected = None
+                        st.session_state.recording = False
+
+                        duration = len(audio_data) / sample_rate
+                        st.success(f"✅ Audio uploaded successfully! Duration: {duration:.1f}s")
+                    else:
+                        st.error(f"❌ {error_msg}")
+
     if st.session_state.has_audio:
         st.audio(**st.session_state.audio)
 
+    st.markdown("### Model Selection")
+    col_model1, col_model2 = st.columns(2)
+
+    models_list = ["Random", "Ori Apex", "Ori Apex XT", "deepgram", "Ori Swift", "Ori Prime", "azure"]
+
+    with col_model1:
+        st.selectbox(
+            "Model 1:",
+            options=models_list,
+            index=0,
+            key="model_1_selection"
+        )
+
+    with col_model2:
+        st.selectbox(
+            "Model 2:",
+            options=models_list,
+            index=0,
+            key="model_2_selection"
+        )
 
     with st.container():
         st.button("📝 Transcribe Audio",on_click=on_click_transcribe,use_container_width=True,key="transcribe_btn",disabled=st.session_state.recording)
 
@@ -449,7 +573,8 @@ def main():
 
     INSTR = """
 ## Instructions:
-* Record audio to recognise speech (or press 🎲 for random Audio).
+* Record audio to recognise speech, upload an audio file, or press 🎲 for random Audio.
+* Optionally select specific models using the Model 1 and Model 2 dropdowns (default is Random).
 * Click on transcribe audio button to commence the transcription process.
 * Read the two options one after the other while listening to the audio.
 * Vote on which transcript you prefer.
 
@@ -458,6 +583,7 @@ def main():
 * Currently Hindi and English are supported, and
 the results for Hindi will be in Hinglish (Hindi in Latin script)
 * It may take up to 30 seconds for speech recognition in some cases.
+* Uploaded audio files must be .wav, .mp3, or .flac format and under 30 seconds duration.
 """.strip()
 
     st.markdown(INSTR)
enums.py CHANGED
@@ -1,4 +1,7 @@
 import os
+from dotenv import load_dotenv
+
+load_dotenv()
 
 SAVE_PATH = f"s3://{os.getenv('AWS_BUCKET_NAME')}/{os.getenv('RESULTS_KEY')}"
 ELO_JSON_PATH = f"s3://{os.getenv('AWS_BUCKET_NAME')}/{os.getenv('ELO_JSON_PATH')}"
logger.py CHANGED
@@ -1,5 +1,8 @@
 import logging
 import os
+from dotenv import load_dotenv
+
+load_dotenv()
 
 loglevel = os.getenv("LOGLEVEL", "INFO")
 
requirements.txt CHANGED
@@ -9,4 +9,5 @@ streamlit==1.40.2
 fsspec==2024.10.0
 boto3
 s3fs
-torchaudio
+torchaudio
+python-dotenv
utils.py CHANGED
@@ -2,6 +2,9 @@ import fsspec
 import boto3
 import os
 import re
+from dotenv import load_dotenv
+
+load_dotenv()
 
 fs = fsspec.filesystem(
     's3',