Spaces:

tahirsher
/

ASR_Model_for_Transcription_into_Text

Sleeping

App Files Files Community

tahirsher committed on Mar 10

Commit

76c5c38

verified ·

1 Parent(s): a4a8364

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -83

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ import torch
 import torchaudio
 import numpy as np
 import streamlit as st
-import matplotlib.pyplot as plt
 from huggingface_hub import login
 from transformers import (
     AutoProcessor,
@@ -13,62 +12,50 @@ from transformers import (
 from cryptography.fernet import Fernet
 # ================================
-# 1️⃣ Authenticate with Hugging Face Hub
 # ================================
-HF_TOKEN = os.getenv("hf_token")
-if HF_TOKEN is None:
-    raise ValueError("❌ Hugging Face API token not found. Please set it in Secrets.")
-login(token=HF_TOKEN)
 # ================================
-# 2️⃣ Load Model & Processor
 # ================================
-MODEL_NAME = "AqeelShafy7/AudioSangraha-Audio_to_Text"
-processor = AutoProcessor.from_pretrained(MODEL_NAME)
-model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-print(f"✅ Model loaded on {device}")
 # ================================
-# 3️⃣ Load Dataset
 # ================================
-DATASET_TAR_PATH = "dev-clean.tar.gz"
-EXTRACT_PATH = "./librispeech_dev_clean"
-if not os.path.exists(EXTRACT_PATH):
-    print("🔄 Extracting dataset...")
-    with tarfile.open(DATASET_TAR_PATH, "r:gz") as tar:
-        tar.extractall(EXTRACT_PATH)
-    print("✅ Extraction complete.")
-else:
-    print("✅ Dataset already extracted.")
-AUDIO_FOLDER = os.path.join(EXTRACT_PATH, "LibriSpeech", "dev-clean")
-def find_audio_files(base_folder):
-    audio_files = []
-    for root, _, files in os.walk(base_folder):
-        for file in files:
-            if file.endswith(".flac"):
-                audio_files.append(os.path.join(root, file))
-    return audio_files
-audio_files = find_audio_files(AUDIO_FOLDER)
-if not audio_files:
-    raise FileNotFoundError(f"❌ No .flac files found in {AUDIO_FOLDER}. Check dataset structure!")
-print(f"✅ Found {len(audio_files)} audio files in dataset!")
 # ================================
-# 4️⃣ Load Transcripts
 # ================================
 def load_transcripts():
-    transcript_dict = {}
     for root, _, files in os.walk(AUDIO_FOLDER):
         for file in files:
             if file.endswith(".txt"):
@@ -76,18 +63,13 @@ def load_transcripts():
                     for line in f:
                         parts = line.strip().split(" ", 1)
                         if len(parts) == 2:
-                            file_id, text = parts
-                            transcript_dict[file_id] = text
-    return transcript_dict
 transcripts = load_transcripts()
-if not transcripts:
-    raise FileNotFoundError("❌ No transcripts found! Check dataset structure.")
-print(f"✅ Loaded {len(transcripts)} transcripts.")
 # ================================
-# 5️⃣ Streamlit Sidebar: Fine-Tuning & Security
 # ================================
 st.sidebar.title("🔧 Fine-Tuning & Security Settings")
@@ -101,25 +83,21 @@ enable_encryption = st.sidebar.checkbox("🔒 Encrypt Transcription", value=True
 show_transcription = st.sidebar.checkbox("📖 Show Transcription", value=False)
 # ================================
-# 6️⃣ Encryption Functionality
 # ================================
-def generate_key():
-    return Fernet.generate_key()
-def encrypt_text(text, key):
-    fernet = Fernet(key)
     return fernet.encrypt(text.encode())
-def decrypt_text(encrypted_text, key):
-    fernet = Fernet(key)
     return fernet.decrypt(encrypted_text).decode()
-encryption_key = generate_key()
 # ================================
-# 7️⃣ Streamlit ASR Web App
 # ================================
-st.title("🎙️ Speech-to-Text ASR Model Finetuned on Libri Speech Dataset with Security Features")
 audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])
@@ -130,48 +108,42 @@ if audio_file:
     waveform, sample_rate = torchaudio.load(audio_path)
     waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
-    waveform = waveform.to(dtype=torch.float32)
     # ================================
-    # ✅ Improved Adversarial Attack Handling
     # ================================
     noise = attack_strength * torch.randn_like(waveform)
-    # Apply noise but then perform denoising to counteract attack effects
     adversarial_waveform = waveform + noise
     adversarial_waveform = torch.clamp(adversarial_waveform, -1.0, 1.0)
     denoised_waveform = torchaudio.functional.vad(adversarial_waveform, sample_rate=16000)
-    input_features = processor(denoised_waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features.to(device)
     with torch.inference_mode():
-        generated_ids = model.generate(
-            input_features,
-            max_length=200,
-            num_beams=2,
-            do_sample=False,
-            use_cache=True,
-            attention_mask=torch.ones(input_features.shape, dtype=torch.long).to(device),
-            language="en"
-        )
         transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
     if attack_strength > 0.3:
-        st.warning("⚠️ Adversarial attack detected! Mitigated using denoising.")
     # ================================
-    # ✅ Encryption Handling
     # ================================
     if enable_encryption:
-        encrypted_transcription = encrypt_text(transcription, encryption_key)
-        st.info("🔒 Transcription is encrypted. To view, enable 'Show Transcription' in the sidebar.")
         if show_transcription:
-            decrypted_text = decrypt_text(encrypted_transcription, encryption_key)
             st.success("📄 Secure Transcription:")
             st.write(decrypted_text)
         else:
-            st.write("🔒 [Encrypted] Transcription is hidden. Enable 'Show Transcription' to view.")
     else:
         st.success("📄 Transcription:")
         st.write(transcription)

 import torchaudio
 import numpy as np
 import streamlit as st
 from huggingface_hub import login
 from transformers import (
     AutoProcessor,
 from cryptography.fernet import Fernet
 # ================================
+# 1️⃣ Authenticate with Hugging Face Hub (Cache to prevent re-authentication)
 # ================================
+@st.cache_resource
+def authenticate_hf():
+    HF_TOKEN = os.getenv("hf_token")
+    if HF_TOKEN is None:
+        raise ValueError("❌ Hugging Face API token not found. Please set it in Secrets.")
+    login(token=HF_TOKEN)
+authenticate_hf()
 # ================================
+# 2️⃣ Load Model & Processor (Cached)
 # ================================
+@st.cache_resource
+def load_model():
+    MODEL_NAME = "AqeelShafy7/AudioSangraha-Audio_to_Text"
+    processor = AutoProcessor.from_pretrained(MODEL_NAME)
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")
+    return processor, model
+processor, model = load_model()
 # ================================
+# 3️⃣ Dataset Extraction (Cached)
 # ================================
+@st.cache_resource
+def extract_dataset():
+    DATASET_TAR_PATH = "dev-clean.tar.gz"
+    EXTRACT_PATH = "./librispeech_dev_clean"
+    if not os.path.exists(EXTRACT_PATH):
+        with tarfile.open(DATASET_TAR_PATH, "r:gz") as tar:
+            tar.extractall(EXTRACT_PATH)
+    return os.path.join(EXTRACT_PATH, "LibriSpeech", "dev-clean")
+AUDIO_FOLDER = extract_dataset()
 # ================================
+# 4️⃣ Load Transcripts (Cached)
 # ================================
+@st.cache_resource
 def load_transcripts():
+    transcripts = {}
     for root, _, files in os.walk(AUDIO_FOLDER):
         for file in files:
             if file.endswith(".txt"):
                     for line in f:
                         parts = line.strip().split(" ", 1)
                         if len(parts) == 2:
+                            transcripts[parts[0]] = parts[1]
+    return transcripts
 transcripts = load_transcripts()
 # ================================
+# 5️⃣ Streamlit Sidebar for Fine-Tuning & Security
 # ================================
 st.sidebar.title("🔧 Fine-Tuning & Security Settings")
 show_transcription = st.sidebar.checkbox("📖 Show Transcription", value=False)
 # ================================
+# 6️⃣ Encryption Handling (Precomputed Key)
 # ================================
+encryption_key = Fernet.generate_key()
+fernet = Fernet(encryption_key)
+def encrypt_text(text):
     return fernet.encrypt(text.encode())
+def decrypt_text(encrypted_text):
     return fernet.decrypt(encrypted_text).decode()
 # ================================
+# 7️⃣ Optimized ASR Web App
 # ================================
+st.title("🎙️ Speech-to-Text ASR Model Finetuned on Librispeech Corpus with Security Features")
 audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])
     waveform, sample_rate = torchaudio.load(audio_path)
     waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
     # ================================
+    # ✅ Optimized Adversarial Attack Handling
     # ================================
     noise = attack_strength * torch.randn_like(waveform)
     adversarial_waveform = waveform + noise
     adversarial_waveform = torch.clamp(adversarial_waveform, -1.0, 1.0)
+    # Trim leading silence/quiet background with voice-activity detection (VAD trims the front of the clip; it is not a denoiser)
     denoised_waveform = torchaudio.functional.vad(adversarial_waveform, sample_rate=16000)
+    # ================================
+    # ✅ Fast Transcription Processing
+    # ================================
+    input_features = processor(denoised_waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features.to("cuda" if torch.cuda.is_available() else "cpu")
     with torch.inference_mode():
+        generated_ids = model.generate(input_features, max_length=200, num_beams=2, do_sample=False)
         transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
     if attack_strength > 0.3:
+        st.warning("⚠️ Adversarial attack detected! Denoising applied.")
     # ================================
+    # ✅ Optimized Encryption Handling
     # ================================
     if enable_encryption:
+        encrypted_transcription = encrypt_text(transcription)
+        st.info("🔒 Transcription is encrypted. Enable 'Show Transcription' to view.")
         if show_transcription:
+            decrypted_text = decrypt_text(encrypted_transcription)
             st.success("📄 Secure Transcription:")
             st.write(decrypted_text)
         else:
+            st.write("🔒 [Encrypted] Transcription hidden. Enable 'Show Transcription' to view.")
     else:
         st.success("📄 Transcription:")
         st.write(transcription)