Sofia Casadei committed on
Commit
5c44b80
·
1 Parent(s): edfee48
Files changed (1) hide show
  1. main.py +50 -18
main.py CHANGED
@@ -42,30 +42,55 @@ MODEL_ID = os.getenv("MODEL_ID", "openai/whisper-large-v3-turbo")
42
  LANGUAGE = os.getenv("LANGUAGE", "english").lower()
43
 
44
  device = get_device(force_cpu=False)
 
 
 
45
 
46
  torch_dtype, np_dtype = get_torch_and_np_dtypes(device, use_bfloat16=False)
47
  logger.info(f"Using device: {device}, torch_dtype: {torch_dtype}, np_dtype: {np_dtype}")
48
 
49
- attention = "flash_attention_2" if is_flash_attn_2_available() else "sdpa"
50
- logger.info(f"Using attention: {attention}")
51
-
52
  logger.info(f"Loading Whisper model: {MODEL_ID}")
53
  logger.info(f"Using language: {LANGUAGE}")
54
 
 
55
  try:
56
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
57
- MODEL_ID,
58
- torch_dtype=torch_dtype,
59
- low_cpu_mem_usage=True,
60
  use_safetensors=True,
61
- attn_implementation=attention,
62
- device_map="auto" if device == "cuda" else None
63
  )
64
- #model.to(device)
65
- except Exception as e:
66
- logger.error(f"Error loading ASR model: {e}")
67
- logger.error(f"Are you providing a valid model ID? {MODEL_ID}")
68
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  processor = AutoProcessor.from_pretrained(MODEL_ID)
71
 
@@ -74,15 +99,22 @@ transcribe_pipeline = pipeline(
74
  model=model,
75
  tokenizer=processor.tokenizer,
76
  feature_extractor=processor.feature_extractor,
77
- torch_dtype=torch_dtype,
78
- #device=device,
79
  )
80
- #if device == "cuda":
81
- # transcribe_pipeline.model = torch.compile(transcribe_pipeline.model, mode="max-autotune")
 
 
 
 
 
 
 
 
82
 
83
  # Warm up the model with empty audio
84
  logger.info("Warming up Whisper model with dummy input")
85
- warmup_audio = np.zeros((16000,), dtype=np_dtype) # 1s of silence
86
  transcribe_pipeline(warmup_audio)
87
  logger.info("Model warmup complete")
88
 
 
42
  LANGUAGE = os.getenv("LANGUAGE", "english").lower()
43
 
44
  device = get_device(force_cpu=False)
45
+ use_device_map = True if device == "cuda" else False
46
+ try_compile_model = True if device == "cuda" or (device == "mps" and torch.__version__ >= "2.7.0") else False
47
+ try_use_flash_attention = True if device == "cuda" and is_flash_attn_2_available() else False
48
 
49
  torch_dtype, np_dtype = get_torch_and_np_dtypes(device, use_bfloat16=False)
50
  logger.info(f"Using device: {device}, torch_dtype: {torch_dtype}, np_dtype: {np_dtype}")
51
 
 
 
 
52
  logger.info(f"Loading Whisper model: {MODEL_ID}")
53
  logger.info(f"Using language: {LANGUAGE}")
54
 
55
+ # Initialize the model (use flash attention on cuda if possible)
56
  try:
57
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
58
+ MODEL_ID,
59
+ torch_dtype=torch_dtype,
60
+ low_cpu_mem_usage=True,
61
  use_safetensors=True,
62
+ attn_implementation="flash_attention_2" if try_use_flash_attention else "sdpa",
63
+ device_map="auto" if use_device_map else None,
64
  )
65
+ if not use_device_map:
66
+ model.to(device)
67
+ except RuntimeError as e:
68
+ try:
69
+ logger.warning("Falling back to device_map=None")
70
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
71
+ MODEL_ID,
72
+ torch_dtype=torch_dtype,
73
+ low_cpu_mem_usage=True,
74
+ use_safetensors=True,
75
+ attn_implementation="flash_attention_2" if try_use_flash_attention else "sdpa",
76
+ device_map=None,
77
+ )
78
+ model.to(device)
79
+ except RuntimeError as e:
80
+ try:
81
+ logger.warning("Disabling flash attention")
82
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
83
+ MODEL_ID,
84
+ torch_dtype=torch_dtype,
85
+ low_cpu_mem_usage=True,
86
+ use_safetensors=True,
87
+ attn_implementation="sdpa",
88
+ )
89
+ model.to(device)
90
+ except Exception as e:
91
+ logger.error(f"Error loading ASR model: {e}")
92
+ logger.error(f"Are you providing a valid model ID? {MODEL_ID}")
93
+ raise
94
 
95
  processor = AutoProcessor.from_pretrained(MODEL_ID)
96
 
 
99
  model=model,
100
  tokenizer=processor.tokenizer,
101
  feature_extractor=processor.feature_extractor,
102
+ torch_dtype=torch_dtype
 
103
  )
104
+
105
+ # Try to compile the model
106
+ try:
107
+ if try_compile_model:
108
+ transcribe_pipeline.model = torch.compile(transcribe_pipeline.model, mode="max-autotune")
109
+ else:
110
+ logger.warning("Proceeding without compiling the model (requirements not met)")
111
+ except Exception as e:
112
+ logger.warning(f"Error compiling model: {e}")
113
+ logger.warning("Proceeding without compiling the model")
114
 
115
  # Warm up the model with empty audio
116
  logger.info("Warming up Whisper model with dummy input")
117
+ warmup_audio = np.random.rand(16000).astype(np_dtype)
118
  transcribe_pipeline(warmup_audio)
119
  logger.info("Model warmup complete")
120