Spaces:

litagin
/

anime-whisper-demo

Running on Zero

litagin committed on Nov 13, 2024

Commit

e8dc53b

1 Parent(s): 5f43bee

update

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ import warnings
 from pathlib import Path
 import gradio as gr
-import huggingface_hub
 import librosa
 import spaces
 import torch
@@ -13,7 +12,6 @@ from transformers import pipeline
 warnings.filterwarnings("ignore")
-huggingface_hub.login(token=os.getenv("HF_TOKEN"))
 is_hf = os.getenv("SYSTEM") == "spaces"
 generate_kwargs = {
@@ -62,6 +60,7 @@ def transcribe_common(audio: str, model: str) -> str:
         audio = AudioSegment.from_file(audio)
         audio.export("temp.wav", format="wav")
         y, sr = librosa.load("temp.wav", mono=True, sr=16000)
     # Get duration of audio
     duration = librosa.get_duration(y=y, sr=sr)
     logger.info(f"Duration: {duration:.2f}s")
@@ -88,9 +87,8 @@ def transcribe_anime_whisper(audio) -> str:
 initial_md = """
 # Anime-Whisper Demo
-[**Anime Whisper**](https://huggingface.co/litagin/anime-whisper): 5千時間以上のアニメ調セリフと台本でファインチューニングされた音声認識モデルです。
-- ベースモデル: [kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0)
 - デモでは**音声は15秒まで**しか受け付けません
 - 日本語のみ対応 (Japanese only)
 - 比較のために [openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) と [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) も用意しています

 from pathlib import Path
 import gradio as gr
 import librosa
 import spaces
 import torch
 warnings.filterwarnings("ignore")
 is_hf = os.getenv("SYSTEM") == "spaces"
 generate_kwargs = {
         audio = AudioSegment.from_file(audio)
         audio.export("temp.wav", format="wav")
         y, sr = librosa.load("temp.wav", mono=True, sr=16000)
+        Path("temp.wav").unlink()
     # Get duration of audio
     duration = librosa.get_duration(y=y, sr=sr)
     logger.info(f"Duration: {duration:.2f}s")
 initial_md = """
 # Anime-Whisper Demo
+[**Anime Whisper**](https://huggingface.co/litagin/anime-whisper): 5千時間以上のアニメ調セリフと台本でファインチューニングされた日本語音声認識モデルのデモです。句読点や感嘆符がリズムや感情に合わせて自然に付き、NSFW含む非言語発話もうまく台本調に書き起こされます。
 - デモでは**音声は15秒まで**しか受け付けません
 - 日本語のみ対応 (Japanese only)
 - 比較のために [openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) と [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) も用意しています