Spaces:

litagin
/

anime-whisper-demo

Running on Zero

litagin committed on Oct 13, 2024

Commit

5331478

1 Parent(s): a0b4d2a

Update docs

Files changed (2) hide show

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
-title: Galgame Whisper Wip Demo
-emoji: 🏃
 colorFrom: blue
 colorTo: pink
 sdk: gradio

 ---
+title: Galgame Whisper (WIP) Demo
+emoji: 🥰🎤📝
 colorFrom: blue
 colorTo: pink
 sdk: gradio

app.py CHANGED Viewed

@@ -46,7 +46,9 @@ logger.success("Pipelines initialized!")
 @spaces.GPU
 def transcribe_common(audio: str, model: str) -> tuple[str, float]:
-    logger.info(f"Transcribing {Path(audio).name} with {model}")
     # Read and resample audio to 16kHz
     y, sr = librosa.load(audio, mono=True, sr=16000)
     # Get duration of audio
@@ -57,8 +59,7 @@ def transcribe_common(audio: str, model: str) -> tuple[str, float]:
     start_time = time.time()
     result = pipe_dict[model](y, generate_kwargs=generate_kwargs)["text"]
     end_time = time.time()
-    logger.success(f"Transcribed {audio} with {model} in {end_time - start_time:.2f}s")
-    logger.success(f"Result:\n{result}")
     return result, end_time - start_time
@@ -94,11 +95,12 @@ def transcribe_galgame_whisper(audio) -> tuple[str, float]:
 initial_md = """
 # Galgame-Whisper (WIP) Demo
 - https://huggingface.co/litagin/galgame-whisper-wip
 - 日本語のみ対応
 - 比較できるように他モデルもついでに試せる
-- 現在0.1エポックくらい
-- 音声は15秒まで
 pipeに渡しているkwargsは以下の通り:
 ```python
@@ -106,7 +108,7 @@ generate_kwargs = {
     "language": "Japanese",
     "do_sample": False,
     "num_beams": 1,
-    "no_repeat_ngram_size": 3,
 }
 ```
 """

 @spaces.GPU
 def transcribe_common(audio: str, model: str) -> tuple[str, float]:
+    filename = Path(audio).name
+    logger.info(f"Model: {model}")
+    logger.info(f"Audio: {filename}")
     # Read and resample audio to 16kHz
     y, sr = librosa.load(audio, mono=True, sr=16000)
     # Get duration of audio
     start_time = time.time()
     result = pipe_dict[model](y, generate_kwargs=generate_kwargs)["text"]
     end_time = time.time()
+    logger.success(f"Finished in {end_time - start_time:.2f}s\n{result}")
     return result, end_time - start_time
 initial_md = """
 # Galgame-Whisper (WIP) Demo
+- 音声認識モデル [kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) をファインチューンした**未完成のモデル**のお試し
 - https://huggingface.co/litagin/galgame-whisper-wip
+- 現在0.1エポックくらい
 - 日本語のみ対応
+- デモでは音声は15秒まで
 - 比較できるように他モデルもついでに試せる
 pipeに渡しているkwargsは以下の通り:
 ```python
     "language": "Japanese",
     "do_sample": False,
     "num_beams": 1,
+    "no_repeat_ngram_size": 3,  # 3回以上の繰り返しを防ぐ
 }
 ```
 """