| import gradio as gr | |
| import librosa | |
| import torch | |
| from src import CandidateGenerator | |
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Built once at import time; process_audio_text uses this shared instance.
candidate_generator = CandidateGenerator(device)
def process_audio_text(audio_file, text, num_candidates):
    """Generate reading candidates for ``text`` from the given audio file.

    Args:
        audio_file: Path to the audio file, or ``None`` when no audio was
            supplied.
        text: Text to analyse. ``None``, empty and whitespace-only values are
            all treated as missing.
        num_candidates: Number of candidates to request; converted with
            ``int()`` before being handed to the generator.

    Returns:
        A 3-tuple of strings: one line per candidate, a summary of the HuBERT
        prediction, and ``repr()`` of the raw results. When either input is
        missing, the first element is an error message and the other two are
        empty strings.
    """
    # Check for None before calling .strip(): a None text would otherwise raise
    # AttributeError instead of producing the user-facing message.
    if audio_file is None or text is None or not text.strip():
        return "音声とテキストの両方が必要です。", "", ""

    # librosa resamples to the requested rate; the returned rate is not needed.
    audio_16khz, _ = librosa.load(audio_file, sr=16000)
    results = candidate_generator.generate(text, audio_16khz, int(num_candidates))

    # One summary line per candidate, joined with newlines.
    candidates_str = "\n".join(
        f"mecab_cost: {candidate['mecab_cost']}, "
        f"ctc_loss: {candidate['ctc_loss']:.3f}, "
        f"phonemes: {' '.join(candidate['phonemes'])}"
        for candidate in results["candidates"]
    )

    hubert_pred = results["hubert_prediction"]
    hubert_output = (
        f"ctc_loss: {hubert_pred['ctc_loss']:.3f}, "
        f"phonemes: {' '.join(hubert_pred['phonemes'])}"
    )

    # Raw dump of the full result structure for debugging.
    debug_output = repr(results)
    return candidates_str, hubert_output, debug_output
# Input widgets, listed in the positional order of process_audio_text's
# parameters: audio file path, text, number of candidates.
_input_components = [
    gr.Audio(type="filepath", label="音声ファイル"),
    gr.Textbox(
        label="テキスト",
        placeholder="漢字仮名交じりのテキストを入力してください",
    ),
    gr.Slider(minimum=1, maximum=20, value=10, step=1, label="MeCab の候補数"),
]

# Output widgets, one per element of the tuple returned by process_audio_text.
_output_components = [
    gr.Textbox(label="候補", lines=10),
    gr.Textbox(label="HuBERT による予測", lines=1),
    gr.Textbox(label="全ての結果", lines=20),
]

interface = gr.Interface(
    fn=process_audio_text,
    inputs=_input_components,
    outputs=_output_components,
    title="音声と漢字仮名交じりテキストからふりがなを推定するツール v2 (工事中)",
    description="音素認識モデルと MeCab による読みの推定を行います。",
)

# Launch the UI only when run as a script, not when imported.
if __name__ == "__main__":
    interface.launch()