Project Beatrice committed
Commit d65cea0 · 0 parent(s)

Initial commit

Files changed (9)
  1. .gitattributes +2 -0
  2. .python-version +1 -0
  3. README.md +12 -0
  4. app.py +454 -0
  5. mecab-ipadic-neologd/COPYING +83 -0
  6. packages.txt +2 -0
  7. pyproject.toml +21 -0
  8. requirements.txt +8 -0
  9. uv.lock +0 -0
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.dic filter=lfs diff=lfs merge=lfs -text
.python-version ADDED
@@ -0,0 +1 @@
+ 3.11
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Furigana From Speech And Text
+ emoji: 🏆
+ colorFrom: purple
+ colorTo: green
+ sdk: gradio
+ sdk_version: 5.34.2
+ app_file: app.py
+ pinned: false
+ license: mit
+ short_description: 音声と漢字仮名交じりテキストからふりがなを推定するツール
+ ---
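
The front matter above is the Hugging Face Spaces configuration: Spaces launches `app.py` with the Gradio SDK (5.34.2). A running Space can also be driven programmatically; the sketch below uses `gradio_client`, assuming a hypothetical Space id of `project-beatrice/furigana-from-speech-and-text` and Gradio's default endpoint name, which is derived from the handler function defined in app.py below.

```python
from gradio_client import Client, handle_file

# Hypothetical Space id -- substitute the real one.
client = Client("project-beatrice/furigana-from-speech-and-text")

# Gradio names endpoints after the handler function by default
# (process_audio_gradio in app.py below).
reading, confidence, warnings = client.predict(
    "今日は良い天気です",        # kanji-kana text to annotate
    handle_file("speech.wav"),   # matching audio; app.py resamples to 16 kHz
    api_name="/process_audio_gradio",
)
print(reading, confidence)
```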
app.py ADDED
@@ -0,0 +1,454 @@
+ #!/usr/bin/env python3
+ """
+ ASR with multiple reading candidates using MeCab.
+ This module generates and scores multiple reading candidates
+ for Japanese text and uses them to constrain speech recognition.
+ """
+
+ import warnings
+
+ warnings.filterwarnings("ignore", category=FutureWarning)
+
+ import math
+ import tempfile
+ from pathlib import Path
+ from typing import List, Tuple, Dict
+
+ import fugashi
+ import gradio as gr
+ import librosa
+ import numpy as np
+ import torch
+ from espnet.nets.scorer_interface import BatchScorerInterface
+ from espnet2.bin.asr_inference import Speech2Text
+
+
+ # MeCab dictionary settings
+ MECAB_DIC_DIR = str(Path(__file__).parent / "mecab-ipadic-neologd")
+ AUDIO_FILES_DIR = Path(".")
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+ class MecabCandidateGenerator:
+     """Generate reading candidates using MeCab with the mecab-ipadic-neologd dictionary."""
+
+     KATAKANA_LIST = set(
+         "ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヽヾー・"
+     )
+
+     def __init__(self):
+         # A throwaway mecabrc is enough to point MeCab at the bundled dictionary.
+         with tempfile.NamedTemporaryFile(mode="w+", delete=True) as tmp:
+             tmp.write(f"""dicdir = {MECAB_DIC_DIR}\n""")
+             tmp.flush()
+             self.tagger = fugashi.GenericTagger(
+                 f"-r {tmp.name} -d {MECAB_DIC_DIR} -Odump"
+             )
+             # -p enables partial (constrained) parsing; "-F 0 -E %pc" prints a
+             # "0" per node and the cumulative path cost at EOS.
+             self.tagger_p = fugashi.GenericTagger(
+                 f"-r {tmp.name} -d {MECAB_DIC_DIR} -p -F 0 -E %pc"
+             )
+
+     def _normalize_text(self, text: str) -> str:
+         """Normalize full-width numbers and punctuation to half-width."""
+         # Full-width to half-width mapping
+         replacements = {
+             # Numbers
+             "0": "0",
+             "1": "1",
+             "2": "2",
+             "3": "3",
+             "4": "4",
+             "5": "5",
+             "6": "6",
+             "7": "7",
+             "8": "8",
+             "9": "9",
+             # Punctuation
+             "!": "!",
+             "?": "?",
+             ".": ".",
+             ",": ",",
+         }
+
+         for full, half in replacements.items():
+             text = text.replace(full, half)
+
+         return text
+
+     def generate_candidates(self, text: str, nbest: int = 512) -> List[List[List[str]]]:
+         """Generate n-best morphological analysis candidates."""
+         # Normalize numbers and punctuation to half-width
+         text = self._normalize_text(text)
+         res = self.tagger.nbest(text, nbest)
+         candidates = []
+         candidate = []
+         # -Odump lines look like "<node-id> <surface> <features> ...".
+         for line in res.split("\n"):
+             if not line.strip():
+                 continue
+             fields = line.split()
+             word = fields[1]
+             if word == "BOS":
+                 pass
+             elif word == "EOS":
+                 candidates.append(candidate)
+                 candidate = []
+             else:
+                 candidate.append(fields)
+         return candidates
+
+     def candidate_to_score(self, candidate: List[List[str]]) -> Tuple[float, List[str]]:
+         """Calculate the score for a morphological analysis candidate."""
+         query = []
+         for morph in candidate:
+             assert len(morph) >= 3, (
+                 f"Expected morph to have at least 3 fields, got {len(morph)}: {morph}"
+             )
+             _, original_form, features, *_ = morph
+             query.append(f"{original_form}\t{features}\n")
+         query = "".join(query) + "EOS"
+         # Partial parsing re-scores the fixed segmentation; the output is one
+         # "0" per morph followed by the cumulative cost from %pc.
+         result = self.tagger_p.parse(query)
+         result = int(result.lstrip("0") or "0")  # guard the all-zero (zero-cost) case
+         return result, []
+
+     def candidate_to_yomi(
+         self, candidate: List[List[str]], yomi_index: int = 7
+     ) -> Tuple[str, List[str]]:
+         """Convert a morphological analysis candidate to its reading (yomi)."""
+         result = ""
+         warning_messages = []
+         for morph in candidate:
+             warning_message = ""
+             if len(morph) < 2:
+                 warning_message = f"[3] Morph has less than 2 fields: {morph[1:3]}"
+                 warning_messages.append(warning_message)
+                 continue
+             original_form = morph[1]
+             if len(morph) < 3:
+                 warning_message = f"[3] Morph has less than 3 fields: {morph[1:3]}"
+                 morph = morph + ["*"]
+             features = morph[2].split(",")
+
+             if len(features) <= yomi_index:
+                 reading = ""
+                 if all(
+                     "ぁ" <= c <= "ん" or c in self.KATAKANA_LIST for c in original_form
+                 ):
+                     reading = original_form
+                 else:
+                     if not warning_message:
+                         warning_level = (
+                             1
+                             if all(
+                                 c in "、。!?!?「」『』【】〔〕[]〈〉《》・"
+                                 for c in original_form
+                             )
+                             else 3
+                         )
+                         warning_message = (
+                             f"[{warning_level}] Morph has no reading: {morph[1:3]}"
+                         )
+                     warning_messages.append(warning_message)
+                     continue
+             else:
+                 reading = features[yomi_index]
+             i = 0
+             normalized_reading = ""
+             while i < len(reading):
+                 char = reading[i]
+                 if "ァ" <= char <= "ン" or char in "ヽヾ":
+                     # Convert katakana to hiragana (a fixed offset of 0x60)
+                     normalized_reading += chr(ord(char) - 96)
+                 elif char == "ー" or "ぁ" <= char <= "ん":
+                     normalized_reading += char
+                 elif char == "ヴ":
+                     if i + 1 < len(reading) and reading[i + 1] in "ァィェォ":
+                         normalized_reading += "ばびべぼ"[
+                             "ァィェォ".index(reading[i + 1])
+                         ]
+                         i += 1
+                     else:
+                         normalized_reading += "ぶ"
+                 else:
+                     warning_level = (
+                         1 if char in "、。!?!?「」『』【】〔〕[]〈〉《》・" else 3
+                     )
+                     warning_message = f"[{warning_level}] Unhandled character in reading: {morph[1:3]}"
+                     warning_messages.append(warning_message)
+                 i += 1
+             result += normalized_reading
+
+         return result, warning_messages
+
+
+ class CandidateScorer(BatchScorerInterface):
+     """Score ASR hypotheses based on allowed reading candidates."""
+
+     def __init__(self, token_list: List[str], device: str = "cpu"):
+         super().__init__()
+         self.token_list = token_list
+         self.eos_id = token_list.index("<sos/eos>")
+         self.device = device
+
+     def set_candidates(self, candidates: List[str]):
+         """Set the allowed reading candidates."""
+         self.candidates = candidates
+
+     def score(
+         self, y: torch.Tensor, state, x=None
+     ):  # x is unused but required by the interface
+         """
+         Score function for beam search.
+
+         Args:
+             y: prefix token sequence
+             state: scorer state (unused)
+             x: encoder feature (unused)
+
+         Returns:
+             scores: token scores
+             state: updated state
+         """
+         prefix = y.tolist()
+         assert prefix[0] == self.eos_id, prefix
+         prefix = [self.token_list[i] for i in prefix[1:]]
+         prefix = "".join(prefix)
+
+         allowed = []
+         for candidate in self.candidates:
+             if candidate.startswith(prefix):
+                 remaining = candidate[len(prefix) :]
+                 if remaining:
+                     for i, token in enumerate(self.token_list):
+                         if remaining.startswith(token):
+                             allowed.append(i)
+                 else:
+                     allowed.append(self.eos_id)
+
+         allowed = list(set(allowed))
+         vocab = len(self.token_list)
+         scores = torch.full((vocab,), float("-inf"), device=self.device)
+         if allowed:
+             scores[allowed] = 0.0
+         else:
+             # No candidate extends this prefix: permit only a heavily
+             # penalized <eos> so the hypothesis ranks last.
+             scores[self.eos_id] = -10000
+         return scores, state
+
+
+ class MultiReadingASR:
+     """ASR system with support for multiple reading candidates."""
+
+     def __init__(self, device: str = DEVICE):
+         """Initialize ASR components and dictionaries."""
+         print("Initializing models and dictionaries...")
+
+         # Initialize MeCab
+         self.mecab_generator = MecabCandidateGenerator()
+
+         # Initialize ASR model
+         self.asr = self._setup_asr_model(device)
+
+     def _setup_asr_model(self, device: str = "cpu") -> Speech2Text:
+         """Set up and configure the ASR model."""
+         asr = Speech2Text.from_pretrained(
+             "reazon-research/reazonspeech-espnet-v2",
+             lm_weight=0,
+             device=device,
+             nbest=10,
+             normalize_length=True,
+         )
+
+         # Filter out non-hiragana tokens
+         allowed_tokens = set(
+             "ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゝゞー"
+         )
+
+         assert len(asr.asr_model.decoder.output_layer.bias) == len(
+             asr.asr_model.token_list
+         )
+         assert len(asr.asr_model.ctc.ctc_lo.bias) == len(asr.asr_model.token_list)
+         for i, token in enumerate(asr.asr_model.token_list):
+             if len(token) == 1 and token not in allowed_tokens:
+                 asr.asr_model.decoder.output_layer.bias.data[i] -= 100.0
+                 asr.asr_model.ctc.ctc_lo.bias.data[i] -= 100.0
+
+         return asr
+
+     def process_audio_with_candidates(
+         self,
+         wav_file: Path,
+         text: str,
+         verbose: bool = False,
+     ) -> Dict:
+         """Process an audio file with multiple reading candidates."""
+         # Load audio
+         wav, _ = librosa.load(wav_file, sr=16000, mono=True, dtype=np.float32)
+
+         # Get relative path from AUDIO_FILES_DIR
+         try:
+             relative_path = wav_file.relative_to(AUDIO_FILES_DIR)
+             file_path = str(relative_path)
+         except ValueError:
+             # Fall back to just the filename if not under AUDIO_FILES_DIR
+             file_path = wav_file.name
+
+         results = {"file": file_path, "text": text, "readings": {}}
+
+         if verbose:
+             print(f"File: {wav_file.name}")
+             print(f"Text: {text}")
+
+         warning_messages = []
+
+         # MeCab candidates
+         mecab_candidates = self.mecab_generator.generate_candidates(text)
+         yomi_candidates = []
+         yomi_candidate_to_indices = {}
+
+         for i, c in enumerate(mecab_candidates):
+             yomi, warnings = self.mecab_generator.candidate_to_yomi(c)
+             warning_messages.extend(warnings)
+             yomi_candidates.append(yomi)
+             if yomi in yomi_candidate_to_indices:
+                 yomi_candidate_to_indices[yomi].append(i)
+             else:
+                 yomi_candidate_to_indices[yomi] = [i]
+
+         yomi_candidates = sorted(set(yomi_candidates))
+         warning_messages = sorted(set(warning_messages), key=lambda x: (-ord(x[1]), x))
+         results["yomi_candidates"] = yomi_candidates
+         results["warnings"] = warning_messages
+
+         if verbose:
+             print(f"Warning messages: {warning_messages}")
+
+         # ASR with candidates
+         scorer = CandidateScorer(self.asr.asr_model.token_list, device=self.asr.device)
+         scorer.set_candidates(yomi_candidates)
+         # A negative maxlenratio is treated by ESPnet as a fixed maximum
+         # output length: here, the longest candidate plus a margin of 2.
+         self.asr.maxlenratio = -max(map(len, yomi_candidates)) - 2
+         self.asr.beam_search.scorers["cand"] = scorer
+         self.asr.beam_search.full_scorers["cand"] = scorer
+         self.asr.beam_search.weights["cand"] = 1.0
+
+         transcription = self.asr(wav)
+         results["transcriptions"] = []
+
+         for trans, _, _, info in transcription:
+             if info.score == -math.inf:
+                 continue
+             candidate_indices = yomi_candidate_to_indices.get(trans)
+             if candidate_indices is None:
+                 mecab_score = math.inf
+             else:
+                 mecab_score = min(
+                     self.mecab_generator.candidate_to_score(mecab_candidates[i])[0]
+                     for i in candidate_indices
+                 )
+             mecab_scale = 800
+             mecab_score /= mecab_scale
+             normalized_score = info.score.item() / (len(info.yseq) - 1)
+
+             results["transcriptions"].append(
+                 {
+                     "text": trans,
+                     "normalized_score": normalized_score,
+                     "mecab_score": mecab_score,
+                 }
+             )
+
+             if verbose:
+                 print(
+                     f"Transcription: {trans} {normalized_score:.2f} {info.score.item():.2f} {mecab_score:.2f}"
+                 )
+
+         transcriptions = results["transcriptions"]
+         mean_transcription_length = sum(len(t["text"]) for t in transcriptions) / len(
+             transcriptions
+         )
+         for transcription in transcriptions:
+             adjusted_score = (
+                 transcription["normalized_score"] * mean_transcription_length
+             )
+             mecab_score = transcription["mecab_score"]
+             transcription["ensembled_score"] = (
+                 adjusted_score * 0.2528 + mecab_score * -0.4701
+             )
+         best_transcription = max(transcriptions, key=lambda t: t["ensembled_score"])
+         results["best_transcription"] = best_transcription
+         # Softmax-style confidence of the best hypothesis over all survivors.
+         results["confidence"] = 1.0 / sum(
+             math.exp(t["ensembled_score"] - best_transcription["ensembled_score"])
+             for t in transcriptions
+         )
+
+         return results
+
+
+ # Initialize the ASR system globally (shortens first-load time)
+ print("モデルを初期化しています...")
+ asr_system = MultiReadingASR()
+ print("初期化が完了しました。")
+
+
+ def process_audio_gradio(text, audio_file):
+     """Handler function for the Gradio interface."""
+     if not text or not audio_file:
+         return "テキストと音声ファイルの両方を入力してください。", "", ""
+
+     # Get the audio file path
+     wav_path = Path(audio_file)
+
+     # Run ASR
+     results = asr_system.process_audio_with_candidates(
+         wav_file=wav_path, text=text, verbose=False
+     )
+
+     # Take the best reading
+     best_reading = results["best_transcription"]["text"]
+
+     # Convert the confidence to a percentage
+     confidence = f"{results['confidence'] * 100:.1f}%"
+
+     # Format the warning messages
+     warnings = results.get("warnings", [])
+     if warnings:
+         warning_text = "\n".join(warnings)
+     else:
+         warning_text = "警告なし"
+
+     return best_reading, confidence, warning_text
+
+
+ # Build the Gradio interface
+ with gr.Blocks(title="漢字仮名交じりテキストの読み推定") as demo:
+     gr.Markdown(
+         """
+         # 音声と漢字仮名交じりテキストからふりがなを推定するツール
+
+         音声認識モデルと MeCab の合わせ技でふりがなを推定します。
+         """
+     )
+
+     with gr.Row():
+         with gr.Column():
+             text_input = gr.Textbox(
+                 label="テキスト",
+                 placeholder="読みを推定したいテキストを入力してください",
+                 lines=2,
+             )
+             audio_input = gr.Audio(label="音声ファイル", type="filepath")
+             submit_btn = gr.Button("読みを推定", variant="primary")
+
+         with gr.Column():
+             reading_output = gr.Textbox(label="推定された読み(ひらがな)", lines=2)
+             confidence_output = gr.Textbox(label="信頼度")
+             warnings_output = gr.Textbox(label="警告メッセージ", lines=3)
+
+     # Handle button clicks
+     submit_btn.click(
+         fn=process_audio_gradio,
+         inputs=[text_input, audio_input],
+         outputs=[reading_output, confidence_output, warnings_output],
+     )
+
+
+ # Launch for Hugging Face Spaces
+ if __name__ == "__main__":
+     demo.launch()
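
app.py wires three pieces together: MeCab n-best analyses supply the allowed readings, CandidateScorer masks the ESPnet beam search so that only prefixes of those readings survive, and the final ranking blends the length-normalized ASR score with the MeCab path cost. The masking step is the least obvious part, so here is a standalone sketch of the same prefix-constraint logic on a toy vocabulary (no ESPnet or audio involved; the token list and candidates are illustrative):

```python
import torch

# Toy stand-ins for the real ASR token list and MeCab-derived readings.
token_list = ["<sos/eos>", "き", "ょ", "う", "は", "こん", "にち"]
eos_id = 0
candidates = ["きょうは", "こんにちは"]


def candidate_mask(prefix: str) -> torch.Tensor:
    """0.0 for tokens that extend some candidate from `prefix`, -inf otherwise."""
    allowed = set()
    for cand in candidates:
        if cand.startswith(prefix):
            remaining = cand[len(prefix):]
            if remaining:
                allowed.update(
                    i for i, tok in enumerate(token_list) if remaining.startswith(tok)
                )
            else:
                allowed.add(eos_id)  # candidate fully consumed: allow <eos>
    scores = torch.full((len(token_list),), float("-inf"))
    if allowed:
        scores[list(allowed)] = 0.0
    return scores


print(candidate_mask(""))          # き and こん are allowed
print(candidate_mask("きょう"))     # only は is allowed
print(candidate_mask("こんにちは"))  # only <sos/eos> is allowed
```

Because the scorer joins the beam search with weight 1.0, these 0/-inf scores act as a hard constraint: any hypothesis that leaves the candidate set accumulates -inf and is discarded, which is why process_audio_with_candidates later skips results with info.score == -math.inf.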
mecab-ipadic-neologd/COPYING ADDED
@@ -0,0 +1,83 @@
+ Copyright (C) 2015-2019 Toshinori Sato (@overlast)
+
+ https://github.com/neologd/mecab-ipadic-neologd
+
+ i. 本データは、株式会社はてなが提供するはてなキーワード一覧ファイル
+ 中の表記、及び、読み仮名の大半を使用している。
+
+ はてなキーワード一覧ファイルの著作権は、株式会社はてなにある。
+
+ はてなキーワード一覧ファイルの使用条件に基づき、また、
+ データ使用の許可を頂いたことに対する感謝の意を込めて、
+ 以下に株式会社はてなおよびはてなキーワードへの参照をURLで示す。
+
+ 株式会社はてな : http://hatenacorp.jp/information/outline
+
+ はてなキーワード :
+ http://developer.hatena.ne.jp/ja/documents/keyword/misc/catalog
+
+ ii. 本データは、日本郵便株式会社が提供する郵便番号データ中の表記、
+ 及び、読み仮名を使用している。
+
+ 日本郵便株式会社は、郵便番号データに限っては著作権を主張しないと
+ 述べている。
+
+ 日本郵便株式会社の郵便番号データに対する感謝の意を込めて、
+ 以下に日本郵便株式会社および郵便番号データへの参照をURLで示す。
+
+ 日本郵便株式会社 :
+ http://www.post.japanpost.jp/about/profile.html
+
+ 郵便番号データ :
+ http://www.post.japanpost.jp/zipcode/dl/readme.html
+
+ iii. 本データは、スナフキん氏が提供する日本全国駅名一覧中の表記、及び
+ 読み仮名を使用している。
+
+ 日本全国駅名一覧の著作権は、スナフキん氏にある。
+
+ スナフキん氏は 「このデータを利用されるのは自由ですが、その際に
+ 不利益を被ったりした場合でも、スナフキんは一切責任は負えません
+ ことをご承知おき下さい」と述べている。
+
+ スナフキん氏に対する感謝の意を込めて、
+ 以下に日本全国駅名一覧のコーナーへの参照をURLで示す。
+
+ 日本全国駅名一覧のコーナー :
+ http://www5a.biglobe.ne.jp/~harako/data/station.htm
+
+ iv. 本データは、工藤拓氏が提供する人名(姓/名)エントリデータ中の、
+ 漢字表記の姓・名とそれに対応する読み仮名を使用している。
+
+ 人名(姓/名)エントリデータは被災者・安否不明者の人名の
+ 表記揺れ対策として、Mozcの人名辞書を活用できるという
+ 工藤氏の考えによって提供されている。
+
+ 工藤氏に対する感謝の意を込めて、
+ 以下にデータ本体と経緯が分かる情報への参照をURLで示す。
+
+ 人名(姓/名)エントリデータ :
+ http://chasen.org/~taku/software/misc/personal_name.zip
+
+ 上記データが提供されることになった経緯
+ http://togetter.com/li/111529
+
+ v. 本データは、Web上からクロールした大量の文書データから抽出した
+ 表記とそれに対応する読み仮名のデータを含んでいる。
+
+ 抽出した表記とそれに対応する読み仮名の組は、上記の i. から iv.
+ の言語資源の組み合わせによって得られる組のみを採録した。
+
+ Web 上に文書データを公開して下さっている皆様に感謝いたします。
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
packages.txt ADDED
@@ -0,0 +1,2 @@
+ mecab
+ libmecab-dev
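
packages.txt is how a Space requests apt-level packages: `fugashi` links against the system `libmecab` installed here, while the NEologd dictionary itself is committed to the repo. As a local sanity check, something like the sketch below should tokenize once those packages are installed (assuming the apt package's default `/etc/mecabrc` is present; app.py instead writes its own temporary rcfile):

```python
from pathlib import Path

import fugashi

# The dictionary directory bundled in this repo.
dic_dir = Path(__file__).parent / "mecab-ipadic-neologd"
tagger = fugashi.GenericTagger(f"-d {dic_dir}")
print(tagger.parse("漢字仮名交じりテキスト"))
```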
pyproject.toml ADDED
@@ -0,0 +1,21 @@
+ [project]
+ name = "furigana-from-speech-and-text"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.11"
+ dependencies = [
+     "espnet>=202412",
+     "espnet-model-zoo>=0.1.7",
+     "fugashi>=1.5.1",
+     "gradio>=5.34.2",
+     "librosa>=0.9.2",
+     "numpy>=1.23.5",
+     "torch>=2.7.1",
+     "torchaudio>=2.7.1",
+ ]
+
+ [dependency-groups]
+ dev = [
+     "ruff>=0.12.0",
+ ]
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ espnet>=202412
+ espnet-model-zoo>=0.1.7
+ fugashi>=1.5.1
+ gradio>=5.34.2
+ librosa>=0.9.2
+ numpy>=1.23.5
+ torch>=2.7.1
+ torchaudio>=2.7.1
uv.lock ADDED
The diff for this file is too large to render. See raw diff