Commit: bug fixed
Browse files
Changed files:
- README.md +1 -1
- config/cli/yaoyin_test.yaml +1 -0
- modules/asr.py +2 -7
- modules/melody.py +3 -2
- modules/svs/espnet.py +2 -2
- modules/utils/g2p.py +1 -0
- modules/utils/text_normalize.py +3 -2
- pipeline.py +2 -2
README.md
CHANGED
@@ -47,7 +47,7 @@ pip install -r requirements.txt
|
|
47 |
#### Example Usage
|
48 |
|
49 |
```bash
|
50 |
-
python cli.py --query_audio
|
51 |
```
|
52 |
|
53 |
#### Parameter Description
|
|
|
47 |
#### Example Usage
|
48 |
|
49 |
```bash
|
50 |
+
python cli.py --query_audio tests/audio/hello.wav --config_path config/cli/yaoyin_default.yaml --output_audio outputs/yaoyin_hello.wav
|
51 |
```
|
52 |
|
53 |
#### Parameter Description
|
config/cli/yaoyin_test.yaml
CHANGED
@@ -3,6 +3,7 @@ llm_model: google/gemma-2-2b
|
|
3 |
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
|
4 |
melody_source: sample-lyric-kising
|
5 |
language: mandarin
|
|
|
6 |
prompt_template_character: Yaoyin
|
7 |
speaker: 9
|
8 |
cache_dir: .cache
|
|
|
3 |
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
|
4 |
melody_source: sample-lyric-kising
|
5 |
language: mandarin
|
6 |
+
max_sentences: 1
|
7 |
prompt_template_character: Yaoyin
|
8 |
speaker: 9
|
9 |
cache_dir: .cache
|
modules/asr.py
CHANGED
@@ -57,10 +57,5 @@ class WhisperASR(AbstractASRModel):
|
|
57 |
|
58 |
def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
|
59 |
if audio_sample_rate != 16000:
|
60 |
-
|
61 |
-
|
62 |
-
except Exception as e:
|
63 |
-
breakpoint()
|
64 |
-
print(f"Error resampling audio: {e}")
|
65 |
-
audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
|
66 |
-
return self.pipe(audio, generate_kwargs={"language": language}).get("text", "")
|
|
|
57 |
|
58 |
def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
|
59 |
if audio_sample_rate != 16000:
|
60 |
+
audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
|
61 |
+
return self.pipe(audio, generate_kwargs={"language": language}, return_timestamps=False).get("text", "")
|
|
|
|
|
|
|
|
|
|
modules/melody.py
CHANGED
@@ -109,9 +109,10 @@ class MelodyController:
|
|
109 |
if pitch == 0:
|
110 |
score.append((st, ed, ref_lyric, pitch))
|
111 |
elif ref_lyric in ["-", "——"] and align_type == "lyric":
|
112 |
-
score.append((st, ed,
|
113 |
-
text_idx += 1
|
114 |
else:
|
115 |
score.append((st, ed, text_list[text_idx], pitch))
|
116 |
text_idx += 1
|
|
|
|
|
117 |
return score
|
|
|
109 |
if pitch == 0:
|
110 |
score.append((st, ed, ref_lyric, pitch))
|
111 |
elif ref_lyric in ["-", "——"] and align_type == "lyric":
|
112 |
+
score.append((st, ed, "-", pitch))
|
|
|
113 |
else:
|
114 |
score.append((st, ed, text_list[text_idx], pitch))
|
115 |
text_idx += 1
|
116 |
+
if text_idx >= len(text_list):
|
117 |
+
break
|
118 |
return score
|
modules/svs/espnet.py
CHANGED
@@ -53,7 +53,7 @@ class ESPNetSVS(AbstractSVSModel):
|
|
53 |
phoneme_mappers = {}
|
54 |
return phoneme_mappers
|
55 |
|
56 |
-
def _preprocess(self, score: list[tuple[float, float, str, int]], language: str):
|
57 |
if language not in self.phoneme_mappers:
|
58 |
raise ValueError(f"Unsupported language: {language} for {self.model_id}")
|
59 |
phoneme_mapper = self.phoneme_mappers[language]
|
@@ -99,7 +99,7 @@ class ESPNetSVS(AbstractSVSModel):
|
|
99 |
return batch
|
100 |
|
101 |
def synthesize(
|
102 |
-
self, score: list[tuple[float, float, str, int]], language: str, speaker: str, **kwargs
|
103 |
):
|
104 |
batch = self._preprocess(score, language)
|
105 |
if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
|
|
|
53 |
phoneme_mappers = {}
|
54 |
return phoneme_mappers
|
55 |
|
56 |
+
def _preprocess(self, score: list[tuple[float, float, str, int] | tuple[float, float, str, float]], language: str):
|
57 |
if language not in self.phoneme_mappers:
|
58 |
raise ValueError(f"Unsupported language: {language} for {self.model_id}")
|
59 |
phoneme_mapper = self.phoneme_mappers[language]
|
|
|
99 |
return batch
|
100 |
|
101 |
def synthesize(
|
102 |
+
self, score: list[tuple[float, float, str, float] | tuple[float, float, str, int]], language: str, speaker: str, **kwargs
|
103 |
):
|
104 |
batch = self._preprocess(score, language)
|
105 |
if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
|
modules/utils/g2p.py
CHANGED
@@ -32,6 +32,7 @@ for plan in ace_phonemes_all_plans["plans"]:
|
|
32 |
|
33 |
|
34 |
def preprocess_text(text: str, language: str) -> list[str]:
|
|
|
35 |
if language == "mandarin":
|
36 |
text_list = to_pinyin(text)
|
37 |
elif language == "japanese":
|
|
|
32 |
|
33 |
|
34 |
def preprocess_text(text: str, language: str) -> list[str]:
|
35 |
+
text = text.replace(" ", "")
|
36 |
if language == "mandarin":
|
37 |
text_list = to_pinyin(text)
|
38 |
elif language == "japanese":
|
modules/utils/text_normalize.py
CHANGED
@@ -3,12 +3,13 @@ from typing import Optional
|
|
3 |
|
4 |
|
5 |
def remove_non_zh_jp(text: str) -> str:
|
6 |
-
pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef]"
|
7 |
return re.sub(pattern, "", text)
|
8 |
|
9 |
|
10 |
def truncate_sentences(text: str, max_sentences: int) -> str:
|
11 |
-
sentences = re.split(r"(?<=[。!?!?~])", text)  [NOTE(review): line truncated in the diff extraction; reconstructed from the replacement line below, which only adds the newline/double-space alternatives — confirm against repository history]
|
|
|
12 |
return "".join(sentences[:max_sentences]).strip()
|
13 |
|
14 |
|
|
|
3 |
|
4 |
|
5 |
def remove_non_zh_jp(text: str) -> str:
|
6 |
+
pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef\s]"
|
7 |
return re.sub(pattern, "", text)
|
8 |
|
9 |
|
10 |
def truncate_sentences(text: str, max_sentences: int) -> str:
|
11 |
+
sentences = re.split(r"(?<=[。!?!?~])|(?:\n+)|(?: {2,})", text)
|
12 |
+
sentences = [s.strip() for s in sentences if s.strip()]
|
13 |
return "".join(sentences[:max_sentences]).strip()
|
14 |
|
15 |
|
pipeline.py
CHANGED
@@ -29,6 +29,7 @@ class SingingDialoguePipeline:
|
|
29 |
self.melody_controller = MelodyController(
|
30 |
config["melody_source"], self.cache_dir
|
31 |
)
|
|
|
32 |
self.track_latency = config.get("track_latency", False)
|
33 |
self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
|
34 |
|
@@ -75,8 +76,7 @@ class SingingDialoguePipeline:
|
|
75 |
if self.track_latency:
|
76 |
llm_end_time = time.time()
|
77 |
llm_latency = llm_end_time - llm_start_time
|
78 |
-
|
79 |
-
llm_response = clean_llm_output(output, language=language)
|
80 |
score = self.melody_controller.generate_score(llm_response, language)
|
81 |
if self.track_latency:
|
82 |
svs_start_time = time.time()
|
|
|
29 |
self.melody_controller = MelodyController(
|
30 |
config["melody_source"], self.cache_dir
|
31 |
)
|
32 |
+
self.max_sentences = config.get("max_sentences", 2)
|
33 |
self.track_latency = config.get("track_latency", False)
|
34 |
self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
|
35 |
|
|
|
76 |
if self.track_latency:
|
77 |
llm_end_time = time.time()
|
78 |
llm_latency = llm_end_time - llm_start_time
|
79 |
+
llm_response = clean_llm_output(output, language=language, max_sentences=self.max_sentences)
|
|
|
80 |
score = self.melody_controller.generate_score(llm_response, language)
|
81 |
if self.track_latency:
|
82 |
svs_start_time = time.time()
|