Commit: bug fixed
Browse files
Changed files:
- README.md +1 -1
- config/cli/yaoyin_test.yaml +1 -0
- modules/asr.py +2 -7
- modules/melody.py +3 -2
- modules/svs/espnet.py +2 -2
- modules/utils/g2p.py +1 -0
- modules/utils/text_normalize.py +3 -2
- pipeline.py +2 -2
README.md
CHANGED
@@ -47,7 +47,7 @@ pip install -r requirements.txt
|
|
47 |
#### Example Usage
|
48 |
|
49 |
```bash
|
50 |
-
python cli.py --query_audio
|
51 |
```
|
52 |
|
53 |
#### Parameter Description
|
|
|
47 |
#### Example Usage
|
48 |
|
49 |
```bash
|
50 |
+
python cli.py --query_audio tests/audio/hello.wav --config_path config/cli/yaoyin_default.yaml --output_audio outputs/yaoyin_hello.wav
|
51 |
```
|
52 |
|
53 |
#### Parameter Description
|
config/cli/yaoyin_test.yaml
CHANGED
@@ -3,6 +3,7 @@ llm_model: google/gemma-2-2b
|
|
3 |
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
|
4 |
melody_source: sample-lyric-kising
|
5 |
language: mandarin
|
|
|
6 |
prompt_template_character: Yaoyin
|
7 |
speaker: 9
|
8 |
cache_dir: .cache
|
|
|
3 |
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
|
4 |
melody_source: sample-lyric-kising
|
5 |
language: mandarin
|
6 |
+
max_sentences: 1
|
7 |
prompt_template_character: Yaoyin
|
8 |
speaker: 9
|
9 |
cache_dir: .cache
|
modules/asr.py
CHANGED
@@ -57,10 +57,5 @@ class WhisperASR(AbstractASRModel):
|
|
57 |
|
58 |
def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
|
59 |
if audio_sample_rate != 16000:
|
60 |
-
|
61 |
-
|
62 |
-
except Exception as e:
|
63 |
-
breakpoint()
|
64 |
-
print(f"Error resampling audio: {e}")
|
65 |
-
audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
|
66 |
-
return self.pipe(audio, generate_kwargs={"language": language}).get("text", "")
|
|
|
57 |
|
58 |
def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
|
59 |
if audio_sample_rate != 16000:
|
60 |
+
audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
|
61 |
+
return self.pipe(audio, generate_kwargs={"language": language}, return_timestamps=False).get("text", "")
|
|
|
|
|
|
|
|
|
|
modules/melody.py
CHANGED
@@ -109,9 +109,10 @@ class MelodyController:
|
|
109 |
if pitch == 0:
|
110 |
score.append((st, ed, ref_lyric, pitch))
|
111 |
elif ref_lyric in ["-", "——"] and align_type == "lyric":
|
112 |
-
score.append((st, ed,
|
113 |
-
text_idx += 1
|
114 |
else:
|
115 |
score.append((st, ed, text_list[text_idx], pitch))
|
116 |
text_idx += 1
|
|
|
|
|
117 |
return score
|
|
|
109 |
if pitch == 0:
|
110 |
score.append((st, ed, ref_lyric, pitch))
|
111 |
elif ref_lyric in ["-", "——"] and align_type == "lyric":
|
112 |
+
score.append((st, ed, "-", pitch))
|
|
|
113 |
else:
|
114 |
score.append((st, ed, text_list[text_idx], pitch))
|
115 |
text_idx += 1
|
116 |
+
if text_idx >= len(text_list):
|
117 |
+
break
|
118 |
return score
|
modules/svs/espnet.py
CHANGED
@@ -53,7 +53,7 @@ class ESPNetSVS(AbstractSVSModel):
|
|
53 |
phoneme_mappers = {}
|
54 |
return phoneme_mappers
|
55 |
|
56 |
-
def _preprocess(self, score: list[tuple[float, float, str, int]], language: str):
|
57 |
if language not in self.phoneme_mappers:
|
58 |
raise ValueError(f"Unsupported language: {language} for {self.model_id}")
|
59 |
phoneme_mapper = self.phoneme_mappers[language]
|
@@ -99,7 +99,7 @@ class ESPNetSVS(AbstractSVSModel):
|
|
99 |
return batch
|
100 |
|
101 |
def synthesize(
|
102 |
-
self, score: list[tuple[float, float, str, int]], language: str, speaker: str, **kwargs
|
103 |
):
|
104 |
batch = self._preprocess(score, language)
|
105 |
if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
|
|
|
53 |
phoneme_mappers = {}
|
54 |
return phoneme_mappers
|
55 |
|
56 |
+
def _preprocess(self, score: list[tuple[float, float, str, int] | tuple[float, float, str, float]], language: str):
|
57 |
if language not in self.phoneme_mappers:
|
58 |
raise ValueError(f"Unsupported language: {language} for {self.model_id}")
|
59 |
phoneme_mapper = self.phoneme_mappers[language]
|
|
|
99 |
return batch
|
100 |
|
101 |
def synthesize(
|
102 |
+
self, score: list[tuple[float, float, str, float] | tuple[float, float, str, int]], language: str, speaker: str, **kwargs
|
103 |
):
|
104 |
batch = self._preprocess(score, language)
|
105 |
if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
|
modules/utils/g2p.py
CHANGED
@@ -32,6 +32,7 @@ for plan in ace_phonemes_all_plans["plans"]:
|
|
32 |
|
33 |
|
34 |
def preprocess_text(text: str, language: str) -> list[str]:
|
|
|
35 |
if language == "mandarin":
|
36 |
text_list = to_pinyin(text)
|
37 |
elif language == "japanese":
|
|
|
32 |
|
33 |
|
34 |
def preprocess_text(text: str, language: str) -> list[str]:
|
35 |
+
text = text.replace(" ", "")
|
36 |
if language == "mandarin":
|
37 |
text_list = to_pinyin(text)
|
38 |
elif language == "japanese":
|
modules/utils/text_normalize.py
CHANGED
@@ -3,12 +3,13 @@ from typing import Optional
|
|
3 |
|
4 |
|
5 |
def remove_non_zh_jp(text: str) -> str:
|
6 |
-
pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef]"
|
7 |
return re.sub(pattern, "", text)
|
8 |
|
9 |
|
10 |
def truncate_sentences(text: str, max_sentences: int) -> str:
|
11 |
-
sentences = re.split(r"(?<=[。!?!?~])", text)  [NOTE(review): line truncated in the diff extraction; reconstructed from the replacement line below, which only adds the newline/double-space alternatives — confirm against repository history]
|
|
|
12 |
return "".join(sentences[:max_sentences]).strip()
|
13 |
|
14 |
|
|
|
3 |
|
4 |
|
5 |
def remove_non_zh_jp(text: str) -> str:
|
6 |
+
pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef\s]"
|
7 |
return re.sub(pattern, "", text)
|
8 |
|
9 |
|
10 |
def truncate_sentences(text: str, max_sentences: int) -> str:
|
11 |
+
sentences = re.split(r"(?<=[。!?!?~])|(?:\n+)|(?: {2,})", text)
|
12 |
+
sentences = [s.strip() for s in sentences if s.strip()]
|
13 |
return "".join(sentences[:max_sentences]).strip()
|
14 |
|
15 |
|
pipeline.py
CHANGED
@@ -29,6 +29,7 @@ class SingingDialoguePipeline:
|
|
29 |
self.melody_controller = MelodyController(
|
30 |
config["melody_source"], self.cache_dir
|
31 |
)
|
|
|
32 |
self.track_latency = config.get("track_latency", False)
|
33 |
self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
|
34 |
|
@@ -75,8 +76,7 @@ class SingingDialoguePipeline:
|
|
75 |
if self.track_latency:
|
76 |
llm_end_time = time.time()
|
77 |
llm_latency = llm_end_time - llm_start_time
|
78 |
-
|
79 |
-
llm_response = clean_llm_output(output, language=language)
|
80 |
score = self.melody_controller.generate_score(llm_response, language)
|
81 |
if self.track_latency:
|
82 |
svs_start_time = time.time()
|
|
|
29 |
self.melody_controller = MelodyController(
|
30 |
config["melody_source"], self.cache_dir
|
31 |
)
|
32 |
+
self.max_sentences = config.get("max_sentences", 2)
|
33 |
self.track_latency = config.get("track_latency", False)
|
34 |
self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
|
35 |
|
|
|
76 |
if self.track_latency:
|
77 |
llm_end_time = time.time()
|
78 |
llm_latency = llm_end_time - llm_start_time
|
79 |
+
llm_response = clean_llm_output(output, language=language, max_sentences=self.max_sentences)
|
|
|
80 |
score = self.melody_controller.generate_score(llm_response, language)
|
81 |
if self.track_latency:
|
82 |
svs_start_time = time.time()
|