jhansss commited on
Commit
7a23964
·
1 Parent(s): 79ad7df
README.md CHANGED
@@ -47,7 +47,7 @@ pip install -r requirements.txt
47
  #### Example Usage
48
 
49
  ```bash
50
- python cli.py --query_audio data/query/hello.wav --config_path config/cli/yaoyin_default.yaml --output_audio outputs/yaoyin_hello.wav
51
  ```
52
 
53
  #### Parameter Description
 
47
  #### Example Usage
48
 
49
  ```bash
50
+ python cli.py --query_audio tests/audio/hello.wav --config_path config/cli/yaoyin_default.yaml --output_audio outputs/yaoyin_hello.wav
51
  ```
52
 
53
  #### Parameter Description
config/cli/yaoyin_test.yaml CHANGED
@@ -3,6 +3,7 @@ llm_model: google/gemma-2-2b
3
  svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
4
  melody_source: sample-lyric-kising
5
  language: mandarin
 
6
  prompt_template_character: Yaoyin
7
  speaker: 9
8
  cache_dir: .cache
 
3
  svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
4
  melody_source: sample-lyric-kising
5
  language: mandarin
6
+ max_sentences: 1
7
  prompt_template_character: Yaoyin
8
  speaker: 9
9
  cache_dir: .cache
modules/asr.py CHANGED
@@ -57,10 +57,5 @@ class WhisperASR(AbstractASRModel):
57
 
58
  def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
59
  if audio_sample_rate != 16000:
60
- try:
61
- audio, _ = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
62
- except Exception as e:
63
- breakpoint()
64
- print(f"Error resampling audio: {e}")
65
- audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
66
- return self.pipe(audio, generate_kwargs={"language": language}).get("text", "")
 
57
 
58
  def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
59
  if audio_sample_rate != 16000:
60
+ audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
61
+ return self.pipe(audio, generate_kwargs={"language": language}, return_timestamps=False).get("text", "")
 
 
 
 
 
modules/melody.py CHANGED
@@ -109,9 +109,10 @@ class MelodyController:
109
  if pitch == 0:
110
  score.append((st, ed, ref_lyric, pitch))
111
  elif ref_lyric in ["-", "——"] and align_type == "lyric":
112
- score.append((st, ed, ref_lyric, pitch))
113
- text_idx += 1
114
  else:
115
  score.append((st, ed, text_list[text_idx], pitch))
116
  text_idx += 1
 
 
117
  return score
 
109
  if pitch == 0:
110
  score.append((st, ed, ref_lyric, pitch))
111
  elif ref_lyric in ["-", "——"] and align_type == "lyric":
112
+ score.append((st, ed, "-", pitch))
 
113
  else:
114
  score.append((st, ed, text_list[text_idx], pitch))
115
  text_idx += 1
116
+ if text_idx >= len(text_list):
117
+ break
118
  return score
modules/svs/espnet.py CHANGED
@@ -53,7 +53,7 @@ class ESPNetSVS(AbstractSVSModel):
53
  phoneme_mappers = {}
54
  return phoneme_mappers
55
 
56
- def _preprocess(self, score: list[tuple[float, float, str, int]], language: str):
57
  if language not in self.phoneme_mappers:
58
  raise ValueError(f"Unsupported language: {language} for {self.model_id}")
59
  phoneme_mapper = self.phoneme_mappers[language]
@@ -99,7 +99,7 @@ class ESPNetSVS(AbstractSVSModel):
99
  return batch
100
 
101
  def synthesize(
102
- self, score: list[tuple[float, float, str, int]], language: str, speaker: str, **kwargs
103
  ):
104
  batch = self._preprocess(score, language)
105
  if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
 
53
  phoneme_mappers = {}
54
  return phoneme_mappers
55
 
56
+ def _preprocess(self, score: list[tuple[float, float, str, int] | tuple[float, float, str, float]], language: str):
57
  if language not in self.phoneme_mappers:
58
  raise ValueError(f"Unsupported language: {language} for {self.model_id}")
59
  phoneme_mapper = self.phoneme_mappers[language]
 
99
  return batch
100
 
101
  def synthesize(
102
+ self, score: list[tuple[float, float, str, float] | tuple[float, float, str, int]], language: str, speaker: str, **kwargs
103
  ):
104
  batch = self._preprocess(score, language)
105
  if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
modules/utils/g2p.py CHANGED
@@ -32,6 +32,7 @@ for plan in ace_phonemes_all_plans["plans"]:
32
 
33
 
34
  def preprocess_text(text: str, language: str) -> list[str]:
 
35
  if language == "mandarin":
36
  text_list = to_pinyin(text)
37
  elif language == "japanese":
 
32
 
33
 
34
  def preprocess_text(text: str, language: str) -> list[str]:
35
+ text = text.replace(" ", "")
36
  if language == "mandarin":
37
  text_list = to_pinyin(text)
38
  elif language == "japanese":
modules/utils/text_normalize.py CHANGED
@@ -3,12 +3,13 @@ from typing import Optional
3
 
4
 
5
  def remove_non_zh_jp(text: str) -> str:
6
- pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef]"
7
  return re.sub(pattern, "", text)
8
 
9
 
10
  def truncate_sentences(text: str, max_sentences: int) -> str:
11
- sentences = re.split(r"(?<=[。!?])", text)
 
12
  return "".join(sentences[:max_sentences]).strip()
13
 
14
 
 
3
 
4
 
5
  def remove_non_zh_jp(text: str) -> str:
6
+ pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef\s]"
7
  return re.sub(pattern, "", text)
8
 
9
 
10
def truncate_sentences(text: str, max_sentences: int) -> str:
    """Return at most ``max_sentences`` sentences of ``text``, concatenated.

    Sentence boundaries are: terminal punctuation 。!?!?~ (kept attached
    to their sentence via a zero-width lookbehind), runs of newlines, and
    runs of two or more spaces (both separator kinds are discarded).
    Empty fragments from consecutive separators are filtered out, so the
    count reflects non-empty sentences only.
    """
    sentences = re.split(r"(?<=[。!?!?~])|(?:\n+)|(?: {2,})", text)
    # Drop empty/whitespace-only fragments produced by adjacent separators.
    sentences = [s.strip() for s in sentences if s.strip()]
    return "".join(sentences[:max_sentences]).strip()
14
 
15
 
pipeline.py CHANGED
@@ -29,6 +29,7 @@ class SingingDialoguePipeline:
29
  self.melody_controller = MelodyController(
30
  config["melody_source"], self.cache_dir
31
  )
 
32
  self.track_latency = config.get("track_latency", False)
33
  self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
34
 
@@ -75,8 +76,7 @@ class SingingDialoguePipeline:
75
  if self.track_latency:
76
  llm_end_time = time.time()
77
  llm_latency = llm_end_time - llm_start_time
78
- print(f"llm output: {output}确认一下是不是不含prompt的")
79
- llm_response = clean_llm_output(output, language=language)
80
  score = self.melody_controller.generate_score(llm_response, language)
81
  if self.track_latency:
82
  svs_start_time = time.time()
 
29
  self.melody_controller = MelodyController(
30
  config["melody_source"], self.cache_dir
31
  )
32
+ self.max_sentences = config.get("max_sentences", 2)
33
  self.track_latency = config.get("track_latency", False)
34
  self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
35
 
 
76
  if self.track_latency:
77
  llm_end_time = time.time()
78
  llm_latency = llm_end_time - llm_start_time
79
+ llm_response = clean_llm_output(output, language=language, max_sentences=self.max_sentences)
 
80
  score = self.melody_controller.generate_score(llm_response, language)
81
  if self.track_latency:
82
  svs_start_time = time.time()