Spaces:

ngxson
/

kokoro-podcast-generator

Running

ngxson HF staff committed 22 days ago

Commit

983ba85

1 Parent(s): 9eb519e

add comments on pipeline

Files changed (1) hide show

front/src/utils/pipeline.ts CHANGED Viewed

@@ -38,9 +38,10 @@ export const pipelineGeneratePodcast = async (
   let outputWav: AudioBuffer;
   const { speakerNames, turns } = podcast;
   for (const turn of turns) {
-    // normalize it
     turn.nextGapMilisecs =
       Math.max(-600, Math.min(300, turn.nextGapMilisecs)) - 100;
     turn.text = turn.text
       .trim()
       .replace(/’/g, "'")
@@ -49,6 +50,7 @@ export const pipelineGeneratePodcast = async (
   }
   const steps: GenerationStep[] = turns.map((turn) => ({ turn }));
   onUpdate(0, steps.length);
   for (let i = 0; i < steps.length; i++) {
     const step = steps[i];
     const speakerIdx = speakerNames.indexOf(step.turn.speakerName as string) as
@@ -60,9 +62,11 @@ export const pipelineGeneratePodcast = async (
     if (i === 0) {
       outputWav = step.audioBuffer;
       if (isAddIntroMusic) {
         const openingSound = await loadWavAndDecode(openingSoundSrc);
         outputWav = joinAudio(openingSound, outputWav!, -2000);
       } else {
         outputWav = addSilence(outputWav!, true, 200);
       }
     } else {
@@ -76,6 +80,7 @@ export const pipelineGeneratePodcast = async (
     onUpdate(i + 1, steps.length);
   }
   if (isAddNoise) {
     outputWav = addNoise(outputWav!, 0.002);
   }
   // @ts-expect-error this is fine

   let outputWav: AudioBuffer;
   const { speakerNames, turns } = podcast;
   for (const turn of turns) {
+    // normalize the gap, make it not too long or too short
     turn.nextGapMilisecs =
       Math.max(-600, Math.min(300, turn.nextGapMilisecs)) - 100;
+    // normalize text input for TTS
     turn.text = turn.text
       .trim()
       .replace(/’/g, "'")
   }
   const steps: GenerationStep[] = turns.map((turn) => ({ turn }));
   onUpdate(0, steps.length);
+  // generate audio for each step (aka each turn)
   for (let i = 0; i < steps.length; i++) {
     const step = steps[i];
     const speakerIdx = speakerNames.indexOf(step.turn.speakerName as string) as
     if (i === 0) {
       outputWav = step.audioBuffer;
       if (isAddIntroMusic) {
+        // add intro music at the beginning to make it feel like a radio station
         const openingSound = await loadWavAndDecode(openingSoundSrc);
         outputWav = joinAudio(openingSound, outputWav!, -2000);
       } else {
+        // if there is no intro music, add a little silence at the beginning
         outputWav = addSilence(outputWav!, true, 200);
       }
     } else {
     onUpdate(i + 1, steps.length);
   }
   if (isAddNoise) {
+    // small nit: adding a little background noise to the whole audio makes it sound more natural
     outputWav = addNoise(outputWav!, 0.002);
   }
   // @ts-expect-error this is fine