Spaces:

ngxson
/

kokoro-podcast-generator

Running

File size: 2,651 Bytes

import { Podcast, PodcastTurn } from './types';
import {
  addNoise,
  addSilence,
  generateAudio,
  joinAudio,
  loadWavAndDecode,
} from './utils';

// taken from https://freesound.org/people/artxmp1/sounds/660540
import openingSoundSrc from '../opening-sound.wav';

/**
 * One unit of work in the generation pipeline: a podcast turn together with
 * the audio produced for it.
 */
export interface GenerationStep {
  /** The podcast turn this step synthesizes. */
  turn: PodcastTurn;
  /** Decoded audio for the turn; unset until the turn has been generated. */
  audioBuffer?: AudioBuffer;
}

/**
 * Options accepted by `pipelineGeneratePodcast`.
 */
export interface PodcastGenerationOptions {
  /** The podcast script: its speaker names and the turns to synthesize. */
  podcast: Podcast;
  /**
   * Voice passed to `generateAudio` for turns whose speaker is the first
   * entry of `podcast.speakerNames`.
   */
  speaker1: string;
  /** Voice passed to `generateAudio` for every other turn. */
  speaker2: string;
  /**
   * Forwarded unchanged to `generateAudio` for every turn.
   * NOTE(review): presumably a speech-rate multiplier — confirm in `./utils`.
   */
  speed: number;
  /**
   * When true, the opening sound is joined in front of the first turn;
   * when false, a short silence is added at the beginning instead.
   */
  isAddIntroMusic: boolean;
  /** When true, background noise is added to the whole generated audio. */
  isAddNoise: boolean;
}

/**
 * Synthesizes every turn of a podcast and joins the results into one
 * `AudioBuffer`.
 *
 * Turns are generated sequentially, in order. Each turn's audio is joined to
 * the running output using the (normalized) gap of the previous turn. The
 * caller's `podcast` object is never modified: normalization is applied to
 * private copies of the turns, so generating the same podcast repeatedly
 * always yields the same gaps and text.
 *
 * @param options - The podcast, the two voices, the speed and the
 *   post-processing flags. A turn whose speaker is the first entry of
 *   `podcast.speakerNames` uses `speaker1`; any other turn (including one
 *   whose speaker is not listed at all) uses `speaker2`.
 * @param onUpdate - Progress callback. Called once with `(0, total)` before
 *   generation starts and then with `(done, total)` after each finished turn.
 * @returns The joined audio for the whole podcast.
 * @throws Error if the podcast has no turns, so there is no audio to return.
 */
export const pipelineGeneratePodcast = async (
  {
    podcast,
    speaker1,
    speaker2,
    speed,
    isAddIntroMusic,
    isAddNoise,
  }: PodcastGenerationOptions,
  onUpdate: (done: number, total: number) => void
): Promise<AudioBuffer> => {
  const { speakerNames, turns } = podcast;

  // Build the steps from normalized COPIES of the turns. The gap formula
  // (clamp, then subtract 100) is not idempotent, so writing it back into the
  // caller's objects would shift the gaps further on every regeneration.
  const steps: GenerationStep[] = turns.map((turn) => ({
    turn: {
      ...turn,
      // normalize the gap, make it not too long or too short
      nextGapMilisecs:
        Math.max(-600, Math.min(300, turn.nextGapMilisecs)) - 100,
      // normalize text input for TTS: trim and replace curly quotes with
      // their plain ASCII counterparts
      text: turn.text
        .trim()
        .replace(/’/g, "'")
        .replace(/“/g, '"')
        .replace(/”/g, '"'),
    },
  }));
  onUpdate(0, steps.length);

  let outputWav: AudioBuffer | undefined;
  // generate audio for each step (aka each turn)
  for (let i = 0; i < steps.length; i++) {
    const step = steps[i];
    // indexOf returns -1 for an unknown speaker; such turns fall back to
    // speaker2, exactly like turns of the second speaker.
    const speakerIdx = speakerNames.indexOf(step.turn.speakerName as string);
    const speakerVoice = speakerIdx === 0 ? speaker1 : speaker2;
    const url = await generateAudio(step.turn.text, speakerVoice, speed);
    const turnAudio = await loadWavAndDecode(url);
    step.audioBuffer = turnAudio;

    if (outputWav === undefined) {
      // First turn: it starts the output.
      if (isAddIntroMusic) {
        // add intro music at the beginning to make it feel like a radio
        // station; the negative gap makes the first turn start before the
        // opening sound has ended
        const openingSound = await loadWavAndDecode(openingSoundSrc);
        outputWav = joinAudio(openingSound, turnAudio, -2000);
      } else {
        // if there is no intro music, add a little silence at the beginning
        outputWav = addSilence(turnAudio, true, 200);
      }
    } else {
      // Later turns: the gap to use is the one stored on the previous turn.
      const lastStep = steps[i - 1];
      outputWav = joinAudio(
        outputWav,
        turnAudio,
        lastStep.turn.nextGapMilisecs
      );
    }
    onUpdate(i + 1, steps.length);
  }

  // Guard before any post-processing so an empty podcast fails with a clear
  // message instead of passing an undefined buffer to addNoise.
  if (!outputWav) {
    throw new Error('outputWav is undefined');
  }
  if (isAddNoise) {
    // small nit: adding a little background noise to the whole audio makes
    // it sound more natural
    outputWav = addNoise(outputWav, 0.002);
  }
  return outputWav;
};