ngxson's picture
ngxson HF staff
add comments on pipeline
983ba85
import { Podcast, PodcastTurn } from './types';
import {
addNoise,
addSilence,
generateAudio,
joinAudio,
loadWavAndDecode,
} from './utils';
// taken from https://freesound.org/people/artxmp1/sounds/660540
import openingSoundSrc from '../opening-sound.wav';
/**
 * One unit of work in the generation pipeline: a single podcast turn together
 * with the decoded audio produced for it.
 */
export interface GenerationStep {
/** The podcast turn this step renders. */
turn: PodcastTurn;
/** Decoded audio for the turn; left unset until the pipeline has generated it. */
audioBuffer?: AudioBuffer;
}
/**
 * Options accepted by `pipelineGeneratePodcast`.
 */
export interface PodcastGenerationOptions {
/** The podcast script to render; the pipeline reads its `speakerNames` and `turns`. */
podcast: Podcast;
/** Voice passed to `generateAudio` for turns spoken by the first entry of `speakerNames`. */
speaker1: string;
/** Voice passed to `generateAudio` for every other turn. */
speaker2: string;
/** Speed value forwarded unchanged to `generateAudio`. */
speed: number;
/**
 * When true, the opening sound is joined in front of the first turn;
 * otherwise a short silence (200, presumably milliseconds) is added at the start instead.
 */
isAddIntroMusic: boolean;
/** When true, `addNoise` is applied to the whole assembled audio. */
isAddNoise: boolean;
}
/**
 * Generates the audio of a whole podcast by synthesizing every turn in order
 * and stitching the results into a single buffer.
 *
 * Pipeline:
 * 1. Normalize each turn on a copy (clamp the gap, trim the text, replace
 *    curly quotes with ASCII quotes). The caller's `podcast` is not mutated,
 *    so running the pipeline repeatedly on the same podcast yields the same
 *    gaps every time.
 * 2. For each turn, call `generateAudio` and decode its result with
 *    `loadWavAndDecode`.
 * 3. Join the decoded turn to the audio accumulated so far, using the gap
 *    requested by the previous turn. The first turn is preceded either by the
 *    opening sound or by a short silence.
 * 4. Optionally apply `addNoise` to the final result.
 *
 * @param options - Podcast, voices, speed and post-processing flags.
 * @param onUpdate - Progress callback; called with `(0, total)` before
 *   generation starts and with `(done, total)` after each turn is appended.
 * @returns The assembled audio buffer.
 * @throws Error with message `outputWav is undefined` when the podcast has no
 *   turns, so no audio could be produced.
 */
export const pipelineGeneratePodcast = async (
  {
    podcast,
    speaker1,
    speaker2,
    speed,
    isAddIntroMusic,
    isAddNoise,
  }: PodcastGenerationOptions,
  onUpdate: (done: number, total: number) => void
): Promise<AudioBuffer> => {
  const { speakerNames, turns } = podcast;

  // Work on normalized copies so the caller's turns stay untouched.
  const steps: GenerationStep[] = turns.map((turn) => ({
    turn: {
      ...turn,
      // normalize the gap, make it not too long or too short
      nextGapMilisecs:
        Math.max(-600, Math.min(300, turn.nextGapMilisecs)) - 100,
      // normalize text input for TTS: curly quotes become ASCII quotes.
      // Unicode escapes are used so the patterns survive re-encoding of
      // this source file.
      text: turn.text
        .trim()
        .replace(/\u2019/g, "'")
        .replace(/[\u201C\u201D]/g, '"'),
    },
  }));
  onUpdate(0, steps.length);

  let outputWav: AudioBuffer | undefined;
  // Gap requested by the previously appended turn.
  let previousGap = 0;
  let done = 0;

  // generate audio for each step (aka each turn)
  for (const step of steps) {
    const { turn } = step;
    // Only the first speaker name maps to speaker1; any other name,
    // including one that is not listed at all, falls back to speaker2.
    const speakerVoice =
      speakerNames.indexOf(turn.speakerName as string) === 0
        ? speaker1
        : speaker2;
    const url = await generateAudio(turn.text, speakerVoice, speed);
    const audioBuffer = await loadWavAndDecode(url);
    step.audioBuffer = audioBuffer;

    if (outputWav === undefined) {
      // First turn: nothing has been accumulated yet.
      if (isAddIntroMusic) {
        // add intro music at the beginning to make it feel like a radio station
        const openingSound = await loadWavAndDecode(openingSoundSrc);
        outputWav = joinAudio(openingSound, audioBuffer, -2000);
      } else {
        // if there is no intro music, add a little silence at the beginning
        outputWav = addSilence(audioBuffer, true, 200);
      }
    } else {
      outputWav = joinAudio(outputWav, audioBuffer, previousGap);
    }

    previousGap = turn.nextGapMilisecs;
    done += 1;
    onUpdate(done, steps.length);
  }

  // Guard before any post-processing so an empty podcast fails with a clear
  // error instead of passing `undefined` to addNoise.
  if (outputWav === undefined) {
    throw new Error('outputWav is undefined');
  }

  // small nit: adding slight background noise to the whole audio makes it
  // sound more natural
  return isAddNoise ? addNoise(outputWav, 0.002) : outputWav;
};