Spaces:

Mistral-AI-Game-Jam
/

Team15

Sleeping

File size: 2,888 Bytes

# Code copied from https://elevenlabs.io/docs/cookbooks/text-to-speech/streaming

import base64
from io import BytesIO
from typing import IO

import yaml
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs

from hackathon.config import settings

# Shared ElevenLabs client, built once at import time from the project settings.
client = ElevenLabs(api_key=settings.ELEVENLABS_API_KEY)

# Maps the voice names used by this module to ElevenLabs voice IDs.
voices = {
    "politician1": "ohZqJahxofk8dkPKmd9F",
    "politician2": "v7sy7EHXxN3ToffFQfvr",
}
# Spare voice ID kept "just in case": ohZqJahxofk8dkPKmd9F
# (currently the same ID as politician1).


def read_audio_config(yaml_path: str) -> dict:
    """Load and return the parsed contents of the YAML file at ``yaml_path``.

    Args:
        yaml_path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file contents.

    Raises:
        FileNotFoundError: If ``yaml_path`` does not exist.
        ValueError: If the file cannot be parsed as YAML.
    """
    try:
        # Explicit UTF-8 so the result does not depend on the platform's
        # default text encoding.
        with open(yaml_path, "r", encoding="utf-8") as file:
            return yaml.safe_load(file)
    except FileNotFoundError as e:
        # Chain the original error so the low-level details stay in the traceback.
        raise FileNotFoundError(
            f"The file at path '{yaml_path}' does not exist."
        ) from e
    except yaml.YAMLError as e:
        raise ValueError(f"Error parsing YAML file: {e}") from e


def read_audio_file(audio_path: str):
    """Return the bytes of the file at ``audio_path`` as a base64 text string."""
    with open(audio_path, "rb") as handle:
        raw_bytes = handle.read()
    # b64encode yields bytes; decode so the caller receives a plain str.
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def text_to_speech_file(
    text: str,
    voice_id: str,
    stability=0.5,
    similarity=1.0,
    style=0.3,
    base_path="audio_store",
) -> str:
    """Convert ``text`` to speech and return the MP3 audio as a base64 string.

    Despite the name, nothing is written to disk: the audio chunks returned by
    the API are collected in memory and base64-encoded.

    Args:
        text: The text to synthesize.
        voice_id: ElevenLabs voice ID, passed to the API unchanged. This is a
            raw ID, not a key of the module-level ``voices`` mapping.
        stability: Forwarded as ``VoiceSettings.stability``.
        similarity: Forwarded as ``VoiceSettings.similarity_boost``.
        style: Forwarded as ``VoiceSettings.style``.
        base_path: Unused by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The audio bytes, base64-encoded and decoded to a UTF-8 ``str``.
    """
    response = client.text_to_speech.convert(
        voice_id=voice_id,
        output_format="mp3_44100_32",
        text=text,
        model_id="eleven_turbo_v2_5",  # use the turbo model for low latency
        voice_settings=VoiceSettings(
            # These were previously hard-coded to 0.5 / 1.0 / 0.3, which
            # silently ignored the caller's arguments. The defaults match
            # those values, so default callers get the same settings.
            stability=stability,
            similarity_boost=similarity,
            style=style,
            use_speaker_boost=True,
        ),
    )

    # Skip empty chunks and concatenate the rest.
    audio_bytes = b"".join(chunk for chunk in response if chunk)
    return base64.b64encode(audio_bytes).decode("utf-8")


def text_to_speech_stream(
    text: str, voice: str, stability=0.5, similarity=1.0, style=0.3
) -> IO[bytes]:
    """Convert ``text`` to speech and return the MP3 audio as an in-memory stream.

    Args:
        text: The text to synthesize.
        voice: Key into the module-level ``voices`` mapping
            (``"politician1"`` or ``"politician2"``).
        stability: Accepted but currently not used (see note below).
        similarity: Accepted but currently not used (see note below).
        style: Accepted but currently not used (see note below).

    Returns:
        A ``BytesIO`` holding the full audio, positioned at the start.

    Raises:
        KeyError: If ``voice`` is not a key of ``voices``.
    """
    # NOTE(review): the voice settings below are fixed at 0.0 / 1.0 / 0.0 and
    # do not read the stability/similarity/style arguments. Wiring the
    # arguments through would change the output for callers relying on the
    # signature defaults (0.5 / 1.0 / 0.3) - confirm intent before changing.
    fixed_settings = VoiceSettings(
        stability=0.0,
        similarity_boost=1.0,
        style=0.0,
        use_speaker_boost=True,
    )
    chunks = client.text_to_speech.convert(
        voice_id=voices[voice],
        output_format="mp3_22050_32",
        text=text,
        model_id="eleven_multilingual_v2",
        voice_settings=fixed_settings,
    )

    # Gather every non-empty chunk; a BytesIO built from bytes starts at
    # position 0, so the caller can read it straight away.
    return BytesIO(b"".join(part for part in chunks if part))