File size: 2,885 Bytes
837713d
 
c3b86ed
ef0447a
 
 
5ca5d91
837713d
 
 
 
 
8b1a278
837713d
 
 
 
 
c3b86ed
 
837713d
2b1d793
837713d
5847fd8
 
 
 
 
837713d
 
 
 
 
 
 
 
 
5ca5d91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5847fd8
 
837713d
 
6ac982b
f75013b
4f6557b
 
 
 
 
 
5847fd8
 
 
 
 
837713d
5847fd8
837713d
 
4f6557b
6ac982b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import gradio as gr
import torch
import os
from io import BytesIO
import base64
import numpy as np
from pydub import AudioSegment
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed

# Prefer the first CUDA GPU when available; otherwise run inference on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Hugging Face Hub repo that provides the Parler-TTS mini checkpoint.
repo_id = "parler-tts/parler_tts_mini_v0.1"

# Loaded once at import time; the first run downloads the weights from the Hub.
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Shared-secret gate for the endpoint; configured via the SECRET_TOKEN env var.
SECRET_TOKEN = os.getenv('SECRET_TOKEN', 'default_secret')

SAMPLE_RATE = feature_extractor.sampling_rate  # output audio rate in Hz
SEED = 42  # fixed RNG seed so sampled generation is reproducible per call

def gen_tts(secret_token, text, description):
    """Synthesize `text` with Parler-TTS, conditioned on a free-text voice
    `description`, and return the audio as a base64 MP3 data URI.

    Parameters
    ----------
    secret_token : str
        Must equal the module-level SECRET_TOKEN, otherwise the call is
        rejected.
    text : str
        The prompt text to speak.
    description : str
        Natural-language description of the desired voice/style.

    Returns
    -------
    str
        A ``data:audio/mp3;base64,...`` URI containing the MP3 bytes.

    Raises
    ------
    gr.Error
        If the supplied token does not match SECRET_TOKEN.
    """
    if secret_token != SECRET_TOKEN:
        raise gr.Error(
            'Invalid secret token. Please fork the original space if you want to use it for yourself.')

    inputs = tokenizer(description, return_tensors="pt").to(device)
    prompt = tokenizer(text, return_tensors="pt").to(device)

    # Fixed seed so repeated calls with identical inputs produce identical
    # audio even though do_sample=True.
    set_seed(SEED)
    generation = model.generate(
        input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, do_sample=True, temperature=1.0
    )
    audio_arr = generation.cpu().numpy().squeeze()

    # BUG FIX: clamp to [-1, 1] before scaling to int16. The model can emit
    # samples slightly outside the nominal range; without clipping, the int16
    # cast wraps around (e.g. +1.01 becomes a large negative value), which is
    # audible as loud crackling.
    clipped = np.clip(audio_arr, -1.0, 1.0)
    samples = (clipped * (2**15 - 1)).astype(np.int16)
    sound = AudioSegment(
        samples.tobytes(),
        frame_rate=int(SAMPLE_RATE),  # pydub expects an integer frame rate
        sample_width=samples.dtype.itemsize,  # 2 bytes per int16 sample
        channels=1,  # output tensor is squeezed to a mono waveform
    )

    # Export the MP3 entirely in memory.
    buff_mp3 = BytesIO()
    sound.export(buff_mp3, format="mp3")
    buff_mp3.seek(0)

    # Base64-encode the MP3 bytes into a data URI the client can play directly.
    audio_base64 = base64.b64encode(buff_mp3.read()).decode('utf-8')
    return 'data:audio/mp3;base64,' + audio_base64


# Minimal headless UI: a full-screen overlay notice plus the plain fields the
# API client posts to. The click handler is the only functional wiring.
with gr.Blocks() as app:
    gr.HTML("""
        <div style="z-index: 100; position: fixed; top: 0px; right: 0px; left: 0px; bottom: 0px; width: 100%; height: 100%; background: white; display: flex; align-items: center; justify-content: center; color: black;">
        <div style="text-align: center; color: black;">
        <p style="color: black;">This space is a headless component of the cloud rendering engine used by AiTube.</p>
        <p style="color: black;">It is not available for public use, but you can use the <a href="https://huggingface.co/spaces/ByteDance/AnimateDiff-Lightning" target="_blank">original space</a>.</p>
        </div>
        </div>""")
    token_box = gr.Textbox(label="Secret token")
    text_box = gr.Textbox(label="Input Text")
    desc_box = gr.Textbox(label="Description")
    generate_btn = gr.Button("Generate Audio")
    result_box = gr.Textbox()  # receives the base64 data URI as plain text

    generate_btn.click(
        fn=gen_tts,
        inputs=[token_box, text_box, desc_box],
        outputs=[result_box],
        queue=True,
    )

app.queue()
app.launch()