import gradio as gr
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
import librosa
import torch
import os
from huggingface_hub import login
# Read the Hugging Face token from the environment variables
token = os.getenv("HF_TOKEN")
if token:
    login(token=token)
else:
    raise ValueError("The Hugging Face token is not set in the environment variables.")
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the model and processor
repo_name = "HugoZeballos/rapa_nui_asr_2"  # adjust to your model's name on Hugging Face
processor = Speech2TextProcessor.from_pretrained(repo_name)
model = Speech2TextForConditionalGeneration.from_pretrained(repo_name).to(device)
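# Optional: put the model in eval mode for inference (disables dropout)
model.eval()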
# Input/output components (`source` is omitted; it is not a valid gr.Audio
# argument in recent Gradio versions)
inputs = gr.Audio(type="filepath")
outputs = gr.Textbox(label="Transcription")
def transcribe(audio_path):
    # Resample the recording to the 16 kHz rate the model expects
    audio, sr = librosa.load(audio_path, sr=16000)
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding="longest").to(device)
    with torch.no_grad():
        predicted_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription
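# Quick local smoke test (commented out; the sample path is hypothetical):
# print(transcribe("sample.wav"))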
# Build the Gradio interface
interface = gr.Interface(
    fn=transcribe,
    inputs=inputs,
    outputs=outputs,
    title="ASR Demo",
)
# Run the app
if __name__ == "__main__":
    interface.launch()
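# Querying the deployed Space from another script (a sketch, assuming a recent
# gradio_client is installed and the Space id matches repo_name; the audio
# path is hypothetical):
# from gradio_client import Client, handle_file
# client = Client("HugoZeballos/rapa_nui_asr_2")
# print(client.predict(handle_file("sample.wav"), api_name="/predict"))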