googlefan's picture
Update README.md
02cd5c3 verified
metadata
language:
  - ja
base_model:
  - google/gemma-2-2b-jpn-it
pipeline_tag: audio-text-to-text
license: gemma
datasets:
  - fixie-ai/common_voice_17_0
import transformers
import librosa
import torch
import numpy as np
from typing import Dict, Any

# Hub repository that provides both the model weights and the processor.
_REPO_ID = "neody/ultravox-gemma-2-2b-jpn-it"

# trust_remote_code=True lets transformers run the custom modeling/processing
# code shipped inside the repository; only use it with repositories you trust.
model = transformers.AutoModel.from_pretrained(_REPO_ID, trust_remote_code=True)
# Move the model to the GPU in bfloat16 (same device/dtype used for the inputs below).
model.to("cuda", dtype=torch.bfloat16)
processor = transformers.AutoProcessor.from_pretrained(
    _REPO_ID, trust_remote_code=True
)

# Load the recording, asking librosa for a 16 kHz sample rate.
path = "record.wav"
audio, sr = librosa.load(path, sr=16000)


def preprocess(inputs: Dict[str, Any], device, dtype):
    """Turn a chat history plus an optional audio clip into model inputs.

    Args:
        inputs: Request dictionary. The keys read here are:
            ``"turns"``: optional list of chat messages (dicts with at least
            a ``"role"`` key). The list is copied, never modified.
            ``"audio"``: optional audio samples. NumPy arrays of dtype
            float64, int16 or int32 are converted to float32 (integers are
            scaled by their full-scale value).
            ``"prompt"``: optional text of the user turn that is created
            when audio is given and the last turn is not a user turn;
            defaults to ``"<|audio|>"``.
            ``"sampling_rate"``: optional sampling rate of ``audio``;
            16000 is used when the key is absent.
        device: Device the processor output is moved to.
        dtype: Dtype passed to ``.to`` together with ``device``.

    Returns:
        The output of the module-level ``processor`` after
        ``.to(device, dtype)``.
    """
    # Copy the history so the caller's list is not mutated when the audio
    # turn is appended below; ``or []`` also tolerates an explicit None.
    turns: list = list(inputs.get("turns") or [])

    audio = inputs.get("audio", None)
    # Convert to float32 if needed.
    if isinstance(audio, np.ndarray):
        if audio.dtype == np.float64:
            audio = audio.astype(np.float32)
        elif audio.dtype == np.int16:
            # Scale 16-bit PCM by 2**15.
            audio = audio.astype(np.float32) / np.float32(32768.0)
        elif audio.dtype == np.int32:
            # Scale 32-bit PCM by 2**31.
            audio = audio.astype(np.float32) / np.float32(2147483648.0)

    # Audio needs a user turn to attach to: create one unless the history
    # already ends with a user message.
    if audio is not None and (not turns or turns[-1]["role"] != "user"):
        prompt = inputs.get("prompt", "<|audio|>")
        if "<|audio|>" not in prompt:
            print(
                "Prompt does not contain '<|audio|>', appending '<|audio|>' to the end of the prompt."
            )

            prompt += " <|audio|>"
        turns.append({"role": "user", "content": prompt})

    # Render the conversation to a single prompt string (not token ids).
    text = processor.tokenizer.apply_chat_template(
        turns, add_generation_prompt=True, tokenize=False
    )

    if "sampling_rate" not in inputs and audio is not None:
        print(
            "No sampling rate provided, using default of 16kHz. We highly recommend providing the correct sampling rate."
        )

    output = processor(
        text=text,
        audio=audio,
        sampling_rate=inputs.get("sampling_rate", 16000),
    )
    # NOTE(review): presumably redundant with the ``output.to`` call below;
    # kept so the audio tensor is cast exactly as before - confirm.
    if "audio_values" in output:
        output["audio_values"] = output["audio_values"].to(device, dtype)
    return output.to(device, dtype)


# Start from an empty conversation; preprocess() supplies the audio user turn.
turns = []
request = {"audio": audio, "turns": turns, "sampling_rate": sr}

# Build the model inputs on the same device/dtype the model was moved to.
model_inputs = preprocess(request, "cuda", torch.bfloat16)

# Generate at most 300 new tokens, then decode the squeezed output as text.
generated = model.generate(**model_inputs, max_new_tokens=300)
decoded = processor.tokenizer.decode(
    generated.squeeze(),
    skip_special_tokens=True,
)
print(decoded)