Getting "!!!!" at output only
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
# Constants
HF_TOKEN = "xxxxx"
# torch._dynamo.config.suppress_errors = True
# torch._dynamo.disable()
# Load model and tokenizer
model_id = "ai4bharat/IndicTrans3-beta"
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto", token=HF_TOKEN
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
# List of target languages
LANGUAGES = [
    "Hindi", "Bengali", "Telugu", "Marathi", "Tamil", "Urdu", "Gujarati", "Kannada",
    "Odia", "Malayalam", "Punjabi", "Assamese", "Maithili", "Santali", "Kashmiri",
    "Nepali", "Sindhi", "Konkani", "Dogri", "Manipuri", "Bodo",
]
# Simple formatting function
def format_message_for_translation(message, target_lang):
    return f"Translate the following text to {target_lang}: {message}"
def translate(
    message: str,
    chat_history: list[dict],
    target_language: str = "Hindi",
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> str:
    # Format input message
    conversation = []
    translation_request = format_message_for_translation(message, target_language)
    conversation.append({"role": "user", "content": translation_request})

    # Tokenize using chat template
    input_ids = tokenizer.apply_chat_template(
        conversation, return_tensors="pt", add_generation_prompt=True
    )

    MAX_INPUT_TOKEN_LENGTH = 4096
    # Trim input if too long
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    input_ids = input_ids.to(model.device)
    print(input_ids)

    # Generate output (non-streaming, blocking call)
    output_ids = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # greedy decoding
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )

    # Decode output (skip input tokens)
    print(output_ids)
    generated_text = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
    print(generated_text)
    return generated_text
# Gradio UI
demo = gr.Interface(
    fn=translate,
    inputs=[
        gr.Textbox(label="Enter text to translate"),
        gr.Dropdown(choices=LANGUAGES, label="Target Language", value="Hindi"),
    ],
    outputs=gr.Textbox(label="Translated Output"),
    title="IndicTrans3-beta Translator",
)
if __name__ == "__main__":
    demo.launch(debug=True)
Here is the console output from one run of the script:

Input IDs:
tensor([[128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696,
25, 6790, 220, 2366, 18, 198, 15724, 2696, 25,
220, 1721, 12044, 220, 2366, 20, 271, 128009, 128006,
882, 128007, 271, 28573, 279, 2768, 1495, 311, 45080,
25, 22691, 11, 358, 1097, 1618, 13, 128009, 128006,
78191, 128007, 271]], device='cuda:0')
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set TRANSFORMERS_VERBOSITY=info
for more details.
Output IDs:
tensor([[128000, 128006, 9125, ..., 0, 0, 0]],
device='cuda:0')
In the Gradio output box, all I get is a long string of exclamation marks ("!!!!!!!!!!!!!!!!...") instead of a translation.
Is there some issue with token mapping or a model mismatch? Can anyone suggest what might be going wrong?
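In case it helps narrow this down: the Output IDs tensor above ends in a run of zeros, and I believe token id 0 in the Llama 3.2 tokenizer decodes to "!", which would match the exclamation marks I'm seeing. A rough sanity check I could add right after loading the model (just a sketch; it assumes model.config exposes vocab_size and that IndicTrans3-beta is supposed to reuse this tokenizer) would be:

# Compare the model's configured vocabulary size with the tokenizer that was
# actually loaded, and confirm what the repeated output id 0 decodes to.
print("model vocab size:     ", model.config.vocab_size)
print("tokenizer vocab size: ", len(tokenizer))
print("token id 0 decodes to:", repr(tokenizer.decode([0])))

If those sizes disagree, that would point to a tokenizer/model mismatch rather than a generation-settings problem, but I'm not sure how to confirm which tokenizer this model actually expects.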