Translate With Care! Addressing Gender Bias, Neutrality, and Reasoning in Large Language Model Translations

This model is a fine-tuned checkpoint of mBART-large-50. It stands out for its ability to handle sentences with punctuation in any order and to disambiguate pronouns when needed.

The model can translate directly between any pair of 50 languages. It was fine-tuned specifically on Indonesian data; owing to cross-lingual transfer, we observe the effects of this fine-tuning in other languages as well.

Persian Reasoning Example

import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = MBartForConditionalGeneration.from_pretrained("PardisSzah/mBART-id-ft-TWC").to(device)
tokenizer = MBart50TokenizerFast.from_pretrained("PardisSzah/mBART-id-ft-TWC")

# Source language: Persian.
tokenizer.src_lang = "fa_IR"
source = "کوین یک دندانپزشک و لوسی یک آرایشگر است. او دندان ها را تمیز می کند."
# Move the encoded inputs to the same device as the model.
encoded_source = tokenizer(source, return_tensors="pt").to(device)

# Force the first generated token to be the English language code so that
# the model translates into English.
generated_tokens = model.generate(
    **encoded_source,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
    max_length=512
)

# Only one sentence was encoded, so take the first decoded string.
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

# => "Kevin is a dentist and Lucy is a hairstylist. He cleans teeth."

Finnish Reasoning Example

import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = MBartForConditionalGeneration.from_pretrained("PardisSzah/mBART-id-ft-TWC").to(device)
tokenizer = MBart50TokenizerFast.from_pretrained("PardisSzah/mBART-id-ft-TWC")

# Source language: Finnish.
tokenizer.src_lang = "fi_FI"
source = "Robert on merimies ja Lisa lentäjä; hän lentää lentokoneita."
# Move the encoded inputs to the same device as the model.
encoded_source = tokenizer(source, return_tensors="pt").to(device)

# Force the first generated token to be the English language code so that
# the model translates into English.
generated_tokens = model.generate(
    **encoded_source,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
    max_length=512
)

# Only one sentence was encoded, so take the first decoded string.
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

# => "Robert is a sailor and Lisa is a pilot; she flies planes."

Indonesian Neutrality Example

import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = MBartForConditionalGeneration.from_pretrained("PardisSzah/mBART-id-ft-TWC").to(device)
tokenizer = MBart50TokenizerFast.from_pretrained("PardisSzah/mBART-id-ft-TWC")

# Source language: Indonesian.
tokenizer.src_lang = "id_ID"
source = "John dan Bella sedang belajar, dia membuka buku."
# Move the encoded inputs to the same device as the model.
encoded_source = tokenizer(source, return_tensors="pt").to(device)

# Force the first generated token to be the English language code so that
# the model translates into English.
generated_tokens = model.generate(
    **encoded_source,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
    max_length=512
)

# Only one sentence was encoded, so take the first decoded string.
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

# => "John and Bella are studying, one opened a book."

Turkish Bias Example

import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = MBartForConditionalGeneration.from_pretrained("PardisSzah/mBART-id-ft-TWC").to(device)
tokenizer = MBart50TokenizerFast.from_pretrained("PardisSzah/mBART-id-ft-TWC")

# Source language: Turkish. The mBART-50 code is "tr_TR" (not "tr_T").
tokenizer.src_lang = "tr_TR"
source = "Jane Hintli ve Edward İtalyan; o çalışkan ve başarılıdır."
# Move the encoded inputs to the same device as the model.
encoded_source = tokenizer(source, return_tensors="pt").to(device)

# Force the first generated token to be the English language code so that
# the model translates into English.
generated_tokens = model.generate(
    **encoded_source,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
    max_length=512
)

# Only one sentence was encoded, so take the first decoded string.
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

# => "Jane is Indian and Edward is Italian; one is hardworking and successful."

Languages covered

Arabic (ar_AR), Czech (cs_CZ), German (de_DE), English (en_XX), Spanish (es_XX), Estonian (et_EE), Finnish (fi_FI), French (fr_XX), Gujarati (gu_IN), Hindi (hi_IN), Italian (it_IT), Japanese (ja_XX), Kazakh (kk_KZ), Korean (ko_KR), Lithuanian (lt_LT), Latvian (lv_LV), Burmese (my_MM), Nepali (ne_NP), Dutch (nl_XX), Romanian (ro_RO), Russian (ru_RU), Sinhala (si_LK), Turkish (tr_TR), Vietnamese (vi_VN), Chinese (zh_CN), Afrikaans (af_ZA), Azerbaijani (az_AZ), Bengali (bn_IN), Persian (fa_IR), Hebrew (he_IL), Croatian (hr_HR), Indonesian (id_ID), Georgian (ka_GE), Khmer (km_KH), Macedonian (mk_MK), Malayalam (ml_IN), Mongolian (mn_MN), Marathi (mr_IN), Polish (pl_PL), Pashto (ps_AF), Portuguese (pt_XX), Swedish (sv_SE), Swahili (sw_KE), Tamil (ta_IN), Telugu (te_IN), Thai (th_TH), Tagalog (tl_XX), Ukrainian (uk_UA), Urdu (ur_PK), Xhosa (xh_ZA), Galician (gl_ES), Slovene (sl_SI)