{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"!pip install -qqq git+https://github.com/huggingface/transformers"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "B4g_5urXJroR",
"outputId": "56143388-aed4-478b-a1b0-b60520906032"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip install -qqq sentencepiece"
],
"metadata": {
"id": "VNWjY2plI5Yc"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "TaO5bqfpIJfI"
},
"outputs": [],
"source": [
"from transformers import AutoTokenizer\n",
"from transformers import MBart50Tokenizer, AutoTokenizer"
]
},
{
"cell_type": "code",
"source": [
"tokenizer = AutoTokenizer.from_pretrained(\"DrishtiSharma/mbart-large-50-en-es-translation-lr-1e-05-weight-decay-0.0001\")"
],
"metadata": {
"id": "MQssJVQlIOkP"
},
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"source": [
"tokenizer"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "q9Rim1hXLZHm",
"outputId": "4ab1b1f6-a426-4155-cedb-c1c5e218722e"
},
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"MBart50TokenizerFast(name_or_path='DrishtiSharma/mbart-large-50-en-es-translation-lr-1e-05-weight-decay-0.0001', vocab_size=250054, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'sep_token': '', 'pad_token': '', 'cls_token': '', 'mask_token': '', 'additional_special_tokens': ['ar_AR', 'cs_CZ', 'de_DE', 'en_XX', 'es_XX', 'et_EE', 'fi_FI', 'fr_XX', 'gu_IN', 'hi_IN', 'it_IT', 'ja_XX', 'kk_KZ', 'ko_KR', 'lt_LT', 'lv_LV', 'my_MM', 'ne_NP', 'nl_XX', 'ro_RO', 'ru_RU', 'si_LK', 'tr_TR', 'vi_VN', 'zh_CN', 'af_ZA', 'az_AZ', 'bn_IN', 'fa_IR', 'he_IL', 'hr_HR', 'id_ID', 'ka_GE', 'km_KH', 'mk_MK', 'ml_IN', 'mn_MN', 'mr_IN', 'pl_PL', 'ps_AF', 'pt_XX', 'sv_SE', 'sw_KE', 'ta_IN', 'te_IN', 'th_TH', 'tl_XX', 'uk_UA', 'ur_PK', 'xh_ZA', 'gl_ES', 'sl_SI']}, clean_up_tokenization_spaces=True), added_tokens_decoder={\n",
"\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t3: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250001: AddedToken(\"ar_AR\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250002: AddedToken(\"cs_CZ\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250003: AddedToken(\"de_DE\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250004: AddedToken(\"en_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250005: AddedToken(\"es_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250006: AddedToken(\"et_EE\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250007: AddedToken(\"fi_FI\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250008: AddedToken(\"fr_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250009: AddedToken(\"gu_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250010: AddedToken(\"hi_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250011: AddedToken(\"it_IT\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250012: AddedToken(\"ja_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250013: AddedToken(\"kk_KZ\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250014: AddedToken(\"ko_KR\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250015: AddedToken(\"lt_LT\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250016: AddedToken(\"lv_LV\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250017: AddedToken(\"my_MM\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250018: AddedToken(\"ne_NP\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250019: AddedToken(\"nl_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250020: AddedToken(\"ro_RO\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250021: AddedToken(\"ru_RU\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250022: AddedToken(\"si_LK\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250023: AddedToken(\"tr_TR\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250024: AddedToken(\"vi_VN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250025: AddedToken(\"zh_CN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250026: AddedToken(\"af_ZA\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250027: AddedToken(\"az_AZ\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250028: AddedToken(\"bn_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250029: AddedToken(\"fa_IR\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250030: AddedToken(\"he_IL\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250031: AddedToken(\"hr_HR\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250032: AddedToken(\"id_ID\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250033: AddedToken(\"ka_GE\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250034: AddedToken(\"km_KH\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250035: AddedToken(\"mk_MK\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250036: AddedToken(\"ml_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250037: AddedToken(\"mn_MN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250038: AddedToken(\"mr_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250039: AddedToken(\"pl_PL\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250040: AddedToken(\"ps_AF\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250041: AddedToken(\"pt_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250042: AddedToken(\"sv_SE\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250043: AddedToken(\"sw_KE\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250044: AddedToken(\"ta_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250045: AddedToken(\"te_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250046: AddedToken(\"th_TH\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250047: AddedToken(\"tl_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250048: AddedToken(\"uk_UA\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250049: AddedToken(\"ur_PK\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250050: AddedToken(\"xh_ZA\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250051: AddedToken(\"gl_ES\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250052: AddedToken(\"sl_SI\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250053: AddedToken(\"\", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),\n",
"}"
]
},
"metadata": {},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"source": [
"tokenizer2 = AutoTokenizer.from_pretrained(\"facebook/mbart-large-50\")"
],
"metadata": {
"id": "8QsxFNXVIm8L"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"tokenizer2"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZJvESpDdMG9f",
"outputId": "3edc9bbf-386c-467e-8252-d89939ff62dc"
},
"execution_count": 7,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"MBart50TokenizerFast(name_or_path='facebook/mbart-large-50', vocab_size=250054, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'sep_token': '', 'pad_token': '', 'cls_token': '', 'mask_token': '', 'additional_special_tokens': ['ar_AR', 'cs_CZ', 'de_DE', 'en_XX', 'es_XX', 'et_EE', 'fi_FI', 'fr_XX', 'gu_IN', 'hi_IN', 'it_IT', 'ja_XX', 'kk_KZ', 'ko_KR', 'lt_LT', 'lv_LV', 'my_MM', 'ne_NP', 'nl_XX', 'ro_RO', 'ru_RU', 'si_LK', 'tr_TR', 'vi_VN', 'zh_CN', 'af_ZA', 'az_AZ', 'bn_IN', 'fa_IR', 'he_IL', 'hr_HR', 'id_ID', 'ka_GE', 'km_KH', 'mk_MK', 'ml_IN', 'mn_MN', 'mr_IN', 'pl_PL', 'ps_AF', 'pt_XX', 'sv_SE', 'sw_KE', 'ta_IN', 'te_IN', 'th_TH', 'tl_XX', 'uk_UA', 'ur_PK', 'xh_ZA', 'gl_ES', 'sl_SI']}, clean_up_tokenization_spaces=True), added_tokens_decoder={\n",
"\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t3: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250001: AddedToken(\"ar_AR\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250002: AddedToken(\"cs_CZ\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250003: AddedToken(\"de_DE\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250004: AddedToken(\"en_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250005: AddedToken(\"es_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250006: AddedToken(\"et_EE\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250007: AddedToken(\"fi_FI\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250008: AddedToken(\"fr_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250009: AddedToken(\"gu_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250010: AddedToken(\"hi_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250011: AddedToken(\"it_IT\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250012: AddedToken(\"ja_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250013: AddedToken(\"kk_KZ\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250014: AddedToken(\"ko_KR\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250015: AddedToken(\"lt_LT\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250016: AddedToken(\"lv_LV\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250017: AddedToken(\"my_MM\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250018: AddedToken(\"ne_NP\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250019: AddedToken(\"nl_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250020: AddedToken(\"ro_RO\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250021: AddedToken(\"ru_RU\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250022: AddedToken(\"si_LK\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250023: AddedToken(\"tr_TR\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250024: AddedToken(\"vi_VN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250025: AddedToken(\"zh_CN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250026: AddedToken(\"af_ZA\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250027: AddedToken(\"az_AZ\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250028: AddedToken(\"bn_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250029: AddedToken(\"fa_IR\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250030: AddedToken(\"he_IL\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250031: AddedToken(\"hr_HR\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250032: AddedToken(\"id_ID\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250033: AddedToken(\"ka_GE\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250034: AddedToken(\"km_KH\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250035: AddedToken(\"mk_MK\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250036: AddedToken(\"ml_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250037: AddedToken(\"mn_MN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250038: AddedToken(\"mr_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250039: AddedToken(\"pl_PL\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250040: AddedToken(\"ps_AF\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250041: AddedToken(\"pt_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250042: AddedToken(\"sv_SE\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250043: AddedToken(\"sw_KE\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250044: AddedToken(\"ta_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250045: AddedToken(\"te_IN\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250046: AddedToken(\"th_TH\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250047: AddedToken(\"tl_XX\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250048: AddedToken(\"uk_UA\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250049: AddedToken(\"ur_PK\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250050: AddedToken(\"xh_ZA\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250051: AddedToken(\"gl_ES\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250052: AddedToken(\"sl_SI\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t250053: AddedToken(\"\", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),\n",
"}"
]
},
"metadata": {},
"execution_count": 7
}
]
},
{
"cell_type": "markdown",
"source": [
"# **NOTE:**\n",
"\n",
"💡 If we do not install sentencepiece, we will encounter the following error:"
],
"metadata": {
"id": "8Pk0Jr-lRDOG"
}
},
{
"cell_type": "markdown",
"source": [
""
],
"metadata": {
"id": "4HLvVq23RAcN"
}
},
{
"cell_type": "markdown",
"source": [
"# **Tip:**\n",
"\n",
"If you still run into the same error after installing the \"sentencepiece\" library, restart the runtime and run all the cells again. This should resolve the issue."
],
"metadata": {
"id": "NeworNNASmdS"
}
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "XB3W5gFIRBLv"
},
"execution_count": 7,
"outputs": []
}
]
}