Spaces:
Runtime error
Runtime error
Upload 6 files
Browse files- app.py +11 -0
- indic_seamless_lang_conf.json +15 -0
- indictrans_conf.json +36 -0
- requirements.txt +7 -0
- utils_indic_seamless.py +12 -0
- utils_trans.py +27 -0
app.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gradio entry point exposing text translation and audio translation tabs."""
from json import load as json_load

import gradio as gr
from torch import device as Device
from torch.cuda import is_available as cuda_is_available

from utils_indic_seamless import INDIC_SEAMLESS
from utils_trans import IndicTrans


def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, encoding="utf-8") as handle:
        return json_load(handle)


# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = Device("cuda" if cuda_is_available() else "cpu")

# Text translation: one checkpoint per direction (en->indic, indic->en,
# indic->indic), plus the list of language codes offered in the dropdowns.
indictrans_model = IndicTrans(
    _load_json("indictrans_conf.json"),
    "prajdabre/rotary-indictrans2-en-indic-dist-200M",
    "prajdabre/rotary-indictrans2-indic-en-dist-200M",
    "ai4bharat/indictrans2-indic-indic-dist-320M",
)

# Speech translation: the config maps display names to target language codes.
indic_seamless_model = INDIC_SEAMLESS(
    _load_json("indic_seamless_lang_conf.json"),
    "shethjenil/INDIC_SEAMLESS",
    device,
)

text_tab = gr.Interface(
    indictrans_model.translate,
    [
        gr.Textbox(label="Input Text"),
        gr.Dropdown(indictrans_model.all_lang, label="Source Language"),
        gr.Dropdown(indictrans_model.all_lang, label="Target Language"),
    ],
    gr.Textbox(label="Result"),
)

audio_tab = gr.Interface(
    # The wrapper hands the uploaded files' paths (their ``.name``) to the model.
    lambda files, lang: indic_seamless_model.speech2translate([i.name for i in files], lang),
    [
        gr.File(
            file_types=["audio"],
            label="Upload Audio Files",
            file_count="multiple",
        ),
        gr.Dropdown(list(indic_seamless_model.lang_conf.keys()), label="Target Language"),
    ],
    gr.List(label="Translations"),
    title="Audio Translation",
)

gr.TabbedInterface(
    [text_tab, audio_tab],
    ["Indic Translation", "Indic Audio Translation"],
).launch()
|
indic_seamless_lang_conf.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"Assamese": "asm",
|
| 3 |
+
"Bengali": "ben",
|
| 4 |
+
"Gujarati": "guj",
|
| 5 |
+
"Hindi": "hin",
|
| 6 |
+
"Kannada": "kan",
|
| 7 |
+
"Malayalam": "mal",
|
| 8 |
+
"Marathi": "mar",
|
| 9 |
+
"Odia": "ory",
|
| 10 |
+
"Punjabi": "pan",
|
| 11 |
+
"Tamil": "tam",
|
| 12 |
+
"Telugu": "tel",
|
| 13 |
+
"Urdu": "urd",
|
| 14 |
+
"English": "eng"
|
| 15 |
+
}
|
indictrans_conf.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
"asm_Beng",
|
| 3 |
+
"awa_Deva",
|
| 4 |
+
"ben_Beng",
|
| 5 |
+
"bho_Deva",
|
| 6 |
+
"brx_Deva",
|
| 7 |
+
"doi_Deva",
|
| 8 |
+
"eng_Latn",
|
| 9 |
+
"gom_Deva",
|
| 10 |
+
"gon_Deva",
|
| 11 |
+
"guj_Gujr",
|
| 12 |
+
"hin_Deva",
|
| 13 |
+
"hne_Deva",
|
| 14 |
+
"kan_Knda",
|
| 15 |
+
"kas_Arab",
|
| 16 |
+
"kas_Deva",
|
| 17 |
+
"kha_Latn",
|
| 18 |
+
"lus_Latn",
|
| 19 |
+
"mag_Deva",
|
| 20 |
+
"mai_Deva",
|
| 21 |
+
"mal_Mlym",
|
| 22 |
+
"mar_Deva",
|
| 23 |
+
"mni_Beng",
|
| 24 |
+
"mni_Mtei",
|
| 25 |
+
"npi_Deva",
|
| 26 |
+
"ory_Orya",
|
| 27 |
+
"pan_Guru",
|
| 28 |
+
"san_Deva",
|
| 29 |
+
"sat_Olck",
|
| 30 |
+
"snd_Arab",
|
| 31 |
+
"snd_Deva",
|
| 32 |
+
"tam_Taml",
|
| 33 |
+
"tel_Telu",
|
| 34 |
+
"urd_Arab",
|
| 35 |
+
"unr_Deva"
|
| 36 |
+
]
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## for text translation
|
| 2 |
+
git+https://github.com/VarunGumma/IndicTransToolkit
|
| 3 |
+
|
| 4 |
+
## for speech translation
|
| 5 |
+
transformers
|
| 6 |
+
pydub
|
| 7 |
+
numpy
|
utils_indic_seamless.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import SeamlessM4Tv2ForSpeechToText,SeamlessM4TTokenizer, SeamlessM4TFeatureExtractor
|
| 2 |
+
from numpy import array as np_array,float32 as np_float32
|
| 3 |
+
from pydub import AudioSegment
|
| 4 |
+
|
| 5 |
+
class INDIC_SEAMLESS:
    """Speech-to-text translation wrapper around a SeamlessM4T v2 checkpoint."""

    # Sampling rate every clip is resampled to before feature extraction.
    _SAMPLE_RATE = 16000

    def __init__(self, lang_conf: dict[str, str], model, device):
        """Load the model, feature extractor and tokenizer.

        Args:
            lang_conf: Maps the language names accepted by
                ``speech2translate`` to the codes passed as ``tgt_lang``.
            model: Checkpoint identifier handed to ``from_pretrained``.
            device: Device the speech-to-text model is moved to.
        """
        self.seamless_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(model).to(device)
        self.seamless_processor = SeamlessM4TFeatureExtractor.from_pretrained(model)
        self.seamless_tokenizer = SeamlessM4TTokenizer.from_pretrained(model)
        self.lang_conf = lang_conf

    @staticmethod
    def _load_waveform(path):
        """Decode *path* into a mono 16 kHz float32 waveform scaled by 1/32768."""
        segment = (
            AudioSegment.from_file(path)
            .set_channels(1)
            .set_frame_rate(INDIC_SEAMLESS._SAMPLE_RATE)
            # Force 16-bit samples so the 32768 divisor below matches the
            # integer range whatever the source file's bit depth is.
            .set_sample_width(2)
        )
        return np_array(segment.get_array_of_samples(), dtype=np_float32) / 32768.0

    def speech2translate(self, audio_paths, target_lang):
        """Translate the speech in each audio file into ``target_lang``.

        Args:
            audio_paths: Paths of the audio files to translate.
            target_lang: A key of ``self.lang_conf``.

        Returns:
            One decoded string per input file; ``[]`` when no paths are given.

        Raises:
            KeyError: If ``target_lang`` is not a key of ``self.lang_conf``.
        """
        if not audio_paths:
            # Nothing to batch; skip feature extraction and generation.
            return []
        waveforms = [self._load_waveform(path) for path in audio_paths]
        features = self.seamless_processor(
            waveforms,
            sampling_rate=self._SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        # Inputs must sit on the same device as the model; the model is moved
        # to the constructor's ``device``, which is not necessarily the CPU.
        ).to(self.seamless_model.device)
        token_ids = self.seamless_model.generate(**features, tgt_lang=self.lang_conf[target_lang])
        return self.seamless_tokenizer.batch_decode(token_ids, skip_special_tokens=True)
|
utils_trans.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
| 3 |
+
from IndicTransToolkit.processor import IndicProcessor
|
| 4 |
+
class IndicTrans:
    """Text translation between English and Indic languages.

    Holds three model/tokenizer pairs, one per direction: English->Indic,
    Indic->English and Indic->Indic.
    """

    def __init__(self, all_lang: list[str], en2indic, indic2en, indic2indic):
        """Load the three checkpoints.

        Args:
            all_lang: Language codes accepted by ``translate``.
            en2indic: Checkpoint used when the source is ``eng_Latn``.
            indic2en: Checkpoint used when the target is ``eng_Latn``.
            indic2indic: Checkpoint used for every other pair.
        """
        self.all_lang = all_lang
        self.ip = IndicProcessor(inference=True)
        self.indictrans_en2indic_tokenizer = AutoTokenizer.from_pretrained(en2indic, trust_remote_code=True)
        self.indictrans_en2indic_model = AutoModelForSeq2SeqLM.from_pretrained(en2indic, trust_remote_code=True)
        self.indictrans_indic2en_tokenizer = AutoTokenizer.from_pretrained(indic2en, trust_remote_code=True)
        self.indictrans_indic2en_model = AutoModelForSeq2SeqLM.from_pretrained(indic2en, trust_remote_code=True)
        self.indictrans_indic2indic_tokenizer = AutoTokenizer.from_pretrained(indic2indic, trust_remote_code=True)
        self.indictrans_indic2indic_model = AutoModelForSeq2SeqLM.from_pretrained(indic2indic, trust_remote_code=True)

    def _translate(self, model, tokenizer, input_list: list[str], source_lang: str, target_lang: str) -> list[str]:
        """Translate a batch of sentences with the given model/tokenizer pair.

        Sentences are preprocessed with the IndicProcessor, tokenized with
        truncation at 256 tokens, generated with 5-beam search, decoded and
        finally postprocessed for ``target_lang``.
        """
        batch = self.ip.preprocess_batch(
            input_list, src_lang=source_lang, tgt_lang=target_lang, visualize=False
        )
        encoded = tokenizer(
            batch,
            padding="longest",
            truncation=True,
            max_length=256,
            return_tensors="pt",
        )
        with torch.inference_mode():
            generated = model.generate(**encoded, num_beams=5, num_return_sequences=1, max_length=256)
        with tokenizer.as_target_tokenizer():
            decoded = tokenizer.batch_decode(
                generated, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
        return self.ip.postprocess_batch(decoded, lang=target_lang)

    # NOTE: the first parameter keeps its original name ``input`` (shadowing
    # the builtin) so existing keyword callers keep working.
    def translate(self, input: str, source_lang: str, target_lang: str):
        """Translate one text from ``source_lang`` to ``target_lang``.

        Args:
            input: Text to translate.
            source_lang: Language code of ``input``; must be in ``all_lang``.
            target_lang: Language code to translate into; must be in
                ``all_lang`` and differ from ``source_lang``.

        Returns:
            The translated text.

        Raises:
            ValueError: If the two languages are equal or either one is not
                in ``all_lang``.
        """
        # Explicit raises instead of ``assert``: assertions are stripped when
        # Python runs with -O and carry no message for the user.
        if source_lang == target_lang:
            raise ValueError(f"Source and target language must differ, got {source_lang!r} for both.")
        for lang in (source_lang, target_lang):
            if lang not in self.all_lang:
                raise ValueError(f"Unsupported language: {lang!r}.")

        # Choose the checkpoint for the translation direction.
        if source_lang == "eng_Latn":
            model, tokenizer = self.indictrans_en2indic_model, self.indictrans_en2indic_tokenizer
        elif target_lang == "eng_Latn":
            model, tokenizer = self.indictrans_indic2en_model, self.indictrans_indic2en_tokenizer
        else:
            model, tokenizer = self.indictrans_indic2indic_model, self.indictrans_indic2indic_tokenizer
        return self._translate(model, tokenizer, [input], source_lang, target_lang)[0]
|