shethjenil committed
Commit 73208f7 (verified)
Parent: d696a80

Upload 6 files

app.py ADDED
@@ -0,0 +1,19 @@
+ from utils_trans import IndicTrans
+ from utils_indic_seamless import INDIC_SEAMLESS
+ from json import load as json_load
+ from torch import device as Device
+ from torch.cuda import is_available as cuda_is_available
+ import gradio as gr
+
+ device = Device("cuda" if cuda_is_available() else "cpu")
+ indictrans_model = IndicTrans(
+     json_load(open("indictrans_conf.json")),
+     "prajdabre/rotary-indictrans2-en-indic-dist-200M",
+     "prajdabre/rotary-indictrans2-indic-en-dist-200M",
+     "ai4bharat/indictrans2-indic-indic-dist-320M",
+ )
+ indic_seamless_model = INDIC_SEAMLESS(json_load(open("indic_seamless_lang_conf.json")), "shethjenil/INDIC_SEAMLESS", device)
+
+ text_tab = gr.Interface(indictrans_model.translate, [gr.Textbox(label="Input Text"), gr.Dropdown(indictrans_model.all_lang, label="Source Language"), gr.Dropdown(indictrans_model.all_lang, label="Target Language")], gr.Textbox(label="Result"))
+ audio_tab = gr.Interface(lambda files, lang: indic_seamless_model.speech2translate([i.name for i in files], lang), [gr.File(file_types=["audio"], label="Upload Audio Files", file_count="multiple"), gr.Dropdown(list(indic_seamless_model.lang_conf.keys()), label="Target Language")], gr.List(label="Translations"), title="Audio Translation")
+ gr.TabbedInterface([text_tab, audio_tab], ["Indic Translation", "Indic Audio Translation"]).launch()
indic_seamless_lang_conf.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "Assamese": "asm",
+ "Bengali": "ben",
+ "Gujarati": "guj",
+ "Hindi": "hin",
+ "Kannada": "kan",
+ "Malayalam": "mal",
+ "Marathi": "mar",
+ "Odia": "ory",
+ "Punjabi": "pan",
+ "Tamil": "tam",
+ "Telugu": "tel",
+ "Urdu": "urd",
+ "English": "eng"
+ }
indictrans_conf.json ADDED
@@ -0,0 +1,36 @@
+ [
+ "asm_Beng",
+ "awa_Deva",
+ "ben_Beng",
+ "bho_Deva",
+ "brx_Deva",
+ "doi_Deva",
+ "eng_Latn",
+ "gom_Deva",
+ "gon_Deva",
+ "guj_Gujr",
+ "hin_Deva",
+ "hne_Deva",
+ "kan_Knda",
+ "kas_Arab",
+ "kas_Deva",
+ "kha_Latn",
+ "lus_Latn",
+ "mag_Deva",
+ "mai_Deva",
+ "mal_Mlym",
+ "mar_Deva",
+ "mni_Beng",
+ "mni_Mtei",
+ "npi_Deva",
+ "ory_Orya",
+ "pan_Guru",
+ "san_Deva",
+ "sat_Olck",
+ "snd_Arab",
+ "snd_Deva",
+ "tam_Taml",
+ "tel_Telu",
+ "urd_Arab",
+ "unr_Deva"
+ ]
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ ## for text translation
+ git+https://github.com/VarunGumma/IndicTransToolkit
+
+ ## for speech translation
+ transformers
+ pydub
+ numpy
utils_indic_seamless.py ADDED
@@ -0,0 +1,19 @@
+ from transformers import SeamlessM4Tv2ForSpeechToText, SeamlessM4TTokenizer, SeamlessM4TFeatureExtractor
+ from numpy import array as np_array, float32 as np_float32
+ from pydub import AudioSegment
+
+ class INDIC_SEAMLESS:
+     def __init__(self, lang_conf: dict[str, str], model, device):
+         self.device = device
+         self.seamless_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(model).to(device)
+         self.seamless_processor = SeamlessM4TFeatureExtractor.from_pretrained(model)
+         self.seamless_tokenizer = SeamlessM4TTokenizer.from_pretrained(model)
+         self.lang_conf = lang_conf
+
+     def speech2translate(self, audio_paths, target_lang):
+         # Decode each file with pydub, downmix to mono, resample to 16 kHz, and scale int16 samples to [-1, 1].
+         audios = [np_array(AudioSegment.from_file(path).set_channels(1).set_frame_rate(16000).get_array_of_samples(), dtype=np_float32) / 32768.0 for path in audio_paths]
+         # Batch the waveforms and move the features to the same device as the model.
+         inputs = self.seamless_processor(audios, sampling_rate=16000, return_tensors="pt", padding=True).to(self.device)
+         tokens = self.seamless_model.generate(**inputs, tgt_lang=self.lang_conf[target_lang])
+         return self.seamless_tokenizer.batch_decode(tokens, skip_special_tokens=True)
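For reference, a minimal standalone sketch of this class, assuming the same config file and checkpoint that app.py uses; the audio path "sample.wav" is only a placeholder:

    import torch
    from json import load as json_load
    from utils_indic_seamless import INDIC_SEAMLESS

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = INDIC_SEAMLESS(json_load(open("indic_seamless_lang_conf.json")), "shethjenil/INDIC_SEAMLESS", device)
    # "sample.wav" is a placeholder path; any file pydub/ffmpeg can decode should work.
    print(model.speech2translate(["sample.wav"], "Hindi"))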
utils_trans.py ADDED
@@ -0,0 +1,34 @@
+ import torch
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+ from IndicTransToolkit.processor import IndicProcessor
+
+ class IndicTrans:
+     def __init__(self, all_lang: list[str], en2indic, indic2en, indic2indic):
+         self.all_lang = all_lang
+         self.ip = IndicProcessor(inference=True)
+         self.indictrans_en2indic_tokenizer = AutoTokenizer.from_pretrained(en2indic, trust_remote_code=True)
+         self.indictrans_en2indic_model = AutoModelForSeq2SeqLM.from_pretrained(en2indic, trust_remote_code=True)
+         self.indictrans_indic2en_tokenizer = AutoTokenizer.from_pretrained(indic2en, trust_remote_code=True)
+         self.indictrans_indic2en_model = AutoModelForSeq2SeqLM.from_pretrained(indic2en, trust_remote_code=True)
+         self.indictrans_indic2indic_tokenizer = AutoTokenizer.from_pretrained(indic2indic, trust_remote_code=True)
+         self.indictrans_indic2indic_model = AutoModelForSeq2SeqLM.from_pretrained(indic2indic, trust_remote_code=True)
+
+     def _translate(self, model, tokenizer, input_list: list[str], source_lang: str, target_lang: str) -> list[str]:
+         # Normalize the batch and tag it with source/target language codes before tokenizing.
+         batch = tokenizer(self.ip.preprocess_batch(input_list, src_lang=source_lang, tgt_lang=target_lang, visualize=False), padding="longest", truncation=True, max_length=256, return_tensors="pt")
+         with torch.inference_mode():
+             outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)
+         with tokenizer.as_target_tokenizer():
+             outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+         # Undo the preprocessing and detokenize for the target language.
+         return self.ip.postprocess_batch(outputs, lang=target_lang)
+
+     def translate(self, input: str, source_lang: str, target_lang: str):
+         assert source_lang != target_lang and source_lang in self.all_lang and target_lang in self.all_lang, "source and target must be different, supported language codes"
+         # Route to the model that matches the translation direction.
+         if source_lang == "eng_Latn":
+             return self._translate(self.indictrans_en2indic_model, self.indictrans_en2indic_tokenizer, [input], source_lang, target_lang)[0]
+         elif target_lang == "eng_Latn":
+             return self._translate(self.indictrans_indic2en_model, self.indictrans_indic2en_tokenizer, [input], source_lang, target_lang)[0]
+         else:
+             return self._translate(self.indictrans_indic2indic_model, self.indictrans_indic2indic_tokenizer, [input], source_lang, target_lang)[0]
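Similarly, a minimal standalone sketch of IndicTrans, mirroring the constructor call in app.py; the example sentence and language pair are arbitrary, and the codes must come from indictrans_conf.json:

    from json import load as json_load
    from utils_trans import IndicTrans

    model = IndicTrans(
        json_load(open("indictrans_conf.json")),
        "prajdabre/rotary-indictrans2-en-indic-dist-200M",
        "prajdabre/rotary-indictrans2-indic-en-dist-200M",
        "ai4bharat/indictrans2-indic-indic-dist-320M",
    )
    # English -> Hindi; any distinct pair of codes from indictrans_conf.json should work.
    print(model.translate("How are you?", "eng_Latn", "hin_Deva"))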