Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -13,43 +13,6 @@ text_output = gr.TextArea(label="متن فارسی", type="text") | |
| 13 |  | 
| 14 | 
             
            processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
         | 
| 15 | 
             
            model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
         | 
| 16 | 
            -
             | 
| 17 | 
            -
            # Punctuation and symbols removed from transcriptions by `normalizer`,
            # which joins these entries into a single regex character class.
            # Every entry is exactly one character and appears once.
            # NOTE(review): the Persian comma "،" and question mark "؟" are not
            # listed here -- confirm whether that is intentional.
            chars_to_ignore = [
                ",", "?", ".", "!", ";", ":", "%", "'", '"', "�",
                "#", "«", "»", "(", ")", "؛",
                "“", "‘", "–", "…", "_", "”", "„",
                # The ASCII hyphen must stay last: anywhere else inside an
                # unescaped "[...]" class it forms a range (previously "!-;"),
                # which silently stripped digits and symbols such as "+" and "/".
                "-",
            ]
         | 
| 22 | 
            -
            # Invisible formatting code points that are each replaced by a plain
            # space: zero-width non-joiner (U+200C), zero-width joiner (U+200D),
            # left-to-right mark (U+200E), right-to-left mark (U+200F) and the
            # byte-order mark / zero-width no-break space (U+FEFF).
            chars_to_mapping = dict.fromkeys(
                ("\u200c", "\u200d", "\u200e", "\u200f", "\ufeff"),
                " ",
            )
         | 
| 25 | 
            -
             | 
| 26 | 
            -
            def multiple_replace(text, chars_to_mapping):
                """Replace every occurrence of each mapping key in ``text`` in one pass.

                Args:
                    text: Value to process; it is converted with ``str()`` first.
                    chars_to_mapping: Dict mapping literal substrings to their
                        replacements. Keys are matched literally (regex-escaped).

                Returns:
                    The converted string with all replacements applied. When the
                    mapping is empty the converted string is returned unchanged.
                """
                text = str(text)
                # An empty mapping would produce the empty pattern "", which matches
                # at every position and then fails with KeyError('') in the lookup.
                if not chars_to_mapping:
                    return text

                # Longest keys first so a short key cannot shadow a longer key that
                # starts with it (regex alternation picks the first alternative).
                keys = sorted(chars_to_mapping, key=len, reverse=True)
                pattern = "|".join(map(re.escape, keys))
                return re.sub(pattern, lambda match: chars_to_mapping[match.group()], text)
         | 
| 29 | 
            -
             | 
| 30 | 
            -
            def remove_special_characters(text, chars_to_ignore_regex):
                """Delete every match of ``chars_to_ignore_regex`` from ``text``.

                Args:
                    text: String to clean.
                    chars_to_ignore_regex: Regular expression whose matches are removed.

                Returns:
                    The cleaned text, lower-cased, with a single space appended.
                """
                stripped = re.compile(chars_to_ignore_regex).sub("", text)
                return f"{stripped.lower()} "
         | 
| 33 | 
            -
             | 
| 34 | 
            -
            def normalizer(batch, chars_to_ignore, chars_to_mapping):
                """Normalize the first transcription in ``batch``.

                The text is lower-cased and stripped, U+0307 is replaced by a space,
                the ``chars_to_mapping`` substitutions are applied, and every
                character listed in ``chars_to_ignore`` is removed.

                Args:
                    batch: Indexable whose element ``0`` is the text to normalize;
                        only that first element is used.
                    chars_to_ignore: Iterable of strings whose characters are
                        removed from the text. May be empty.
                    chars_to_mapping: Dict mapping literal substrings to their
                        replacements. May be empty.

                Returns:
                    The normalized, lower-cased text with one trailing space.
                """
                text = batch[0].lower().strip()
                # U+0307 is the combining dot above; it becomes a space.
                text = text.replace("\u0307", " ").strip()

                # Skip the substitution for an empty mapping: the empty pattern
                # would match everywhere and the lookup would raise KeyError('').
                if chars_to_mapping:
                    mapping_regex = "|".join(map(re.escape, chars_to_mapping))
                    text = re.sub(
                        mapping_regex,
                        lambda match: chars_to_mapping[match.group()],
                        text,
                    )

                # Escape the characters before placing them in a "[...]" class.
                # Unescaped, a "-" between two entries forms a range (e.g. "!-;"),
                # which removed digits and other symbols that were never listed.
                ignored = "".join(chars_to_ignore)
                if ignored:
                    text = re.sub(f"[{re.escape(ignored)}]", "", text)

                return text.lower() + " "
         | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
            def speech_file_to_array_fn(batch):
                """Load the audio at ``batch["path"]`` and store it resampled to 16 kHz.

                Args:
                    batch: Mapping with a ``"path"`` entry that ``torchaudio.load``
                        can open. It is modified in place.

                Returns:
                    The same ``batch`` with the resampled NumPy array stored under
                    ``"speech"``.
                """
                speech_array, sampling_rate = torchaudio.load(batch["path"])
                speech_array = speech_array.squeeze().numpy()
                # The sample rates are passed by keyword: recent librosa releases
                # make ``orig_sr`` and ``target_sr`` keyword-only, so the positional
                # form raises TypeError there.
                speech_array = librosa.resample(
                    np.asarray(speech_array),
                    orig_sr=sampling_rate,
                    target_sr=16_000,
                )

                batch["speech"] = speech_array
                return batch
         | 
| 53 |  | 
| 54 | 
             
            def ASR(audio):
         | 
| 55 | 
             
               pipe = pipeline("automatic-speech-recognition", model="SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
         | 
| @@ -70,12 +33,7 @@ def ASR(audio): | |
| 70 | 
             
                    with torch.no_grad():
         | 
| 71 | 
             
                        logits = model(input_values,attention_mask).logits
         | 
| 72 | 
             
                    # Decode the transcription
         | 
| 73 | 
            -
                    #result = normalizer(processor.batch_decode(torch.argmax(logits[0], dim=-1)),chars_to_ignore,chars_to_mapping)
         | 
| 74 | 
             
                    result = processor.decode(torch.argmax(logits[0], dim=-1))
         | 
| 75 | 
            -
                    # max_items = np.random.randint(0, len(result), 10).tolist()
         | 
| 76 | 
            -
                    # for i in max_items:
         | 
| 77 | 
            -
                    #     transcription=result[i]
         | 
| 78 | 
            -
                    #     return transcription
         | 
| 79 | 
             
                    return result
         | 
| 80 | 
             
            iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)
         | 
| 81 | 
             
            iface.launch(share=False)
         | 
|  | |
| 13 |  | 
| 14 | 
             
            processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
         | 
| 15 | 
             
            model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 16 |  | 
| 17 | 
             
            def ASR(audio):
         | 
| 18 | 
             
               pipe = pipeline("automatic-speech-recognition", model="SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
         | 
|  | |
| 33 | 
             
                    with torch.no_grad():
         | 
| 34 | 
             
                        logits = model(input_values,attention_mask).logits
         | 
| 35 | 
             
                    # Decode the transcription
         | 
|  | |
| 36 | 
             
                    result = processor.decode(torch.argmax(logits[0], dim=-1))
         | 
|  | |
|  | |
|  | |
|  | |
| 37 | 
             
                    return result
         | 
| 38 | 
             
            iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)
         | 
| 39 | 
             
            iface.launch(share=False)
         | 
