File size: 4,651 Bytes
d14409f
cc082f0
d14409f
 
 
cc082f0
 
d14409f
3037d70
33901fb
5c9ffe3
 
 
 
 
 
 
 
 
 
3037d70
 
 
 
 
 
d14409f
3037d70
 
d14409f
3037d70
 
cc082f0
 
 
 
3037d70
0922873
 
 
 
 
d14409f
 
0922873
 
 
d8cb719
0922873
33901fb
0922873
 
 
 
 
d8cb719
0922873
 
 
 
 
d8cb719
 
0922873
 
 
d8cb719
 
 
 
 
 
 
 
 
0922873
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d14409f
 
0922873
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import gradio as gr
from hazm import Normalizer, word_tokenize, Lemmatizer, POSTagger, Chunker

# Initialize Hazm components once at import time.  The POS tagger and the
# chunker both load pretrained models from the local `resources/` directory,
# so those files must exist relative to the working directory at startup.
lemmatizer = Lemmatizer()
pos_tagger = POSTagger(model='resources/pos_tagger.model')  # Load POS Tagger model
chunker = Chunker(model='resources/chunker.model')  # Load Chunker model

def process_text(text, operation, correct_spacing, remove_diacritics, remove_specials_chars, decrease_repeated_chars, persian_style, persian_numbers, unicodes_replacement, seperate_mi):
    """Apply the selected Hazm operation to *text* and return the result string.

    Parameters
    ----------
    text : str
        Input Persian text.
    operation : str
        One of 'normalize', 'tokenize', 'lemmatize', 'chunk', 'pos_tag'.
        Any other value yields an empty string (same as the original behavior).
    correct_spacing, remove_diacritics, remove_specials_chars,
    decrease_repeated_chars, persian_style, persian_numbers,
    unicodes_replacement, seperate_mi : bool
        Flags forwarded to ``hazm.Normalizer``; they are only consulted when
        *operation* is 'normalize'.  The spellings (``remove_specials_chars``,
        ``seperate_mi``) intentionally mirror Hazm's own keyword names.

    Returns
    -------
    str
        The processed text.
    """
    if operation == "normalize":
        # Build the Normalizer lazily: its flags matter only on this path, so
        # the other operations no longer pay for constructing it.
        normalizer = Normalizer(
            correct_spacing=correct_spacing,
            remove_diacritics=remove_diacritics,
            remove_specials_chars=remove_specials_chars,
            decrease_repeated_chars=decrease_repeated_chars,
            persian_style=persian_style,
            persian_numbers=persian_numbers,
            unicodes_replacement=unicodes_replacement,
            seperate_mi=seperate_mi,
        )
        return normalizer.normalize(text)

    # Every remaining operation starts from the same token list; tokenize once.
    tokens = word_tokenize(text)

    if operation == "tokenize":
        return " ".join(tokens)  # Show tokens as a space-separated string
    if operation == "lemmatize":
        # Show lemmas as a space-separated string
        return " ".join(lemmatizer.lemmatize(token) for token in tokens)
    if operation == "chunk":
        # POS-tag the tokens, then pass the tagged tokens to the Chunker and
        # render the resulting parse tree as text.
        return str(chunker.parse(pos_tagger.tag(tokens)))
    if operation == "pos_tag":
        # Format each pair as token/POS
        return " ".join(f"{token}/{tag}" for token, tag in pos_tagger.tag(tokens))

    # Unknown operation: preserve the original fall-through result.
    return ""

def toggle_normalization_options(operation):
    """Show or hide the normalization-options column based on *operation*.

    The ``operation.change`` event is wired to a single output component
    (the ``normalization_options`` Column), so exactly one update object
    must be returned.  The original returned a list of 8 updates — one per
    checkbox — which does not match the single-output wiring and makes
    Gradio raise a "too many output values" error.  Toggling the enclosing
    Column hides all eight checkboxes at once.
    """
    return gr.update(visible=(operation == "normalize"))

# Define Gradio interface.
# NOTE: component creation order inside the `with` blocks determines the
# rendered layout, so statement order here is significant.
with gr.Blocks() as demo:
    gr.Markdown("# Persian Text Processor with Hazm")
    gr.Markdown("Select an operation and, if applicable, adjust normalization parameters to process the input text using Hazm.")
    
    with gr.Row():
        input_text = gr.Textbox(lines=10, label="Input Text", placeholder="Enter Persian text here...")
    
    with gr.Row():
        # Radio selecting which Hazm operation process_text() should run.
        operation = gr.Radio(
            choices=['normalize', 'tokenize', 'lemmatize', 'chunk', 'pos_tag'],
            label="Select Operation",
            value='normalize',
            info="Choose the type of text processing operation to perform."
        )
    
    # Column of Normalizer flags; visible initially because the default
    # operation is 'normalize'.  Checkbox labels map 1:1 to the keyword
    # arguments of hazm.Normalizer (including its 'seperate_mi' spelling).
    with gr.Column(visible=True) as normalization_options:
        correct_spacing = gr.Checkbox(value=True, label="Correct Spacing", info="Adjusts spaces between words for proper formatting.")
        remove_diacritics = gr.Checkbox(value=True, label="Remove Diacritics", info="Eliminates diacritical marks from the text.")
        remove_specials_chars = gr.Checkbox(value=True, label="Remove Special Characters", info="Removes non-alphanumeric characters.")
        decrease_repeated_chars = gr.Checkbox(value=True, label="Decrease Repeated Characters", info="Reduces sequences of repeated characters to a single character.")
        persian_style = gr.Checkbox(value=True, label="Persian Style", info="Applies standard Persian typography rules.")
        persian_numbers = gr.Checkbox(value=True, label="Persian Numbers", info="Converts Arabic numbers to Persian numbers.")
        unicodes_replacement = gr.Checkbox(value=True, label="Unicodes Replacement", info="Replaces characters with their standard Unicode equivalents.")
        seperate_mi = gr.Checkbox(value=True, label="Separate 'می'", info="Separates the Persian prefix 'می' from verbs.")

    # Re-evaluate the options column's visibility whenever the operation
    # changes; the single output here is the Column itself.
    operation.change(
        fn=toggle_normalization_options,
        inputs=operation,
        outputs=normalization_options
    )
    
    output_text = gr.Textbox(label="Processed Text", lines=10, interactive=False, show_copy_button=True)
    
    # Feed the text, the chosen operation, and all eight normalizer flags to
    # process_text(); argument order must match its signature.
    submit_button = gr.Button("Process Text")
    submit_button.click(
        fn=process_text,
        inputs=[
            input_text, operation,
            correct_spacing, remove_diacritics, remove_specials_chars,
            decrease_repeated_chars, persian_style, persian_numbers,
            unicodes_replacement, seperate_mi
        ],
        outputs=output_text
    )

if __name__ == "__main__":
    demo.launch()