File size: 4,651 Bytes
d14409f
cc082f0
d14409f
 
 
cc082f0
 
d14409f
3037d70
33901fb
5c9ffe3
 
 
 
 
 
 
 
 
 
3037d70
 
 
 
 
 
d14409f
3037d70
 
d14409f
3037d70
 
cc082f0
 
 
 
3037d70
0922873
 
 
 
 
d14409f
 
0922873
 
 
d8cb719
0922873
33901fb
0922873
 
 
 
 
d8cb719
0922873
 
 
 
 
d8cb719
 
0922873
 
 
d8cb719
 
 
 
 
 
 
 
 
0922873
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d14409f
 
0922873
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import gradio as gr
from hazm import Normalizer, word_tokenize, Lemmatizer, POSTagger, Chunker

# Initialize Hazm components once at import time.  The POS tagger and the
# chunker both load pretrained models from the local `resources/` directory,
# so those files must exist relative to the working directory at startup.
lemmatizer = Lemmatizer()
pos_tagger = POSTagger(model='resources/pos_tagger.model')  # Load POS Tagger model
chunker = Chunker(model='resources/chunker.model')  # Load Chunker model

def process_text(text, operation, correct_spacing, remove_diacritics, remove_specials_chars, decrease_repeated_chars, persian_style, persian_numbers, unicodes_replacement, seperate_mi):
    """Apply the selected Hazm operation to *text* and return the result string.

    Parameters
    ----------
    text : str
        Input Persian text.
    operation : str
        One of 'normalize', 'tokenize', 'lemmatize', 'chunk', 'pos_tag'.
        Any other value yields an empty string (same as the original behavior).
    correct_spacing, remove_diacritics, remove_specials_chars,
    decrease_repeated_chars, persian_style, persian_numbers,
    unicodes_replacement, seperate_mi : bool
        Flags forwarded to ``hazm.Normalizer``; they are only consulted when
        *operation* is 'normalize'.  The spellings (``remove_specials_chars``,
        ``seperate_mi``) intentionally mirror Hazm's own keyword names.

    Returns
    -------
    str
        The processed text.
    """
    if operation == "normalize":
        # Build the Normalizer lazily: its flags matter only on this path, so
        # the other operations no longer pay for constructing it.
        normalizer = Normalizer(
            correct_spacing=correct_spacing,
            remove_diacritics=remove_diacritics,
            remove_specials_chars=remove_specials_chars,
            decrease_repeated_chars=decrease_repeated_chars,
            persian_style=persian_style,
            persian_numbers=persian_numbers,
            unicodes_replacement=unicodes_replacement,
            seperate_mi=seperate_mi,
        )
        return normalizer.normalize(text)

    # Every remaining operation starts from the same token list; tokenize once.
    tokens = word_tokenize(text)

    if operation == "tokenize":
        return " ".join(tokens)  # Show tokens as a space-separated string
    if operation == "lemmatize":
        # Show lemmas as a space-separated string
        return " ".join(lemmatizer.lemmatize(token) for token in tokens)
    if operation == "chunk":
        # POS-tag the tokens, then pass the tagged tokens to the Chunker and
        # render the resulting parse tree as text.
        return str(chunker.parse(pos_tagger.tag(tokens)))
    if operation == "pos_tag":
        # Format each pair as token/POS
        return " ".join(f"{token}/{tag}" for token, tag in pos_tagger.tag(tokens))

    # Unknown operation: preserve the original fall-through result.
    return ""

def toggle_normalization_options(operation):
    """Show or hide the normalization-options column based on *operation*.

    The ``operation.change`` event is wired to a single output component
    (the ``normalization_options`` Column), so exactly one update object
    must be returned.  The original returned a list of 8 updates — one per
    checkbox — which does not match the single-output wiring and makes
    Gradio raise a "too many output values" error.  Toggling the enclosing
    Column hides all eight checkboxes at once.
    """
    return gr.update(visible=(operation == "normalize"))

# Define Gradio interface.
# NOTE: component creation order inside the `with` blocks determines the
# rendered layout, so statement order here is significant.
with gr.Blocks() as demo:
    gr.Markdown("# Persian Text Processor with Hazm")
    gr.Markdown("Select an operation and, if applicable, adjust normalization parameters to process the input text using Hazm.")
    
    with gr.Row():
        input_text = gr.Textbox(lines=10, label="Input Text", placeholder="Enter Persian text here...")
    
    with gr.Row():
        # Radio selecting which Hazm operation process_text() should run.
        operation = gr.Radio(
            choices=['normalize', 'tokenize', 'lemmatize', 'chunk', 'pos_tag'],
            label="Select Operation",
            value='normalize',
            info="Choose the type of text processing operation to perform."
        )
    
    # Column of Normalizer flags; visible initially because the default
    # operation is 'normalize'.  Checkbox labels map 1:1 to the keyword
    # arguments of hazm.Normalizer (including its 'seperate_mi' spelling).
    with gr.Column(visible=True) as normalization_options:
        correct_spacing = gr.Checkbox(value=True, label="Correct Spacing", info="Adjusts spaces between words for proper formatting.")
        remove_diacritics = gr.Checkbox(value=True, label="Remove Diacritics", info="Eliminates diacritical marks from the text.")
        remove_specials_chars = gr.Checkbox(value=True, label="Remove Special Characters", info="Removes non-alphanumeric characters.")
        decrease_repeated_chars = gr.Checkbox(value=True, label="Decrease Repeated Characters", info="Reduces sequences of repeated characters to a single character.")
        persian_style = gr.Checkbox(value=True, label="Persian Style", info="Applies standard Persian typography rules.")
        persian_numbers = gr.Checkbox(value=True, label="Persian Numbers", info="Converts Arabic numbers to Persian numbers.")
        unicodes_replacement = gr.Checkbox(value=True, label="Unicodes Replacement", info="Replaces characters with their standard Unicode equivalents.")
        seperate_mi = gr.Checkbox(value=True, label="Separate 'می'", info="Separates the Persian prefix 'می' from verbs.")

    # Re-evaluate the options column's visibility whenever the operation
    # changes; the single output here is the Column itself.
    operation.change(
        fn=toggle_normalization_options,
        inputs=operation,
        outputs=normalization_options
    )
    
    output_text = gr.Textbox(label="Processed Text", lines=10, interactive=False, show_copy_button=True)
    
    # Feed the text, the chosen operation, and all eight normalizer flags to
    # process_text(); argument order must match its signature.
    submit_button = gr.Button("Process Text")
    submit_button.click(
        fn=process_text,
        inputs=[
            input_text, operation,
            correct_spacing, remove_diacritics, remove_specials_chars,
            decrease_repeated_chars, persian_style, persian_numbers,
            unicodes_replacement, seperate_mi
        ],
        outputs=output_text
    )

if __name__ == "__main__":
    demo.launch()