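"""Gradio demo for Persian text processing with Hazm.

Supports normalization (with configurable Normalizer options), word tokenization,
lemmatization, POS tagging, and chunking of Persian text.
"""
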
import gradio as gr
from hazm import Normalizer, word_tokenize, Lemmatizer, POSTagger, Chunker
# Initialize Hazm components
lemmatizer = Lemmatizer()
pos_tagger = POSTagger(model='resources/pos_tagger.model') # Load POS Tagger model
chunker = Chunker(model='resources/chunker.model') # Load Chunker model
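# NOTE: the POS tagger and chunker model files above are assumed to live in a local
# `resources/` directory (e.g. Hazm's pretrained pos_tagger.model and chunker.model);
# adjust the paths if the models are stored elsewhere.
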
def process_text(text, operation, correct_spacing, remove_diacritics, remove_specials_chars, decrease_repeated_chars, persian_style, persian_numbers, unicodes_replacement, seperate_mi):
    # Initialize the Normalizer with user-selected parameters
    normalizer = Normalizer(
        correct_spacing=correct_spacing,
        remove_diacritics=remove_diacritics,
        remove_specials_chars=remove_specials_chars,
        decrease_repeated_chars=decrease_repeated_chars,
        persian_style=persian_style,
        persian_numbers=persian_numbers,
        unicodes_replacement=unicodes_replacement,
        seperate_mi=seperate_mi
    )

    result = ""
    if operation == "normalize":
        result = normalizer.normalize(text)
    elif operation == "tokenize":
        tokens = word_tokenize(text)
        result = " ".join(tokens)  # Show tokens as a space-separated string
    elif operation == "lemmatize":
        lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
        result = " ".join(lemmas)  # Show lemmas as a space-separated string
    elif operation == "chunk":
        # Tokenize and tag the input text
        tokens = word_tokenize(text)
        pos_tags = pos_tagger.tag(tokens)  # Generate POS tags
        chunks = chunker.parse(pos_tags)  # Pass tagged tokens to Chunker
        result = str(chunks)  # Show chunks as text
    elif operation == "pos_tag":
        tokens = word_tokenize(text)
        pos_tags = pos_tagger.tag(tokens)
        result = " ".join([f"{token}/{tag}" for token, tag in pos_tags])  # Format: token/POS
    return result

def toggle_normalization_options(operation):
    # Show normalization options only if 'normalize' is selected
    is_normalize = (operation == "normalize")
    return gr.update(visible=is_normalize)  # Toggle visibility of the normalization options column

# Define Gradio interface
with gr.Blocks() as demo:
gr.Markdown("# Persian Text Processor with Hazm")
gr.Markdown("Select an operation and, if applicable, adjust normalization parameters to process the input text using Hazm.")
with gr.Row():
input_text = gr.Textbox(lines=10, label="Input Text", placeholder="Enter Persian text here...")
with gr.Row():
operation = gr.Radio(
choices=['normalize', 'tokenize', 'lemmatize', 'chunk', 'pos_tag'],
label="Select Operation",
value='normalize',
info="Choose the type of text processing operation to perform."
)
with gr.Column(visible=True) as normalization_options:
correct_spacing = gr.Checkbox(value=True, label="Correct Spacing", info="Adjusts spaces between words for proper formatting.")
remove_diacritics = gr.Checkbox(value=True, label="Remove Diacritics", info="Eliminates diacritical marks from the text.")
remove_specials_chars = gr.Checkbox(value=True, label="Remove Special Characters", info="Removes non-alphanumeric characters.")
decrease_repeated_chars = gr.Checkbox(value=True, label="Decrease Repeated Characters", info="Reduces sequences of repeated characters to a single character.")
persian_style = gr.Checkbox(value=True, label="Persian Style", info="Applies standard Persian typography rules.")
persian_numbers = gr.Checkbox(value=True, label="Persian Numbers", info="Converts Arabic numbers to Persian numbers.")
unicodes_replacement = gr.Checkbox(value=True, label="Unicodes Replacement", info="Replaces characters with their standard Unicode equivalents.")
seperate_mi = gr.Checkbox(value=True, label="Separate 'می'", info="Separates the Persian prefix 'می' from verbs.")
operation.change(
fn=toggle_normalization_options,
inputs=operation,
outputs=normalization_options
)
output_text = gr.Textbox(label="Processed Text", lines=10, interactive=False, show_copy_button=True)
submit_button = gr.Button("Process Text")
submit_button.click(
fn=process_text,
inputs=[
input_text, operation,
correct_spacing, remove_diacritics, remove_specials_chars,
decrease_repeated_chars, persian_style, persian_numbers,
unicodes_replacement, seperate_mi
],
outputs=output_text
)
if __name__ == "__main__":
    demo.launch()
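
# To run this locally (assuming the dependencies and the model files noted above are in place):
#   pip install gradio hazm
#   python <path-to-this-script>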