# Repository-viewer residue kept for reference (not part of the program):
#   Abhiya's picture
#   Update app.py
#   d964e98 verified
import gradio as gr
from model import BPEPunjabiTokenizer
# Module-level tokenizer shared by the handlers below; it stays None until
# initialize_tokenizer() assigns the loaded instance.
tokenizer = None
def initialize_tokenizer():
    """Load the saved BPE tokenizer into the module-level ``tokenizer``.

    Returns:
        A status string: a success message when the load worked, otherwise
        an error message that includes the text of the raised exception.
    """
    global tokenizer
    try:
        # Load the pre-trained tokenizer from disk instead of training it.
        loaded = BPEPunjabiTokenizer.load(
            directory="./saved_models",
            filename="bpe_tokenizer.pkl",
        )
    except Exception as exc:
        return f"❌ Error loading tokenizer: {str(exc)}"

    # Only publish the tokenizer once loading has fully succeeded.
    tokenizer = loaded
    return "✅ Tokenizer loaded successfully!"
def process_text(text):
    """Encode *text*, decode it back, and summarize the round trip.

    Returns:
        A 5-tuple: the input text, the formatted token list, the decoded
        text, a "Match: ..." flag comparing decoded and input text, and a
        "Compression Ratio: ..." string (characters per token, 0 when no
        tokens were produced). If the tokenizer is not loaded or any step
        raises, the first element carries the message and the remaining
        four elements are empty strings.
    """
    blanks = ("",) * 4

    if tokenizer is None:
        return ("Please initialize the tokenizer first!", *blanks)

    try:
        token_ids = tokenizer.encode(text)
        round_trip = tokenizer.decode(token_ids)

        token_count = len(token_ids)
        # Guard the division: an empty encoding yields a ratio of 0.
        ratio = len(text) / token_count if token_count > 0 else 0

        tokens_line = f"Tokens: {token_ids}"
        match_line = f"Match: {round_trip == text}"
        ratio_line = f"Compression Ratio: {ratio:.2f}"
        return (text, tokens_line, round_trip, match_line, ratio_line)
    except Exception as exc:
        reason = str(exc)
        return (f"Error: {reason}", *blanks)
def decode_tokens(token_string):
    """Decode a user-typed sequence of token ids back into text.

    The ids may be separated by whitespace, commas, or both, and the whole
    sequence may optionally be wrapped in one pair of square brackets
    (for example ``1 23 45``, ``1,23,45`` or ``[1, 23, 45]``).

    Args:
        token_string: Raw text from the token input box.

    Returns:
        The decoded text, or a human-readable error message when the
        tokenizer is not loaded, the input is not a list of integers, or
        decoding fails.
    """
    if tokenizer is None:
        return "Please initialize the tokenizer first!"
    try:
        cleaned = token_string.strip()
        # Accept a bracketed list so a printed token list can be pasted in.
        if cleaned.startswith("[") and cleaned.endswith("]"):
            cleaned = cleaned[1:-1]
        # Commas are separators: turn them into spaces rather than deleting
        # them, otherwise "1,23" would be fused into the single id 123.
        tokens = [int(piece) for piece in cleaned.replace(",", " ").split()]
        return tokenizer.decode(tokens)
    except ValueError:
        return "Error decoding tokens: Please enter space-separated integers only."
    except Exception as e:
        return f"Error decoding tokens: {str(e)}"
# Create the Gradio interface
def _process_for_display(text):
    """Return only the process_text results that have a visible output box.

    process_text yields five values, but the interface shows just the
    original text and the encoded tokens; slicing keeps the number of
    returned values equal to the number of wired output components.
    """
    return process_text(text)[:2]


with gr.Blocks(title="Punjabi BPE Tokenizer") as demo:
    gr.Markdown("""
# Punjabi BPE Tokenizer
This tokenizer implements Byte-Pair Encoding for Punjabi text.
**Steps to use:**
1. Click 'Initialize Tokenizer' button and wait for tokenizer initialization (approx 45 sec).
2. Enter Punjabi text in the input box or use the example sentences
3. Click 'Process'
4. Optionally, input token sequences to decode manually.
""")

    # Initialize section
    with gr.Row():
        init_btn = gr.Button("Initialize Tokenizer", variant="primary")
        status = gr.Textbox(label="Status", interactive=False)

    # Main interface
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Punjabi Text",
                placeholder="ਇੱਥੇ ਪੰਜਾਬੀ ਟੈਕਸਟ ਲਿਖੋ",
                lines=3
            )
            process_btn = gr.Button("Process", variant="secondary")
        with gr.Column():
            original_out = gr.Textbox(label="Original Text", interactive=False)
            encoded_out = gr.Textbox(label="Encoded Tokens", interactive=False)
            # NOTE: process_text also computes the decoded text, the
            # round-trip match flag and the compression ratio; those three
            # values currently have no output box.

    # Token decoding panel
    with gr.Row():
        token_input = gr.Textbox(
            label="Input Tokens (space-separated integers)",
            placeholder="e.g., 1 23 45",
            lines=2
        )
        decode_btn = gr.Button("Decode Tokens", variant="secondary")
        token_decoded_out = gr.Textbox(label="Decoded from Tokens", interactive=False)

    # Example sentences
    gr.Examples(
        examples=[
            ["ਤੁਸੀਂ ਕੀ ਕਰ ਰਹੇ ਹੋ?"],
            ["ਮੈਨੂੰ ਚਾਹ ਪੀਣੀ ਹੈ"],
            ["ਇਹ ਬਹੁਤ ਵਧੀਆ ਹੈ"],
            ["ਇਹ ਕਿਤਾਬ ਬਹੁਤ ਦਿਲਚਸਪ ਹੈ"]
        ],
        inputs=text_input,
        outputs=[original_out, encoded_out],
        fn=_process_for_display,
        # Do not pre-compute example outputs: the tokenizer is still None
        # when the interface is built, so cached results would permanently
        # be the "Please initialize the tokenizer first!" message. Run the
        # handler live on click instead.
        cache_examples=False,
        run_on_click=True
    )

    # Set up event handlers
    init_btn.click(
        fn=initialize_tokenizer,
        outputs=status
    )
    process_btn.click(
        fn=_process_for_display,
        inputs=text_input,
        outputs=[original_out, encoded_out]
    )
    decode_btn.click(
        fn=decode_tokens,
        inputs=token_input,
        outputs=token_decoded_out
    )
    # Also process on pressing Enter in the input box
    text_input.submit(
        fn=_process_for_display,
        inputs=text_input,
        outputs=[original_out, encoded_out]
    )
# Launch the interface only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()