Ayaan-Sharif's picture
Upload T5 spelling corrector checkpoint from local
7370fab verified
#!/usr/bin/env python3
"""
Demo script for T5 Spelling Corrector Fine-tuned v3 - Indian Financial Text Domain
This script demonstrates how to load and use the model for correcting OCR errors
in Indian financial documents, particularly monetary amounts and denominations.
Run this after cloning the repository.
Requirements: pip install transformers torch safetensors sentencepiece
"""
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
def load_model(model_name="Ayaan-Sharif/t5-spelling-corrector-finetuned-v3"):
    """Fetch the tokenizer and seq2seq model for ``model_name``.

    Prints a short progress line, then resolves both artifacts through
    ``from_pretrained`` (tokenizer first, model second).

    Args:
        model_name: Identifier handed to ``from_pretrained``; defaults to
            the fine-tuned v3 spelling-corrector checkpoint.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    print(f"Loading model: {model_name}")
    return (
        T5Tokenizer.from_pretrained(model_name),
        T5ForConditionalGeneration.from_pretrained(model_name),
    )
def correct_spelling(
    text,
    tokenizer,
    model,
    max_length=50,
    num_beams=4,
    no_repeat_ngram_size=2,
    max_input_length=512,
):
    """Correct spelling in the input text.

    Args:
        text: The string to correct.
        tokenizer: Tokenizer that is callable as
            ``tokenizer(text, return_tensors="pt", ...)`` and provides
            ``decode``.
        model: Model exposing ``generate``.
        max_length: ``max_length`` forwarded to ``model.generate`` (limit
            on the generated sequence).
        num_beams: Beam-search width forwarded to ``model.generate``.
        no_repeat_ngram_size: Forwarded to ``model.generate``. The default
            of 2 is kept for backward compatibility; per the Transformers
            generation docs a positive value allows each n-gram of that
            size to occur only once, which can alter amounts that
            legitimately repeat a phrase (e.g. "Ninety Nine ... Ninety
            Nine"). Pass 0 to disable the constraint.
        max_input_length: Token limit at which the input is truncated
            before generation.

    Returns:
        The first generated sequence decoded with special tokens skipped.
        A blank (empty or whitespace-only) string is returned unchanged
        without calling the tokenizer or the model.
    """
    # There is nothing to correct in blank input, and running generation
    # on it would only produce unrelated text.
    if isinstance(text, str) and not text.strip():
        return text

    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=no_repeat_ngram_size,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
def main():
    """Run the demo: correct a fixed set of samples, then prompt interactively.

    Loads the default model once, prints the original and corrected form of
    each sample amount, and then reads lines from standard input until the
    user types 'quit'. End-of-input (closed or exhausted stdin) and Ctrl-C
    at the prompt also end the session cleanly instead of raising.
    """
    # Load model
    tokenizer, model = load_model()

    # Sample amounts in Indian financial wording. Two contain the
    # misspelling "Hunderd"; the rest are already spelled correctly.
    test_texts = [
        "One Hunderd Thousand Rupees",
        "Five Lakh Twenty Thousand Rupees",
        "Ten Crore Fifty Lakh Rupees",
        "Two Thousand Five Hunderd Rupees",
        "One Lakh Ninety Nine Thousand Rupees",
    ]

    print("T5 Spelling Corrector Demo")
    print("=" * 40)
    for i, text in enumerate(test_texts, 1):
        corrected = correct_spelling(text, tokenizer, model)
        print(f"\nExample {i}:")
        print(f"Original: {text}")
        print(f"Corrected: {corrected}")

    # Interactive mode
    print("\n" + "=" * 40)
    print("Interactive mode: Enter text to correct (or 'quit' to exit)")
    while True:
        try:
            user_input = input("Enter text: ").strip()
        except (EOFError, KeyboardInterrupt):
            # input() raises EOFError when stdin is closed or piped input
            # runs out, and KeyboardInterrupt on Ctrl-C; treat both as a
            # request to leave. The bare print() ends the prompt line.
            print()
            break
        if user_input.lower() == 'quit':
            break
        if not user_input:
            print("Please enter some text.")
            continue
        corrected = correct_spelling(user_input, tokenizer, model)
        print(f"Corrected: {corrected}")
# Start the demo only when this file is executed as a script; importing it
# as a module must not load the model or open the prompt.
if __name__ == "__main__":
    main()