|
|
|
|
|
"""
|
|
|
Demo script for T5 Spelling Corrector Fine-tuned v3 - Indian Financial Text Domain
|
|
|
|
|
|
This script demonstrates how to load and use the model for correcting OCR errors
|
|
|
in Indian financial documents, particularly monetary amounts and denominations.
|
|
|
|
|
|
Run this after cloning the repository.
|
|
|
|
|
|
Requirements: pip install transformers torch safetensors
|
|
|
"""
|
|
|
|
|
|
import torch
|
|
|
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
|
|
|
|
|
def load_model(model_name="Ayaan-Sharif/t5-spelling-corrector-finetuned-v3"):
    """Fetch the tokenizer and the seq2seq model identified by ``model_name``.

    Args:
        model_name: Identifier handed unchanged to both ``from_pretrained``
            calls (by default a Hugging Face Hub repository id).

    Returns:
        A ``(tokenizer, model)`` tuple, in that order.
    """
    print(f"Loading model: {model_name}")
    # The tuple elements are evaluated left to right, so the tokenizer is
    # still loaded before the model weights.
    return (
        T5Tokenizer.from_pretrained(model_name),
        T5ForConditionalGeneration.from_pretrained(model_name),
    )
|
|
|
|
|
|
def correct_spelling(text, tokenizer, model, max_length=50, num_beams=4):
    """Return the model's corrected version of ``text``.

    The text is tokenized (truncated to at most 512 tokens), the encoded
    tensors are moved to the device the model lives on, and the output is
    produced with beam search under ``torch.no_grad()``.

    Args:
        text: The input string to correct.
        tokenizer: Callable tokenizer that returns a mapping of tensors and
            provides ``decode``.
        model: Generation model providing ``generate``; if it exposes a
            ``device`` attribute the inputs are moved there first.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    encoded = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    # The tokenizer produces CPU tensors; if the caller moved the model to
    # another device (e.g. CUDA), generate() would fail on a device mismatch.
    device = getattr(model, "device", None)
    if device is not None:
        encoded = {
            name: value.to(device) if hasattr(value, "to") else value
            for name, value in encoded.items()
        }

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            # NOTE(review): forbidding repeated 2-grams may also block
            # legitimately repeated number words in an amount -- confirm
            # this setting against the model's recommended usage.
            no_repeat_ngram_size=2,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
|
|
|
|
|
def main():
    """Run the demo: fixed example inputs first, then an interactive prompt.

    Loads the tokenizer and model once via ``load_model()``, prints the
    correction for each built-in example, and then reads lines from stdin
    until the user types ``quit`` (case-insensitive), stdin reaches
    end-of-file, or the prompt is interrupted with Ctrl-C.
    """
    tokenizer, model = load_model()

    # "Hunderd" is a deliberate misspelling so the demo has something to fix.
    test_texts = [
        "One Hunderd Thousand Rupees",
        "Five Lakh Twenty Thousand Rupees",
        "Ten Crore Fifty Lakh Rupees",
        "Two Thousand Five Hunderd Rupees",
        "One Lakh Ninety Nine Thousand Rupees",
    ]

    print("T5 Spelling Corrector Demo")
    print("=" * 40)

    for i, text in enumerate(test_texts, 1):
        corrected = correct_spelling(text, tokenizer, model)
        print(f"\nExample {i}:")
        print(f"Original: {text}")
        print(f"Corrected: {corrected}")

    print("\n" + "=" * 40)
    print("Interactive mode: Enter text to correct (or 'quit' to exit)")
    while True:
        try:
            user_input = input("Enter text: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin closed (Ctrl-D, exhausted pipe) or Ctrl-C at the prompt:
            # finish the prompt line and leave instead of dumping a traceback.
            print()
            break

        if user_input.lower() == 'quit':
            break

        if not user_input:
            print("Please enter some text.")
            continue

        corrected = correct_spelling(user_input, tokenizer, model)
        print(f"Corrected: {corrected}")
|
|
|
|
|
|
# Start the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()