Upload T5 spelling corrector checkpoint from local

7370fab verified about 1 month ago

2.52 kB

	#!/usr/bin/env python3
	"""
	Demo script for T5 Spelling Corrector Fine-tuned v3 - Indian Financial Text Domain

	This script demonstrates how to load and use the model for correcting OCR errors
	in Indian financial documents, particularly monetary amounts and denominations.

	Run this after cloning the repository.

	Requirements: pip install transformers torch safetensors sentencepiece
	"""

	import torch
	from transformers import T5Tokenizer, T5ForConditionalGeneration

	def load_model(model_name="Ayaan-Sharif/t5-spelling-corrector-finetuned-v3"):
	    """Load the T5 tokenizer and model identified by ``model_name``.

	    Args:
	        model_name: Identifier passed to ``from_pretrained`` for both the
	            tokenizer and the model.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    print(f"Loading model: {model_name}")
	    # Tuple elements are evaluated left to right: tokenizer first, then model.
	    return (
	        T5Tokenizer.from_pretrained(model_name),
	        T5ForConditionalGeneration.from_pretrained(model_name),
	    )

	def correct_spelling(text, tokenizer, model, max_length=50, num_beams=4):
	    """Return the model's corrected version of ``text``.

	    Args:
	        text: Input string to correct; it is tokenized with truncation at
	            512 tokens.
	        tokenizer: Callable tokenizer that also provides ``decode``.
	        model: Model exposing ``generate``.
	        max_length: ``max_length`` value forwarded to ``model.generate``.
	        num_beams: Beam count forwarded to ``model.generate``.

	    Returns:
	        The first generated sequence, decoded to a string with special
	        tokens skipped.
	    """
	    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

	    # Put the encoded tensors on the model's device so generation also works
	    # when the caller has moved the model off the CPU (e.g. model.to("cuda")).
	    # Models without a ``device`` attribute are left untouched.
	    device = getattr(model, "device", None)
	    if device is not None:
	        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

	    with torch.no_grad():  # inference only, no gradient tracking needed
	        outputs = model.generate(
	            **inputs,
	            max_length=max_length,
	            num_beams=num_beams,
	            early_stopping=True,
	            no_repeat_ngram_size=2,
	        )
	    return tokenizer.decode(outputs[0], skip_special_tokens=True)

	def main():
	    """Run the canned demo examples, then an interactive correction prompt.

	    Loads the default model via ``load_model``, prints the correction for
	    each sample phrase, and then reads lines from standard input until the
	    user types ``quit`` or the input stream ends or is interrupted.
	    """
	    # Load model
	    tokenizer, model = load_model()

	    # Sample Indian financial amounts, some containing misspellings
	    test_texts = [
	        "One Hunderd Thousand Rupees",
	        "Five Lakh Twenty Thousand Rupees",
	        "Ten Crore Fifty Lakh Rupees",
	        "Two Thousand Five Hunderd Rupees",
	        "One Lakh Ninety Nine Thousand Rupees",
	    ]

	    print("T5 Spelling Corrector Demo")
	    print("=" * 40)

	    for i, text in enumerate(test_texts, 1):
	        corrected = correct_spelling(text, tokenizer, model)
	        print(f"\nExample {i}:")
	        print(f"Original: {text}")
	        print(f"Corrected: {corrected}")

	    # Interactive mode
	    print("\n" + "=" * 40)
	    print("Interactive mode: Enter text to correct (or 'quit' to exit)")
	    while True:
	        try:
	            user_input = input("Enter text: ").strip()
	        except (EOFError, KeyboardInterrupt):
	            # Ctrl-D, Ctrl-C or exhausted piped stdin: leave the loop quietly
	            # instead of ending the demo with a traceback.
	            print()
	            break
	        if user_input.lower() == 'quit':
	            break
	        if not user_input:
	            print("Please enter some text.")
	            continue
	        corrected = correct_spelling(user_input, tokenizer, model)
	        print(f"Corrected: {corrected}")

	# Run the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()