---
base_model: openbmb/MiniCPM-Llama3-V-2_5
library_name: peft
license: mit
datasets:
- magistermilitum/Tridis
- CATMuS/medieval
language:
- la
- fr
- es
- de
pipeline_tag: image-text-to-text
---
# Model Card for Tridis_HTR_MiniCPM
This is a first vision-language model (VLM) able to switch between PEFT adapters for two transcription styles of Western historical manuscripts:
- **ABBR style**: keeps the original abbreviations from the manuscript, rendered with MUFI characters.
- **NOT_ABBR style**: expands the abbreviations and symbols used in the manuscript to produce a normalized text.
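
For illustration (a constructed example, not actual model output): a line image containing `dñs meꝰ` would be transcribed as `dñs meꝰ` in ABBR mode, and expanded to `dominus meus` in NOT_ABBR mode.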
### Model Description
- **Developed by:** Sergio Torres Aguilar
- **Model type:** Multimodal (image-text-to-text)
- **Language(s) (NLP):** Latin, French, Spanish, German
- **License:** MIT
## Uses
The model uses two lightweight PEFT adapters added on top of MiniCPM-Llama3-V-2_5 (2024), one per transcription style, switchable at inference time.
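
The sketch below shows only the adapter-switching mechanism (hub IDs as published on this page); the next section gives the full, runnable pipeline:

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel
import torch

base = AutoModelForCausalLM.from_pretrained(
    "openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True, torch_dtype=torch.bfloat16
)
# Load both adapters onto the same base model under distinct names
model = PeftModel.from_pretrained(base, "magistermilitum/Tridis_HTR_MiniCPM_ABBR", adapter_name="ABBR")
model.load_adapter("magistermilitum/Tridis_HTR_MiniCPM", adapter_name="NOT_ABBR")

model.set_adapter("ABBR")      # keep the original abbreviations
model.set_adapter("NOT_ABBR")  # expand abbreviations into normalized text
```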
## How to Get Started with the Model
The following code produces both transcription styles for a folder of manuscript line images:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
from PIL import Image
import os
from tqdm import tqdm
import json
# Configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_name = "openbmb/MiniCPM-Llama3-V-2_5"
abbr_adapters = "magistermilitum/Tridis_HTR_MiniCPM_ABBR"
not_abbr_adapters = "magistermilitum/Tridis_HTR_MiniCPM"
image_folder = "/your/images/folder/path"
class TranscriptionModel:
    """Handles model loading, adapter switching, and transcription generation."""

    def __init__(self, model_name, abbr_adapters, not_abbr_adapters, device):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.base_model = AutoModelForCausalLM.from_pretrained(
            model_name, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16, token=True
        )
        # Load both adapters and activate the abbreviated style by default
        self.base_model = PeftModel.from_pretrained(self.base_model, abbr_adapters, adapter_name="ABBR")
        self.base_model.load_adapter(not_abbr_adapters, adapter_name="NOT_ABBR")
        self.base_model.set_adapter("ABBR")  # Set default adapter
        self.base_model.to(device).eval()

    def generate(self, adapter, image):
        """Generate a transcription with the given adapter for one line image."""
        # Clear any cached key/values left over from a previous generation
        if hasattr(self.base_model, "past_key_values"):
            self.base_model.past_key_values = None
        self.base_model.set_adapter(adapter)
        msgs = [{"role": "user", "content": [f"Transcribe this manuscript line in mode <{adapter}>:", image]}]
        with torch.no_grad():
            res = self.base_model.chat(image=image, msgs=msgs, tokenizer=self.tokenizer, max_new_tokens=128)
        # Remove <ABBR> and <NOT_ABBR> mode tags from the output
        res = res.replace(f"<{adapter}>", "").replace(f"</{adapter}>", "")
        return res


class TranscriptionPipeline:
    """Handles image processing, transcription, and result saving."""

    def __init__(self, model, image_folder):
        self.model = model
        self.image_folder = image_folder

    def run_inference(self):
        """Process images in the folder and generate both transcription styles."""
        results = []
        image_files = [f for f in os.listdir(self.image_folder) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        for image_file in tqdm(image_files[:20]):  # drop the [:20] slice to process the whole folder
            image = Image.open(os.path.join(self.image_folder, image_file)).convert("RGB")
            print(f"\nProcessing image: {image_file}")
            # Generate transcriptions with both adapters
            transcriptions = {
                adapter: self.model.generate(adapter, image)
                for adapter in ["ABBR", "NOT_ABBR"]
            }
            for adapter, res in transcriptions.items():
                print(f"Mode ({adapter}): {res}")
            results.append({"image": image_file, "transcriptions": transcriptions})
            # image.show()  # Optional: display each line image
        # Save results to a JSON file
        with open("transcriptions_results.json", "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)


# Initialize and run the pipeline
model = TranscriptionModel(model_name, abbr_adapters, not_abbr_adapters, device)
TranscriptionPipeline(model, image_folder).run_inference()
```
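
The script writes `transcriptions_results.json`, pairing each image file with both transcription styles. The shape follows the `results.append(...)` call above; the file name and ellipses are placeholders:

```json
[
  {
    "image": "line_0001.jpg",
    "transcriptions": {
      "ABBR": "…",
      "NOT_ABBR": "…"
    }
  }
]
```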
## Citation
```bibtex
@misc{torres_aguilar:hal-04983305,
  title = {Dual-Style Transcription of Historical Manuscripts based on Multimodal Small Language Models with Switchable Adapters},
  author = {Torres Aguilar, Sergio},
  url = {https://hal.science/hal-04983305},
  year = {2025},
  note = {working paper or preprint}
}
```
### Framework versions
- PEFT 0.14.1.dev0