Ultron
Collection
Multi-modal conversation model & Multi-modal dialogue summarization model
•
5 items
•
Updated
•
1
🏠 Homepage | 💻 Github | 📄 Arxiv | 📕 PDF
🚨 Disclaimer: All models and datasets are intended for research purposes only.
import logging
from PIL import Image
import torch
from transformers import (
AutoModelForVision2Seq,
BitsAndBytesConfig,
AutoProcessor,
)
# Prompt template for Ultron. It is filled in with str.format() using the
# {name}, {date} and {dialogue} fields. The <RET>, <h> and </h> markers are
# the same ones parse_ultron_output() looks for in the generated text.
ULTRON_TEMPLATE = (
    "You are an excellent image sharing system that generates <RET> token "
    "with the following image description. "
    "The image description must be provided with the following format: "
    "<RET> <h> image description </h>. "
    "The following conversation is between {name} and AI assistant on {date}. "
    "The given image is {name}'s appearance.\n"
    "{dialogue}"
)
# Ultron model initialization
def load_ultron_model(model_path, *, quantize_4bit=False):
    """
    Loads the Ultron model and processor.

    Args:
        model_path (str): Path to the pre-trained model.
        quantize_4bit (bool): When True, pass a 4-bit NF4 BitsAndBytesConfig
            (double quantization, bfloat16 compute) to ``from_pretrained``.
            Defaults to False, which loads the weights in bfloat16 without
            quantization, exactly as before this flag existed.
    Returns:
        model: Loaded Vision-to-Seq model, switched to eval mode.
        processor: Processor loaded from the base checkpoint
            'meta-llama/Llama-3.2-11B-Vision-Instruct' (not from model_path).
    """
    logging.info("Loading Ultron model from %s...", model_path)

    model_kwargs = dict(
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        # NOTE(review): trust_remote_code=True lets code shipped with the
        # checkpoint run locally; only use with checkpoints you trust.
        trust_remote_code=True,
        device_map="auto",
    )

    # The quantization config used to be built unconditionally and then
    # dropped; it is now only built when requested and is actually handed
    # to from_pretrained.
    if quantize_4bit:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4',
        )

    processor = AutoProcessor.from_pretrained(
        'meta-llama/Llama-3.2-11B-Vision-Instruct', torch_dtype=torch.bfloat16
    )
    model = AutoModelForVision2Seq.from_pretrained(
        model_path,
        **model_kwargs
    ).eval()

    logging.info("Ultron model loaded successfully.")
    return model, processor
# Run Ultron model
def run_ultron_model(model, processor, dialogue, name='Tom', date='2023.04.20', face_image_path='sample_face.png'):
    """
    Runs the Ultron model with a given dialogue, name, and image.

    Args:
        model: Pre-trained model instance.
        processor: Processor for model input.
        dialogue (str): Input dialogue for the assistant.
        name (str): Name of the user.
        date (str): Date of the conversation.
        face_image_path (str): Path to the face image file.
    Returns:
        str: Description of the shared image, as extracted by
            parse_ultron_output() from the newly generated text only.
    """
    logging.info("Running Ultron model...")

    # Close the file handle as soon as the pixel data has been converted.
    with Image.open(face_image_path) as raw_image:
        face_image = raw_image.convert("RGB")

    prompt = ULTRON_TEMPLATE.format(
        dialogue=dialogue,
        name=name,
        date=date
    )
    messages = [
        {
            "content": [
                {"text": prompt, "type": "text"},
                {"type": "image"}
            ],
            "role": "user"
        },
    ]

    logging.info("Preparing input for Ultron model...")
    prompt_input = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Follow the model's own device instead of hard-coding 'cuda'; the model
    # may have been placed by device_map="auto".
    inputs = processor(face_image, prompt_input, return_tensors='pt').to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        logging.info("Generating output from Ultron model...")
        output = model.generate(
            **inputs,
            do_sample=True,
            temperature=0.9,
            max_new_tokens=512,
            top_p=1.0,
            use_cache=True,
            num_beams=1,
        )

    # Decode only the newly generated tokens. The prompt itself contains the
    # literal "<RET> <h> image description </h>" format example, so decoding
    # the echoed prompt would make the parser always find <RET> and could
    # return the placeholder text instead of the model's answer.
    generated_tokens = output[0][prompt_length:]
    # The keyword was previously misspelled as `skip_special_token`.
    # NOTE(review): assumes <RET>/<h>/</h> are not registered as special
    # tokens in the processor's tokenizer (load_ultron_model uses the base
    # Llama processor) - confirm if a different processor is passed in.
    output_text = processor.decode(generated_tokens, skip_special_tokens=True)

    logging.info("Output generated successfully from Ultron model.")
    return parse_ultron_output(output_text)
# Parse Ultron output
def parse_ultron_output(output):
    """
    Pulls the image description out of the generated text.

    When the text contains '<RET>', the result is whatever follows the last
    '<h>' up to the next '</h>', with surrounding whitespace stripped (a
    missing tag simply leaves that side of the text uncut). When '<RET>' is
    absent, a warning is logged and the text is returned untouched.

    Args:
        output (str): The generated output text from the model.
    Returns:
        str: Extracted image description, or the original text.
    """
    logging.info("Parsing output from Ultron model...")

    # Guard clause: nothing to extract without the retrieval marker.
    if '<RET>' not in output:
        logging.warning("<RET> not found in output.")
        return output

    after_last_open = output.rpartition('<h>')[2]
    description = after_last_open.partition('</h>')[0]
    return description.strip()
# Example usage
def main():
    """
    Example usage of Ultron model.

    Loads the "passing2961/Ultron-11B" checkpoint, runs it on a short sample
    dialogue with the default name, date and face image, and logs the
    resulting image description.
    """
    # Without this the root logger stays at WARNING and every logging.info()
    # call in this script, including the final result, is silently dropped.
    logging.basicConfig(level=logging.INFO)

    model_path = "passing2961/Ultron-11B"
    model, processor = load_ultron_model(model_path)

    # The dialogue ends with an open assistant turn (note the trailing space).
    dialogue = (
        "Tom: I have so much work at the office, I'm exhausted...\n"
        "Personal AI Assistant: How can I help you feel less tired?\n"
        "Tom: Hmm.. I miss my dog Star at home.\n"
        "Personal AI Assistant: "
    )

    image_description = run_ultron_model(model, processor, dialogue)
    logging.info("Image description generated: %s", image_description)


if __name__ == "__main__":
    main()
🚨 Ultron-11B is intended to be used for research purposes only.
This work was supported by a grant of the KAIST-KT joint research project through AI Tech Lab, Institute of Convergence Technology, funded by KT [Project No. G01230605, Development of Task-oriented Persona-based Dialogue Generation Combining Multi-modal Interaction and Knowledge Modeling].
If you find the resources in this repository useful, please cite our work:
@article{lee2024stark,
title={Stark: Social Long-Term Multi-Modal Conversation with Persona Commonsense Knowledge},
author={Lee, Young-Jun and Lee, Dokyong and Youn, Junyoung and Oh, Kyeongjin and Ko, Byungsoo and Hyeon, Jonghwan and Choi, Ho-Jin},
journal={arXiv preprint arXiv:2407.03958},
year={2024}
}
Base model
meta-llama/Llama-3.2-11B-Vision-Instruct