Commit e4b3b79 (verified) by emrys-hong · Parent: d575b00

Update README.md

Files changed (1): README.md (+7 −6)
README.md CHANGED
````diff
@@ -64,6 +64,9 @@ from PIL import Image
 
 import torch
 
+task_label = "put carrot in pot" # Change your desired task label
+image: Image.Image = get_from_camera(...)
+
 # Load Emma-X
 vla = AutoModelForVision2Seq.from_pretrained(
     "declare-lab/Emma-X",
@@ -74,14 +77,12 @@ vla = AutoModelForVision2Seq.from_pretrained(
 ).to("cuda:0")
 processor = AutoProcessor.from_pretrained("declare-lab/Emma-X", trust_remote_code=True)
 
-image: Image.Image = get_from_camera(...)
-prompt = "In: What action should the robot take to achieve the instruction\nINSTRUCTION: \n{<Instruction here>}\n\nOut: "
-
-# Predict Action (action is a 7 dimensional vector to control the robot)
+prompt, image = processor.get_prompt(task_label, image)
 inputs = processor(prompt, image).to("cuda:0", dtype=torch.bfloat16)
-action, _ = vla.generate_actions(inputs, do_sample=False, max_new_tokens=512)
-
+# Predict Action (action is a 7 dimensional vector to control the robot)
+action, reasoning = vla.generate_actions(inputs, processor.tokenizer, do_sample=False, max_new_tokens=512)
 print("action", action)
+
 # Execute...
 robot.act(action, ...)
 ```
````
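For reference, the full inference snippet after this commit reads roughly as below. This is a minimal sketch assembled from the diff hunks: the `from_pretrained` keyword arguments between the repo id and `.to("cuda:0")` are elided by the hunk, so `torch_dtype` and `trust_remote_code` here are assumptions inferred from the visible bfloat16 cast and the processor load; `get_from_camera` and `robot` are the README's own placeholders for the user's camera and robot interfaces.

```python
# Minimal sketch of the post-commit usage, assembled from the diff hunks above.
from transformers import AutoModelForVision2Seq, AutoProcessor
from PIL import Image
import torch

task_label = "put carrot in pot"           # Change your desired task label
image: Image.Image = get_from_camera(...)  # placeholder: your camera interface

# Load Emma-X
vla = AutoModelForVision2Seq.from_pretrained(
    "declare-lab/Emma-X",
    torch_dtype=torch.bfloat16,  # assumption: elided in the hunk, inferred from the bfloat16 inputs below
    trust_remote_code=True,      # assumption: matches the processor load below
).to("cuda:0")
processor = AutoProcessor.from_pretrained("declare-lab/Emma-X", trust_remote_code=True)

# New in this commit: the processor builds the instruction prompt from the task label
prompt, image = processor.get_prompt(task_label, image)
inputs = processor(prompt, image).to("cuda:0", dtype=torch.bfloat16)

# Predict action (a 7-dimensional vector to control the robot);
# generate_actions now takes the tokenizer and also returns the reasoning text
action, reasoning = vla.generate_actions(inputs, processor.tokenizer, do_sample=False, max_new_tokens=512)
print("action", action)

# Execute... (placeholder: your robot control interface)
robot.act(action, ...)
```

The key API change in this commit is that the hand-written prompt string is replaced by `processor.get_prompt(task_label, image)`, and `generate_actions` now receives `processor.tokenizer` and returns the model's reasoning alongside the 7-dimensional action.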