"""OpenCUA-7B EXL2 — Standalone Visual Inference (Streaming) |
|
|
Tested On exllamav2 0.3.2, python3.12.9, torch 2.6.0+cu126 |
|
|
- Applies a minimal, safe monkey-patch so ExLlamaV2 knows how to wire the |
|
|
OpenCUA EXL2 architecture (Qwen2.5-style vision tower + Llama-like LM). |
|
|
- Keeps vision RoPE active (DO NOT neutralize positional embeddings). |
|
|
- Chooses a valid 1-D RoPE style if available (LLAMA > HF > default). |
|
|
- Loads model + vision tower, extracts EXL2 image embeddings. |
|
|
- Builds a chat-style prompt with the image alias and user instruction. |
|
|
- Streams tokens using ExLlamaV2DynamicGenerator / DynamicJob.""" |
|
|
|
|
|

MODEL_PATH = r"C:\Users\44741\Desktop\OpenCUA-7B-exl2"
IMAGE_URL = "http://images.cocodataset.org/val2017/000000001584.jpg"
INSTRUCTION = "Describe in detail everything you can see."
MAX_NEW_TOKENS = 600

import sys
import traceback

import torch
import requests
from PIL import Image

from exllamav2.architecture import (
    ExLlamaV2ArchParams,
    RopeStyle,
    layer_keys_llama_norms,
    layer_keys_llama_attn,
    layer_keys_llama_mlp,
    expect_keys_llama,
)
print(" -- Applying OpenCUA architecture monkey-patch for inference...") |
|
|
|
|
|
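
# Keep a reference to the stock ExLlamaV2ArchParams initializer so the patch
# can call it first and only extend behaviour for the OpenCUA arch string.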
_original_arch_init = ExLlamaV2ArchParams.__init__


def _patched_arch_init(self, arch_string, read_config):
    _original_arch_init(self, arch_string, read_config)

    if arch_string == "OpenCUAForConditionalGeneration":
        print(" -- Found OpenCUA architecture, applying keys & RoPE settings...")
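
        # Language model: Llama-style layer/expect keys under the
        # "language_model." prefix, with QKV attention biases enabled.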
        self.lm_prefix = "language_model."
        self.lm.layer_keys += (
            layer_keys_llama_norms + layer_keys_llama_attn + layer_keys_llama_mlp
        )
        self.lm.expect_keys += expect_keys_llama
        self.lm.attention_bias_qkv = True
        self.lm.supports_tp = True
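
        # Vision tower: Qwen2.5-style ViT blocks under the "vision_tower." prefix.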
        self.vt_prefix = "vision_tower."
        read_config["vision_config"].update({"model_type": "qwen2.5"})
        self.vt.keys.update({
            "fused_qkv": ".attn.qkv",
            "attn_o": ".attn.proj",
            "mlp_gate": ".mlp.gate_proj",
            "mlp_up": ".mlp.up_proj",
            "mlp_down": ".mlp.down_proj",
            "norm_1": ".norm1",
            "norm_2": ".norm2",
            "layers": "blocks",
            "patch_conv": "patch_embed.proj",
        })
        self.vt.mlp_gate = True
        self.vt.mlp_act_func = "silu"
        self.vt.norm = "rmsnorm"
        self.vt.mlp_bias = True
        self.vt.attention_bias_qkv = True
        self.vt.attention_bias_o = True
        self.vt.vision_input_norm = False
        self.vt.vision_conv3d = True
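
        # Prefer a 1-D RoPE style when this exllamav2 build exposes one;
        # otherwise keep whatever the default arch init selected.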
        try:
            if hasattr(RopeStyle, "LLAMA"):
                self.vt.rope_style = RopeStyle.LLAMA
            elif hasattr(RopeStyle, "HF"):
                self.vt.rope_style = RopeStyle.HF
        except Exception:
            pass
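
        # Multimodal projector (patch merger), stored under "vision_tower.merger.".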
        self.vt.mlp_merger = True
        self.mmp_prefix = "vision_tower.merger."
        self.mmp.keys.update({
            "mlp_gate": None,
            "mlp_up": "mlp.0",
            "mlp_down": "mlp.2",
            "norm_2": "ln_q",
        })
        self.mmp.mlp_gate = False
        self.mmp.mlp_act_func = "gelu"
        self.mmp.mlp_bias = True
        self.mmp.norm = "layernorm"


ExLlamaV2ArchParams.__init__ = _patched_arch_init
print(" -- Patch applied successfully.")
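
# Import the rest of exllamav2 after the patch is in place; config and model
# construction below will then go through the patched ExLlamaV2ArchParams.__init__.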
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
    ExLlamaV2VisionTower,
)
from exllamav2.generator import (
    ExLlamaV2DynamicGenerator,
    ExLlamaV2Sampler,
    ExLlamaV2DynamicJob,
)


def main():
    try:
        print(" -- Loading model/config...")
        config = ExLlamaV2Config(MODEL_PATH)
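
        # A lazy cache + load_autosplit lets ExLlamaV2 split the weights across
        # the available GPUs automatically.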
        model = ExLlamaV2(config)
        cache = ExLlamaV2Cache(model, lazy=True)
        model.load_autosplit(cache)

        tokenizer = ExLlamaV2Tokenizer(config)
print(" -- Loading vision tower...") |
|
|
vision_tower = ExLlamaV2VisionTower(config) |
|
|
vision_tower.load() |
|
|
try: |
|
|
print(f"[Debug] vt.rope_style = {getattr(vision_tower, 'rope_style', 'n/a')}") |
|
|
except Exception: |
|
|
pass |
|
|
|
|
|
generator = ExLlamaV2DynamicGenerator(model, cache, tokenizer) |
|
|
|
|
|
print(f" -- Downloading test image from: {IMAGE_URL}") |
|
|
image = Image.open(requests.get(IMAGE_URL, stream=True).raw).convert("RGB") |
|
|
|
|
|
print(" -- Processing image and building prompt...") |
|
|
image_embeddings = vision_tower.get_image_embeddings(model, tokenizer, image) |
|
|
|
|
|
|
|
|
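
        # The embeddings object exposes a text alias that stands in for the
        # image tokens when the prompt string is encoded.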
        placeholders = image_embeddings.text_alias
        prompt = (
            f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n{placeholders}\n{INSTRUCTION}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )
print("\n--- Prompt Sent to Model ---") |
|
|
print(prompt.replace(image_embeddings.text_alias, "<image>")) |
|
|
print("----------------------------\n") |
|
|
|
|
|
|
|
|
print("--- Model Output (streaming) ---") |
|
|
gen_settings = ExLlamaV2Sampler.Settings.greedy() |
|
|
|
|
|
|
|
|
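
        # The image embeddings are passed both to encode() (so the alias in the
        # prompt is resolved to embedding token IDs) and to the generation job.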
        input_ids = tokenizer.encode(
            prompt,
            add_bos=True,
            encode_special_tokens=True,
            embeddings=[image_embeddings],
        )

        job = ExLlamaV2DynamicJob(
            input_ids=input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            decode_special_tokens=False,
            gen_settings=gen_settings,
            embeddings=[image_embeddings],
        )

        generator.enqueue(job)

        final_text = []
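
        # Stream results as they arrive; iterate() returns per-job result dicts
        # whose "text" field carries the newly decoded chunk.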
        try:
            while generator.num_remaining_jobs():
                results = generator.iterate()
                for r in results:
                    chunk = r.get("text", "")
                    if chunk:
                        print(chunk, end="", flush=True)
                        final_text.append(chunk)
        finally:
            print("\n\n--- Test Complete ---")

        full_output = "".join(final_text)

    except Exception as e:
        print(f"\nAn error occurred: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    try:
        # Allow TF32 matmuls (a speedup on Ampere+ GPUs, minor precision tradeoff).
        torch.backends.cuda.matmul.allow_tf32 = True
    except Exception:
        pass

    main()