Upload 12 files
- .gitattributes +5 -0
- convert_fastvithd.py +52 -0
- convert_mm_projector.py +52 -0
- export_onnx.py +136 -0
- fastvithd.rknn +3 -0
- librkllmrt.so +3 -0
- mm_projector.rknn +3 -0
- qwen_f16.rkllm +3 -0
- rkllm-convert.py +23 -0
- rkllm_binding.py +510 -0
- run_rknn.py +274 -0
- test.jpg +3 -0
- ztu_somemodelruntime_rknnlite2.py +569 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+fastvithd.rknn filter=lfs diff=lfs merge=lfs -text
+librkllmrt.so filter=lfs diff=lfs merge=lfs -text
+mm_projector.rknn filter=lfs diff=lfs merge=lfs -text
+qwen_f16.rkllm filter=lfs diff=lfs merge=lfs -text
+test.jpg filter=lfs diff=lfs merge=lfs -text
convert_fastvithd.py
ADDED
@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# ztu_somemodelruntime_rknn2: fastvithd

from rknn.api import RKNN
import os
import numpy as np

def main():
    # Create an RKNN instance
    rknn = RKNN(verbose=False)

    # Path to the input ONNX model
    ONNX_MODEL = "fastvithd.onnx"
    # Path for the exported RKNN model
    RKNN_MODEL = "fastvithd.rknn"

    # Configure conversion parameters
    print("--> Config model")
    ret = rknn.config(target_platform="rk3588",
                      dynamic_input=None)
    if ret != 0:
        print('Config model failed!')
        exit(ret)

    # Load the ONNX model
    print("--> Loading model")
    ret = rknn.load_onnx(model=ONNX_MODEL,
                         inputs=['pixel_values'],
                         input_size_list=[[1, 3, 1024, 1024]])
    if ret != 0:
        print('Load model failed!')
        exit(ret)

    # Build the model
    print("--> Building model")
    ret = rknn.build(do_quantization=False)
    if ret != 0:
        print('Build model failed!')
        exit(ret)

    # Export the RKNN model
    print("--> Export RKNN model")
    ret = rknn.export_rknn(RKNN_MODEL)
    if ret != 0:
        print('Export RKNN model failed!')
        exit(ret)

    print(f'Done! The converted RKNN model has been saved to: {RKNN_MODEL}')
    rknn.release()

if __name__ == '__main__':
    main()
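A converted model can be smoke-tested on the board with rknn-toolkit-lite2 before wiring up the full pipeline. This is a minimal sketch, assuming RKNNLite is installed on the RK3588 and fastvithd.rknn is in the working directory; the expected output shape is inferred from the [1, 256, 3072] input that convert_mm_projector.py declares for the next stage.

# On-device smoke test for the converted vision encoder (sketch)
import numpy as np
from rknnlite.api import RKNNLite

rknn_lite = RKNNLite()
if rknn_lite.load_rknn("fastvithd.rknn") != 0:
    raise RuntimeError("load_rknn failed")
if rknn_lite.init_runtime() != 0:  # default core selection
    raise RuntimeError("init_runtime failed")

dummy = np.random.rand(1, 3, 1024, 1024).astype(np.float32)
# The dummy tensor is NCHW, matching the ONNX input layout
outputs = rknn_lite.inference(inputs=[dummy], data_format="nchw")
print("vision encoder output:", outputs[0].shape)  # expected (1, 256, 3072)
rknn_lite.release()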
convert_mm_projector.py
ADDED
@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# ztu_somemodelruntime_rknn2: mm_projector

from rknn.api import RKNN
import os
import numpy as np

def main():
    # Create an RKNN instance
    rknn = RKNN(verbose=False)

    # Path to the input ONNX model
    ONNX_MODEL = "mm_projector.onnx"
    # Path for the exported RKNN model
    RKNN_MODEL = "mm_projector.rknn"

    # Configure conversion parameters
    print("--> Config model")
    ret = rknn.config(target_platform="rk3588",
                      dynamic_input=None)
    if ret != 0:
        print('Config model failed!')
        exit(ret)

    # Load the ONNX model
    print("--> Loading model")
    ret = rknn.load_onnx(model=ONNX_MODEL,
                         inputs=['last_hidden_state'],
                         input_size_list=[[1, 256, 3072]])
    if ret != 0:
        print('Load model failed!')
        exit(ret)

    # Build the model
    print("--> Building model")
    ret = rknn.build(do_quantization=False)
    if ret != 0:
        print('Build model failed!')
        exit(ret)

    # Export the RKNN model
    print("--> Export RKNN model")
    ret = rknn.export_rknn(RKNN_MODEL)
    if ret != 0:
        print('Export RKNN model failed!')
        exit(ret)

    print(f'Done! The converted RKNN model has been saved to: {RKNN_MODEL}')
    rknn.release()

if __name__ == '__main__':
    main()
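Both converters run with do_quantization=False, i.e. FP16 on the NPU, so a quick way to judge conversion fidelity is to compare outputs on identical input. A sketch, assuming you have dumped the host onnxruntime output and the on-board RKNN output to the two hypothetical .npy files named below:

import numpy as np

ref = np.load("mm_projector_onnx_out.npy").ravel()  # host onnxruntime output (hypothetical dump)
out = np.load("mm_projector_rknn_out.npy").ravel()  # board RKNNLite output (hypothetical dump)

# Cosine similarity near 1.0 indicates the FP16 NPU conversion is faithful
cos = float(np.dot(ref, out) / (np.linalg.norm(ref) * np.linalg.norm(out)))
print(f"cosine similarity: {cos:.5f}")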
export_onnx.py
ADDED
@@ -0,0 +1,136 @@
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2025 Apple Inc. All Rights Reserved.
#
import os
import json
import copy
import argparse

import torch

from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import get_model_name_from_path


def export(args):
    # Load model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path,
                                                                           args.model_base,
                                                                           model_name,
                                                                           device="cpu")

    # Save extra metadata that is not saved during LLaVA training,
    # required by HF for auto-loading the model and for mlx-vlm preprocessing

    # Save image processing config
    setattr(image_processor, "processor_class", "LlavaProcessor")
    output_path = os.path.join(model_path, "preprocessor_config.json")
    image_processor.to_json_file(output_path)

    # Create processor config
    processor_config = dict()
    processor_config["image_token"] = "<image>"
    processor_config["num_additional_image_tokens"] = 0
    processor_config["processor_class"] = "LlavaProcessor"
    processor_config["patch_size"] = 64
    output_path = os.path.join(model_path, "processor_config.json")
    json.dump(processor_config, open(output_path, "w"), indent=2)

    # Modify tokenizer to include the <image> special token.
    tokenizer_config_path = os.path.join(model_path, "tokenizer_config.json")
    tokenizer_config = json.load(open(tokenizer_config_path, 'r'))
    token_ids = list()
    image_token_is_present = False
    for k, v in tokenizer_config['added_tokens_decoder'].items():
        token_ids.append(int(k))
        if v["content"] == "<image>":
            image_token_is_present = True
            token_ids.pop()

    # Append only if <image> token is not present
    if not image_token_is_present:
        tokenizer_config['added_tokens_decoder'][f'{max(token_ids) + 1}'] = copy.deepcopy(
            tokenizer_config['added_tokens_decoder'][f'{token_ids[0]}'])
        tokenizer_config['added_tokens_decoder'][f'{max(token_ids) + 1}']["content"] = "<image>"
        json.dump(tokenizer_config, open(tokenizer_config_path, 'w'), indent=2)

    # Modify config to contain the token id for <image>
    config_path = os.path.join(model_path, "config.json")
    model_config = json.load(open(config_path, 'r'))
    model_config["image_token_index"] = max(token_ids) + 1
    json.dump(model_config, open(config_path, 'w'), indent=2)

    # Export the vision encoder to ONNX
    image_res = image_processor.to_dict()['size']['shortest_edge']
    dummy_vision_input = torch.rand(1, 3, image_res, image_res).float()  # Dummy input tensor

    vision_model = model.get_vision_tower()
    # Ensure model is on CPU, in float precision, and in evaluation mode for ONNX export
    vision_model = vision_model.cpu().float().eval()

    onnx_vision_model_path = os.path.join(model_path, "fastvithd.onnx")

    print(f"Exporting vision model to {onnx_vision_model_path}...")
    torch.onnx.export(
        vision_model,
        dummy_vision_input,  # Pass the dummy input tensor
        onnx_vision_model_path,
        input_names=['pixel_values'],        # name of the input node in the ONNX graph
        output_names=['last_hidden_state'],  # name of the output node in the ONNX graph
        # dynamic_axes={
        #     'pixel_values': {0: 'batch_size'},      # dim 0 of 'pixel_values' is a dynamic batch size
        #     'last_hidden_state': {0: 'batch_size'}  # dim 0 of 'last_hidden_state' is a dynamic batch size
        # },
        opset_version=17,         # ONNX opset version
        export_params=True,       # store the trained parameter weights inside the model file
        do_constant_folding=True  # apply constant-folding optimization
    )
    print(f"Vision model ONNX export complete: {onnx_vision_model_path}")

    # Generate dummy input for mm_projector by passing dummy_vision_input through vision_model.
    # This ensures the mm_projector receives input with the correct shape and characteristics.
    with torch.no_grad():
        dummy_mm_projector_input = vision_model(dummy_vision_input)

    # Ensure the input is on CPU and in float32 precision for the projector
    dummy_mm_projector_input = dummy_mm_projector_input.cpu().float()

    # Export the mm_projector to ONNX.
    # model.get_model() gives the underlying base model (e.g., LlavaLlamaModel)
    # which contains the mm_projector attribute.
    mm_projector = model.get_model().mm_projector
    mm_projector = mm_projector.cpu().float().eval()

    onnx_mm_projector_path = os.path.join(model_path, "mm_projector.onnx")

    print(f"Exporting mm_projector to {onnx_mm_projector_path}...")
    torch.onnx.export(
        mm_projector,
        dummy_mm_projector_input,
        onnx_mm_projector_path,
        input_names=['last_hidden_state'],
        output_names=['projected_image_features'],
        opset_version=17,
        export_params=True,
        do_constant_folding=True
    )
    print(f"mm_projector ONNX export complete: {onnx_mm_projector_path}")

    # Removed CoreML-specific code and intermediate .pt file handling.
    # No need for os.remove(pt_name) as pt_name is no longer created.


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, required=True)
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--conv-mode", type=str, default="qwen_2")

    args = parser.parse_args()

    export(args)
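After exporting, the two graphs can be chained with onnxruntime as a shape sanity check before any RKNN conversion. A sketch, assuming onnxruntime is installed and the exported files are in the current directory; the (1, 256, 3072) expectation follows from the input_size_list values in the convert scripts, and the projector's last dimension should equal the LLM hidden size.

import numpy as np
import onnxruntime

vision = onnxruntime.InferenceSession("fastvithd.onnx")
projector = onnxruntime.InferenceSession("mm_projector.onnx")

pixels = np.random.rand(1, 3, 1024, 1024).astype(np.float32)
feats = vision.run(None, {"pixel_values": pixels})[0]
print("vision output:", feats.shape)      # expected (1, 256, 3072)

embeds = projector.run(None, {"last_hidden_state": feats})[0]
print("projector output:", embeds.shape)  # (1, 256, <LLM hidden size>)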
fastvithd.rknn
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3d65f07758bd5fd610c76d28aae1d07a87bc65de0687f4fee5ef5c5e0f61d52a
size 372732105
librkllmrt.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f6a9c2de93cf94bb524eb071c27190ad4c83401e01b562534f265dff4cb40da2
size 6710712
mm_projector.rknn
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d3f6d9b06589c9aa7b70d28ef22703579d5d3c07c53e1b6ee72be85ac4ae7ee5
size 14272722
qwen_f16.rkllm
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e13cc4849405b7338b5d21cef0f50a1776ea7c21594685e52665837cfec123c
size 3580141646
rkllm-convert.py
ADDED
@@ -0,0 +1,23 @@
from rkllm.api import RKLLM

modelpath = '.'
llm = RKLLM()

ret = llm.load_huggingface(model=modelpath, model_lora=None, device='cpu')
if ret != 0:
    print('Load model failed!')
    exit(ret)

qparams = None
ret = llm.build(do_quantization=False, optimization_level=1, quantized_dtype='w8a8_g128',
                quantized_algorithm='normal', target_platform='rk3588', num_npu_core=3, extra_qparams=qparams, dataset='calibration_dataset.json')

if ret != 0:
    print('Build model failed!')
    exit(ret)

# Export rkllm model
ret = llm.export_rkllm("./qwen_f16.rkllm")
if ret != 0:
    print('Export model failed!')
    exit(ret)
rkllm_binding.py
ADDED
@@ -0,0 +1,510 @@
import ctypes
import enum
import os

# Define constants from the header
CPU0 = (1 << 0)  # 0x01
CPU1 = (1 << 1)  # 0x02
CPU2 = (1 << 2)  # 0x04
CPU3 = (1 << 3)  # 0x08
CPU4 = (1 << 4)  # 0x10
CPU5 = (1 << 5)  # 0x20
CPU6 = (1 << 6)  # 0x40
CPU7 = (1 << 7)  # 0x80

# --- Enums ---
class LLMCallState(enum.IntEnum):
    RKLLM_RUN_NORMAL = 0
    RKLLM_RUN_WAITING = 1
    RKLLM_RUN_FINISH = 2
    RKLLM_RUN_ERROR = 3

class RKLLMInputType(enum.IntEnum):
    RKLLM_INPUT_PROMPT = 0
    RKLLM_INPUT_TOKEN = 1
    RKLLM_INPUT_EMBED = 2
    RKLLM_INPUT_MULTIMODAL = 3

class RKLLMInferMode(enum.IntEnum):
    RKLLM_INFER_GENERATE = 0
    RKLLM_INFER_GET_LAST_HIDDEN_LAYER = 1
    RKLLM_INFER_GET_LOGITS = 2

# --- Structures ---
class RKLLMExtendParam(ctypes.Structure):
    _fields_ = [
        ("base_domain_id", ctypes.c_int32),
        ("embed_flash", ctypes.c_int8),
        ("enabled_cpus_num", ctypes.c_int8),
        ("enabled_cpus_mask", ctypes.c_uint32),
        ("reserved", ctypes.c_uint8 * 106)
    ]

class RKLLMParam(ctypes.Structure):
    _fields_ = [
        ("model_path", ctypes.c_char_p),
        ("max_context_len", ctypes.c_int32),
        ("max_new_tokens", ctypes.c_int32),
        ("top_k", ctypes.c_int32),
        ("n_keep", ctypes.c_int32),
        ("top_p", ctypes.c_float),
        ("temperature", ctypes.c_float),
        ("repeat_penalty", ctypes.c_float),
        ("frequency_penalty", ctypes.c_float),
        ("presence_penalty", ctypes.c_float),  # Note: This was missing in the provided text but is in typical LLM params
        ("mirostat", ctypes.c_int32),
        ("mirostat_tau", ctypes.c_float),
        ("mirostat_eta", ctypes.c_float),
        ("skip_special_token", ctypes.c_bool),
        ("is_async", ctypes.c_bool),
        ("img_start", ctypes.c_char_p),
        ("img_end", ctypes.c_char_p),
        ("img_content", ctypes.c_char_p),  # This seems like it should be more structured for actual image data
        ("extend_param", RKLLMExtendParam)
    ]

class RKLLMLoraAdapter(ctypes.Structure):
    _fields_ = [
        ("lora_adapter_path", ctypes.c_char_p),
        ("lora_adapter_name", ctypes.c_char_p),
        ("scale", ctypes.c_float)
    ]

class RKLLMEmbedInput(ctypes.Structure):
    _fields_ = [
        ("embed", ctypes.POINTER(ctypes.c_float)),
        ("n_tokens", ctypes.c_size_t)
    ]

class RKLLMTokenInput(ctypes.Structure):
    _fields_ = [
        ("input_ids", ctypes.POINTER(ctypes.c_int32)),
        ("n_tokens", ctypes.c_size_t)
    ]

class RKLLMMultiModelInput(ctypes.Structure):
    _fields_ = [
        ("prompt", ctypes.c_char_p),
        ("image_embed", ctypes.POINTER(ctypes.c_float)),
        ("n_image_tokens", ctypes.c_size_t),
        ("n_image", ctypes.c_size_t),
        ("image_width", ctypes.c_size_t),
        ("image_height", ctypes.c_size_t)
    ]

class _RKLLMInputUnion(ctypes.Union):
    _fields_ = [
        ("prompt_input", ctypes.c_char_p),
        ("embed_input", RKLLMEmbedInput),
        ("token_input", RKLLMTokenInput),
        ("multimodal_input", RKLLMMultiModelInput)
    ]

class RKLLMInput(ctypes.Structure):
    _fields_ = [
        ("input_type", ctypes.c_int),  # Enum will be passed as int, changed RKLLMInputType to ctypes.c_int
        ("_union_data", _RKLLMInputUnion)
    ]
    # Properties to make accessing union members easier
    @property
    def prompt_input(self):
        if self.input_type == RKLLMInputType.RKLLM_INPUT_PROMPT:
            return self._union_data.prompt_input
        raise AttributeError("Not a prompt input")
    @prompt_input.setter
    def prompt_input(self, value):
        if self.input_type == RKLLMInputType.RKLLM_INPUT_PROMPT:
            self._union_data.prompt_input = value
        else:
            raise AttributeError("Not a prompt input")

    # Similar properties can be added for embed_input, token_input, multimodal_input

class RKLLMLoraParam(ctypes.Structure):  # For inference
    _fields_ = [
        ("lora_adapter_name", ctypes.c_char_p)
    ]

class RKLLMPromptCacheParam(ctypes.Structure):  # For inference
    _fields_ = [
        ("save_prompt_cache", ctypes.c_int),  # bool-like
        ("prompt_cache_path", ctypes.c_char_p)
    ]

class RKLLMInferParam(ctypes.Structure):
    _fields_ = [
        ("mode", ctypes.c_int),  # Enum will be passed as int, changed RKLLMInferMode to ctypes.c_int
        ("lora_params", ctypes.POINTER(RKLLMLoraParam)),
        ("prompt_cache_params", ctypes.POINTER(RKLLMPromptCacheParam)),
        ("keep_history", ctypes.c_int)  # bool-like
    ]

class RKLLMResultLastHiddenLayer(ctypes.Structure):
    _fields_ = [
        ("hidden_states", ctypes.POINTER(ctypes.c_float)),
        ("embd_size", ctypes.c_int),
        ("num_tokens", ctypes.c_int)
    ]

class RKLLMResultLogits(ctypes.Structure):
    _fields_ = [
        ("logits", ctypes.POINTER(ctypes.c_float)),
        ("vocab_size", ctypes.c_int),
        ("num_tokens", ctypes.c_int)
    ]

class RKLLMResult(ctypes.Structure):
    _fields_ = [
        ("text", ctypes.c_char_p),
        ("token_id", ctypes.c_int32),
        ("last_hidden_layer", RKLLMResultLastHiddenLayer),
        ("logits", RKLLMResultLogits)
    ]

# --- Typedefs ---
LLMHandle = ctypes.c_void_p

# --- Callback Function Type ---
LLMResultCallback = ctypes.CFUNCTYPE(
    None,  # return type: void
    ctypes.POINTER(RKLLMResult),
    ctypes.c_void_p,  # userdata
    ctypes.c_int  # enum, will be passed as int. Changed LLMCallState to ctypes.c_int
)


class RKLLMRuntime:
    def __init__(self, library_path="./librkllmrt.so"):
        try:
            self.lib = ctypes.CDLL(library_path)
        except OSError as e:
            raise OSError(f"Failed to load RKLLM library from {library_path}. "
                          f"Ensure it's in your LD_LIBRARY_PATH or provide the full path. Error: {e}")
        self._setup_functions()
        self.llm_handle = LLMHandle()
        self._c_callback = None  # To keep the callback object alive

    def _setup_functions(self):
        # RKLLMParam rkllm_createDefaultParam();
        self.lib.rkllm_createDefaultParam.restype = RKLLMParam
        self.lib.rkllm_createDefaultParam.argtypes = []

        # int rkllm_init(LLMHandle* handle, RKLLMParam* param, LLMResultCallback callback);
        self.lib.rkllm_init.restype = ctypes.c_int
        self.lib.rkllm_init.argtypes = [
            ctypes.POINTER(LLMHandle),
            ctypes.POINTER(RKLLMParam),
            LLMResultCallback
        ]

        # int rkllm_load_lora(LLMHandle handle, RKLLMLoraAdapter* lora_adapter);
        self.lib.rkllm_load_lora.restype = ctypes.c_int
        self.lib.rkllm_load_lora.argtypes = [LLMHandle, ctypes.POINTER(RKLLMLoraAdapter)]

        # int rkllm_load_prompt_cache(LLMHandle handle, const char* prompt_cache_path);
        self.lib.rkllm_load_prompt_cache.restype = ctypes.c_int
        self.lib.rkllm_load_prompt_cache.argtypes = [LLMHandle, ctypes.c_char_p]

        # int rkllm_release_prompt_cache(LLMHandle handle);
        self.lib.rkllm_release_prompt_cache.restype = ctypes.c_int
        self.lib.rkllm_release_prompt_cache.argtypes = [LLMHandle]

        # int rkllm_destroy(LLMHandle handle);
        self.lib.rkllm_destroy.restype = ctypes.c_int
        self.lib.rkllm_destroy.argtypes = [LLMHandle]

        # int rkllm_run(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata);
        self.lib.rkllm_run.restype = ctypes.c_int
        self.lib.rkllm_run.argtypes = [
            LLMHandle,
            ctypes.POINTER(RKLLMInput),
            ctypes.POINTER(RKLLMInferParam),
            ctypes.c_void_p  # userdata
        ]

        # int rkllm_run_async(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata);
        # Assuming async also takes userdata for the callback context
        self.lib.rkllm_run_async.restype = ctypes.c_int
        self.lib.rkllm_run_async.argtypes = [
            LLMHandle,
            ctypes.POINTER(RKLLMInput),
            ctypes.POINTER(RKLLMInferParam),
            ctypes.c_void_p  # userdata
        ]

        # int rkllm_abort(LLMHandle handle);
        self.lib.rkllm_abort.restype = ctypes.c_int
        self.lib.rkllm_abort.argtypes = [LLMHandle]

        # int rkllm_is_running(LLMHandle handle);
        self.lib.rkllm_is_running.restype = ctypes.c_int  # 0 if running, non-zero otherwise
        self.lib.rkllm_is_running.argtypes = [LLMHandle]

        # int rkllm_clear_kv_cache(LLMHandle handle, int keep_system_prompt);
        self.lib.rkllm_clear_kv_cache.restype = ctypes.c_int
        self.lib.rkllm_clear_kv_cache.argtypes = [LLMHandle, ctypes.c_int]

        # int rkllm_set_chat_template(LLMHandle handle, const char* system_prompt, const char* prompt_prefix, const char* prompt_postfix);
        self.lib.rkllm_set_chat_template.restype = ctypes.c_int
        self.lib.rkllm_set_chat_template.argtypes = [
            LLMHandle,
            ctypes.c_char_p,
            ctypes.c_char_p,
            ctypes.c_char_p
        ]

    def create_default_param(self) -> RKLLMParam:
        """Creates a default RKLLMParam structure."""
        return self.lib.rkllm_createDefaultParam()

    def init(self, param: RKLLMParam, callback_func) -> int:
        """
        Initializes the LLM.
        :param param: RKLLMParam structure.
        :param callback_func: A Python function that matches the signature:
                              def my_callback(result_ptr, userdata_ptr, state_enum):
                                  result = result_ptr.contents  # RKLLMResult
                                  # Process result
                                  # userdata can be retrieved if passed during run, or ignored
                                  # state = LLMCallState(state_enum)
        :return: 0 for success, non-zero for failure.
        """
        if not callable(callback_func):
            raise ValueError("callback_func must be a callable Python function.")

        # Keep a reference to the ctypes callback object to prevent it from being garbage collected
        self._c_callback = LLMResultCallback(callback_func)

        ret = self.lib.rkllm_init(ctypes.byref(self.llm_handle), ctypes.byref(param), self._c_callback)
        if ret != 0:
            raise RuntimeError(f"rkllm_init failed with error code {ret}")
        return ret

    def load_lora(self, lora_adapter: RKLLMLoraAdapter) -> int:
        """Loads a Lora adapter."""
        ret = self.lib.rkllm_load_lora(self.llm_handle, ctypes.byref(lora_adapter))
        if ret != 0:
            raise RuntimeError(f"rkllm_load_lora failed with error code {ret}")
        return ret

    def load_prompt_cache(self, prompt_cache_path: str) -> int:
        """Loads a prompt cache from a file."""
        c_path = prompt_cache_path.encode('utf-8')
        ret = self.lib.rkllm_load_prompt_cache(self.llm_handle, c_path)
        if ret != 0:
            raise RuntimeError(f"rkllm_load_prompt_cache failed for {prompt_cache_path} with error code {ret}")
        return ret

    def release_prompt_cache(self) -> int:
        """Releases the prompt cache from memory."""
        ret = self.lib.rkllm_release_prompt_cache(self.llm_handle)
        if ret != 0:
            raise RuntimeError(f"rkllm_release_prompt_cache failed with error code {ret}")
        return ret

    def destroy(self) -> int:
        """Destroys the LLM instance and releases resources."""
        if self.llm_handle and self.llm_handle.value:  # Check if handle is not NULL
            ret = self.lib.rkllm_destroy(self.llm_handle)
            self.llm_handle = LLMHandle()  # Reset handle
            if ret != 0:
                # Don't raise here as it might be called in __del__
                print(f"Warning: rkllm_destroy failed with error code {ret}")
            return ret
        return 0  # Already destroyed or not initialized

    def run(self, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata=None) -> int:
        """Runs an LLM inference task synchronously."""
        # userdata can be a ctypes.py_object if you want to pass Python objects,
        # then cast to c_void_p. Or simply None.
        c_userdata = ctypes.cast(ctypes.py_object(userdata), ctypes.c_void_p) if userdata is not None else None
        ret = self.lib.rkllm_run(self.llm_handle, ctypes.byref(rkllm_input), ctypes.byref(rkllm_infer_params), c_userdata)
        if ret != 0:
            raise RuntimeError(f"rkllm_run failed with error code {ret}")
        return ret

    def run_async(self, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata=None) -> int:
        """Runs an LLM inference task asynchronously."""
        c_userdata = ctypes.cast(ctypes.py_object(userdata), ctypes.c_void_p) if userdata is not None else None
        ret = self.lib.rkllm_run_async(self.llm_handle, ctypes.byref(rkllm_input), ctypes.byref(rkllm_infer_params), c_userdata)
        if ret != 0:
            raise RuntimeError(f"rkllm_run_async failed with error code {ret}")
        return ret

    def abort(self) -> int:
        """Aborts an ongoing LLM task."""
        ret = self.lib.rkllm_abort(self.llm_handle)
        if ret != 0:
            raise RuntimeError(f"rkllm_abort failed with error code {ret}")
        return ret

    def is_running(self) -> bool:
        """Checks if an LLM task is currently running. Returns True if running."""
        # The C API returns 0 if running, non-zero otherwise.
        # This is a bit counter-intuitive for a boolean "is_running".
        return self.lib.rkllm_is_running(self.llm_handle) == 0

    def clear_kv_cache(self, keep_system_prompt: bool) -> int:
        """Clears the key-value cache."""
        ret = self.lib.rkllm_clear_kv_cache(self.llm_handle, ctypes.c_int(1 if keep_system_prompt else 0))
        if ret != 0:
            raise RuntimeError(f"rkllm_clear_kv_cache failed with error code {ret}")
        return ret

    def set_chat_template(self, system_prompt: str, prompt_prefix: str, prompt_postfix: str) -> int:
        """Sets the chat template for the LLM."""
        c_system = system_prompt.encode('utf-8') if system_prompt else None
        c_prefix = prompt_prefix.encode('utf-8') if prompt_prefix else None
        c_postfix = prompt_postfix.encode('utf-8') if prompt_postfix else None

        ret = self.lib.rkllm_set_chat_template(self.llm_handle, c_system, c_prefix, c_postfix)
        if ret != 0:
            raise RuntimeError(f"rkllm_set_chat_template failed with error code {ret}")
        return ret

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.destroy()

    def __del__(self):
        self.destroy()  # Ensure resources are freed if object is garbage collected

# --- Example Usage (Illustrative) ---
if __name__ == "__main__":
    # This is a placeholder for how you might use it.
    # You'll need a valid .rkllm model and librkllmrt.so in your path.

    # Global list to store results from callback for demonstration
    results_buffer = []

    def my_python_callback(result_ptr, userdata_ptr, state_enum):
        """
        Callback function to be called by the C library.
        """
        global results_buffer
        state = LLMCallState(state_enum)
        result = result_ptr.contents

        current_text = ""
        if result.text:  # Check if the char_p is not NULL
            current_text = result.text.decode('utf-8', errors='ignore')

        print(f"Callback: State={state.name}, TokenID={result.token_id}, Text='{current_text}'")
        results_buffer.append(current_text)

        if state == LLMCallState.RKLLM_RUN_FINISH:
            print("Inference finished.")
        elif state == LLMCallState.RKLLM_RUN_ERROR:
            print("Inference error.")

        # Example: Accessing logits if available (and if mode was set to get logits)
        # if result.logits.logits and result.logits.vocab_size > 0:
        #     print(f"  Logits (first 5 of vocab_size {result.logits.vocab_size}):")
        #     for i in range(min(5, result.logits.vocab_size)):
        #         print(f"    {result.logits.logits[i]:.4f}", end=" ")
        #     print()


    # --- Attempt to use the wrapper ---
    try:
        print("Initializing RKLLMRuntime...")
        # Adjust library_path if librkllmrt.so is not in default search paths
        # e.g., library_path="./path/to/librkllmrt.so"
        rk_llm = RKLLMRuntime()

        print("Creating default parameters...")
        params = rk_llm.create_default_param()

        # --- Configure parameters ---
        # THIS IS CRITICAL: model_path must point to an actual .rkllm file.
        # For this example to run, you need a model file.
        # Let's assume a dummy path for now; this will fail at init if not valid.
        model_file = "dummy_model.rkllm"
        if not os.path.exists(model_file):
            print(f"Warning: Model file '{model_file}' does not exist. Init will likely fail.")
            # Create a dummy file for the example to proceed further, though init will still fail
            # with a real library unless it's a valid model.
            with open(model_file, "w") as f:
                f.write("dummy content")

        params.model_path = model_file.encode('utf-8')
        params.max_context_len = 512
        params.max_new_tokens = 128
        params.top_k = 1  # Greedy
        params.temperature = 0.7
        params.repeat_penalty = 1.1
        # ... set other params as needed

        print(f"Initializing LLM with model: {params.model_path.decode()}...")
        # This will likely fail if dummy_model.rkllm is not a valid model recognized by the library
        try:
            rk_llm.init(params, my_python_callback)
            print("LLM Initialized.")
        except RuntimeError as e:
            print(f"Error during LLM initialization: {e}")
            print("This is expected if 'dummy_model.rkllm' is not a valid model.")
            print("Replace 'dummy_model.rkllm' with a real model path to test further.")
            exit()


        # --- Prepare input ---
        print("Preparing input...")
        rk_input = RKLLMInput()
        rk_input.input_type = RKLLMInputType.RKLLM_INPUT_PROMPT

        prompt_text = "Translate the following English text to French: 'Hello, world!'"
        c_prompt = prompt_text.encode('utf-8')
        rk_input._union_data.prompt_input = c_prompt  # Accessing union member directly

        # --- Prepare inference parameters ---
        print("Preparing inference parameters...")
        infer_params = RKLLMInferParam()
        infer_params.mode = RKLLMInferMode.RKLLM_INFER_GENERATE
        infer_params.keep_history = 1  # True
        # infer_params.lora_params = None  # or set up RKLLMLoraParam if using LoRA
        # infer_params.prompt_cache_params = None  # or set up RKLLMPromptCacheParam

        # --- Run inference ---
        print(f"Running inference with prompt: '{prompt_text}'")
        results_buffer.clear()
        try:
            rk_llm.run(rk_input, infer_params)  # Userdata is None by default
            print("\n--- Full Response ---")
            print("".join(results_buffer))
            print("---------------------\n")
        except RuntimeError as e:
            print(f"Error during LLM run: {e}")


        # --- Example: Set chat template (if model supports it) ---
        # print("Setting chat template...")
        # try:
        #     rk_llm.set_chat_template("You are a helpful assistant.", "<user>: ", "<assistant>: ")
        #     print("Chat template set.")
        # except RuntimeError as e:
        #     print(f"Error setting chat template: {e}")

        # --- Example: Clear KV Cache ---
        # print("Clearing KV cache (keeping system prompt if any)...")
        # try:
        #     rk_llm.clear_kv_cache(keep_system_prompt=True)
        #     print("KV cache cleared.")
        # except RuntimeError as e:
        #     print(f"Error clearing KV cache: {e}")

    except OSError as e:
        print(f"OSError: {e}. Could not load the RKLLM library.")
        print("Please ensure 'librkllmrt.so' is in your LD_LIBRARY_PATH or provide the full path.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        if 'rk_llm' in locals() and rk_llm.llm_handle and rk_llm.llm_handle.value:
            print("Destroying LLM instance...")
            rk_llm.destroy()
            print("LLM instance destroyed.")
        # Guard against NameError when library loading failed before model_file was set
        if 'model_file' in locals() and model_file == "dummy_model.rkllm" and os.path.exists(model_file):
            os.remove(model_file)  # Clean up dummy file

    print("Example finished.")
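The binding only defines a convenience property for the prompt branch of the union; the other branches are set through _union_data directly, as run_rknn.py does for the multimodal branch. Below is a sketch of the TOKEN branch with placeholder token IDs, assuming an already-initialized RKLLMRuntime named rt; note that the struct stores a raw pointer into the numpy buffer, so the array must stay referenced until run() returns.

import ctypes
import numpy as np
from rkllm_binding import (RKLLMInput, RKLLMInputType,
                           RKLLMInferParam, RKLLMInferMode)

ids = np.array([9707, 11, 1879, 0], dtype=np.int32)  # placeholder token IDs

rk_input = RKLLMInput()
rk_input.input_type = RKLLMInputType.RKLLM_INPUT_TOKEN
rk_input._union_data.token_input.input_ids = ids.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
rk_input._union_data.token_input.n_tokens = ids.size

infer = RKLLMInferParam()
infer.mode = RKLLMInferMode.RKLLM_INFER_GENERATE.value

# rt.run(rk_input, infer)  # keep `ids` alive for the duration of the call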
run_rknn.py
ADDED
@@ -0,0 +1,274 @@
import faulthandler
faulthandler.enable()
import os
import time
import numpy as np
from rkllm_binding import *
import ztu_somemodelruntime_rknnlite2 as ort
import signal
import cv2
import ctypes

# --- Configuration ---
# These paths should point to the directory containing all model files
# or be absolute paths.
MODEL_DIR = "."  # Assuming models are in the current directory or provide a specific path
LLM_MODEL_NAME = "qwen_f16.rkllm"
VISION_ENCODER_ONNX_NAME = "fastvithd.onnx"
MM_PROJECTOR_ONNX_NAME = "mm_projector.onnx"
PREPROCESSOR_CONFIG_NAME = "preprocessor_config.json"  # Generated by export_onnx.py

LLM_MODEL_PATH = os.path.join(MODEL_DIR, LLM_MODEL_NAME)
VISION_ENCODER_PATH = os.path.join(MODEL_DIR, VISION_ENCODER_ONNX_NAME)
MM_PROJECTOR_PATH = os.path.join(MODEL_DIR, MM_PROJECTOR_ONNX_NAME)
PREPROCESSOR_CONFIG_PATH = os.path.join(MODEL_DIR, PREPROCESSOR_CONFIG_NAME)

IMAGE_PATH = "test.jpg"  # Replace with your test image
# user_prompt = "Describe this image in detail."
user_prompt = "仔细描述一下这张图片。"  # Chinese for "Describe this image in detail."

# Global RKLLMRuntime instance
rk_runtime = None

# Exit on Ctrl-C
def signal_handler(signal, frame):
    print("Ctrl-C pressed, exiting...")
    global rk_runtime
    if rk_runtime:
        try:
            print("Attempting to abort RKLLM task...")
            rk_runtime.abort()
            print("RKLLM task aborted.")
        except RuntimeError as e:
            print(f"Note: RKLLM abort failed or task was not running: {e}")
        except Exception as e:
            print(f"Unexpected error during RKLLM abort in signal handler: {e}")

        try:
            print("Attempting to destroy RKLLM instance...")
            rk_runtime.destroy()
            print("RKLLM instance destroyed via signal handler.")
        except RuntimeError as e:
            print(f"Error during RKLLM destroy in signal handler: {e}")
        except Exception as e:  # Catch any other unexpected errors
            print(f"Unexpected error during RKLLM destroy in signal handler: {e}")
    exit(0)

signal.signal(signal.SIGINT, signal_handler)

# Set RKLLM log level if desired
os.environ["RKLLM_LOG_LEVEL"] = "1"

inference_count = 0
inference_start_time = 0
first_token_received = False

def result_callback(result_ptr, userdata, state_enum):
    global inference_start_time, inference_count, first_token_received
    state = LLMCallState(state_enum)  # Convert int to enum
    if result_ptr is None:
        return
    result = result_ptr.contents  # Dereference the pointer

    if state == LLMCallState.RKLLM_RUN_NORMAL:
        if not first_token_received:
            first_token_time = time.time()
            print(f"\nTime to first token: {first_token_time - inference_start_time:.2f} seconds")
            first_token_received = True

        current_text = ""
        if result.text:  # Check if char_p is not NULL
            current_text = result.text.decode('utf-8', errors='ignore')
        print(current_text, end="", flush=True)
        inference_count += 1
    elif state == LLMCallState.RKLLM_RUN_FINISH:
        print("\n\n(finished)")
    elif state == LLMCallState.RKLLM_RUN_ERROR:
        print("\nError occurred during LLM call")
    # Add other states if needed, e.g., RKLLM_RUN_WAITING

def load_and_preprocess_image(image_path, config_path):
    img_size = 1024
    image_mean = [0.0, 0.0, 0.0]
    image_std = [1.0, 1.0, 1.0]

    print(f"Target image size from config: {img_size}x{img_size}")
    print(f"Using image_mean: {image_mean}, image_std: {image_std}")

    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Image not found: {image_path}")

    # Compute the scale factor, preserving aspect ratio
    h, w = img.shape[:2]
    scale = min(img_size / w, img_size / h)
    new_w, new_h = int(w * scale), int(h * scale)

    # Resize while keeping the aspect ratio
    img_resized = cv2.resize(img, (new_w, new_h))

    # Create a black canvas of the target size
    img_padded = np.zeros((img_size, img_size, 3), dtype=np.uint8)

    # Paste the resized image at the center
    y_offset = (img_size - new_h) // 2
    x_offset = (img_size - new_w) // 2
    img_padded[y_offset:y_offset+new_h, x_offset:x_offset+new_w] = img_resized

    img_rgb = cv2.cvtColor(img_padded, cv2.COLOR_BGR2RGB)
    img_fp32 = img_rgb.astype(np.float32)

    # Normalize
    img_normalized = (img_fp32 / 255.0 - image_mean) / image_std

    # Transpose to NCHW format
    img_nchw = img_normalized.transpose(2, 0, 1)  # HWC to CHW
    img_batch = img_nchw[np.newaxis, :, :, :]  # Add batch dimension -> NCHW

    return img_batch.astype(np.float32), img_size

def main():
    global rk_runtime, inference_start_time, inference_count, first_token_received, user_prompt

    # --- 1. Initialize ONNX Runtime for Vision Models ---
    print("Loading ONNX vision encoder model...")
    vision_session = ort.InferenceSession(VISION_ENCODER_PATH)
    vision_input_name = vision_session.get_inputs()[0].name
    vision_output_name = vision_session.get_outputs()[0].name
    print(f"ONNX vision encoder loaded. Input: '{vision_input_name}', Output: '{vision_output_name}'")

    print("Loading ONNX mm_projector model...")
    mm_projector_session = ort.InferenceSession(MM_PROJECTOR_PATH)
    mm_projector_input_name = mm_projector_session.get_inputs()[0].name
    mm_projector_output_name = mm_projector_session.get_outputs()[0].name
    print(f"ONNX mm_projector loaded. Input: '{mm_projector_input_name}', Output: '{mm_projector_output_name}'")

    # --- 2. Initialize RKLLM ---
    print("Initializing RKLLM...")
    rk_runtime = RKLLMRuntime()

    param = rk_runtime.create_default_param()
    param.model_path = LLM_MODEL_PATH.encode('utf-8')
    param.img_start = "<image>".encode('utf-8')
    param.img_end = "".encode('utf-8')
    param.img_content = "<unk>".encode('utf-8')

    extend_param = RKLLMExtendParam()
    extend_param.base_domain_id = 1
    extend_param.embed_flash = 1
    extend_param.enabled_cpus_num = 8
    extend_param.enabled_cpus_mask = 0xffffffff
    param.extend_param = extend_param

    model_size_llm = os.path.getsize(LLM_MODEL_PATH)
    print(f"Start loading language model (size: {model_size_llm / 1024 / 1024:.2f} MB)")
    start_time_llm_load = time.time()

    try:
        rk_runtime.init(param, result_callback)
    except RuntimeError as e:
        print(f"RKLLM init failed: {e}")
        if rk_runtime:
            try:
                rk_runtime.destroy()
            except Exception as e_destroy:
                print(f"Error destroying RKLLM after init failure: {e_destroy}")
        return

    end_time_llm_load = time.time()
    print(f"Language model loaded in {end_time_llm_load - start_time_llm_load:.2f} seconds")

    # --- 3. Load and Preprocess Image ---
    print(f"Loading and preprocessing image: {IMAGE_PATH}")
    preprocessed_image, original_img_dim = load_and_preprocess_image(IMAGE_PATH, PREPROCESSOR_CONFIG_PATH)
    print(f"Input image shape for ONNX vision model: {preprocessed_image.shape}")

    # --- 4. Vision Encoder Inference (ONNX) ---
    start_time_vision = time.time()
    vision_outputs = vision_session.run([vision_output_name], {vision_input_name: preprocessed_image})
    image_features_from_vision = vision_outputs[0]
    end_time_vision = time.time()
    print(f"ONNX Vision encoder inference time: {end_time_vision - start_time_vision:.2f} seconds")
    print(f"Vision encoder output shape: {image_features_from_vision.shape}")

    # --- 5. MM Projector Inference (ONNX) ---
    start_time_projector = time.time()
    projector_outputs = mm_projector_session.run([mm_projector_output_name], {mm_projector_input_name: image_features_from_vision})
    projected_image_embeddings_np = projector_outputs[0]
    end_time_projector = time.time()
    print(f"ONNX MM projector inference time: {end_time_projector - start_time_projector:.2f} seconds")
    print(f"Projected image embeddings shape: {projected_image_embeddings_np.shape}")

    # Ensure C-contiguous and float32 for ctypes
    projected_image_embeddings_np = np.ascontiguousarray(projected_image_embeddings_np, dtype=np.float32)

    # --- 6. Prepare Prompt and RKLLMInput ---
    # The prompt should contain the <image> placeholder where the image features will be inserted.
    # prompt = f"""<|im_start|>system
    # You are a helpful assistant.<|im_end|>
    # <|im_start|>user
    # {param.img_start.decode()}
    # {user_prompt}<|im_end|>
    # <|im_start|>assistant
    # """

    # RKLLM now loads its own chat template, so we don't need to include that.
    prompt = f"""{param.img_start.decode()}
{user_prompt}"""

    print(f"\nUsing prompt:\n{prompt}")

    rkllm_input = RKLLMInput()
    rkllm_input.input_type = RKLLMInputType.RKLLM_INPUT_MULTIMODAL

    multimodal_payload = RKLLMMultiModelInput()
    multimodal_payload.prompt = prompt.encode('utf-8')

    # projected_image_embeddings_np has shape (1, num_tokens, hidden_dim)
    num_image_tokens = projected_image_embeddings_np.shape[1]
    # The C API expects a flat pointer to the embedding data.
    embedding_data_flat = projected_image_embeddings_np.flatten()

    multimodal_payload.image_embed = embedding_data_flat.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    multimodal_payload.n_image_tokens = num_image_tokens
    multimodal_payload.n_image = 1  # Number of images processed
    multimodal_payload.image_width = original_img_dim   # Width of the (resized before processing) image
    multimodal_payload.image_height = original_img_dim  # Height of the (resized before processing) image

    rkllm_input._union_data.multimodal_input = multimodal_payload

    # --- 7. Create Inference Parameters ---
    infer_param = RKLLMInferParam()
    infer_param.mode = RKLLMInferMode.RKLLM_INFER_GENERATE.value  # Ensure this is an int for the C API
    # infer_param.keep_history = 1  # Or 0; the default is usually 0 (false) in create_default_param or the C struct.
    #                               # Check rkllm.h or the binding for the default if not setting explicitly.
    #                               # RKLLMInferParam from the binding has keep_history as c_int.

    # --- 8. Run RKLLM Inference ---
    print("Starting RKLLM inference...")
    inference_start_time = time.time()
    inference_count = 0
    first_token_received = False

    try:
        # The RKLLMRuntime.run method takes input and infer_param objects directly.
        rk_runtime.run(rkllm_input, infer_param, None)  # Userdata is None
    except RuntimeError as e:
        print(f"RKLLM run failed: {e}")

    # --- 9. Clean up ---
    # Normal cleanup if not interrupted by Ctrl-C.
    # The signal handler also attempts to destroy the instance.
    if rk_runtime and rk_runtime.llm_handle and rk_runtime.llm_handle.value:
        try:
            rk_runtime.destroy()
            print("RKLLM instance destroyed at script end.")
        except RuntimeError as e:
            print(f"Error during RKLLM destroy at script end: {e}")
        except Exception as e:
            print(f"Unexpected error during RKLLM destroy at script end: {e}")

    print("Script finished.")

if __name__ == "__main__":
    # rk_runtime (global) will be initialized inside main()
    main()
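The letterbox preprocessing above is the step most likely to silently disagree with training-time preprocessing, so it can help to dump the padded image and inspect it. A standalone sketch of the same resize-and-center logic; the output filename is illustrative.

import cv2
import numpy as np

img = cv2.imread("test.jpg")
size = 1024
h, w = img.shape[:2]
scale = min(size / w, size / h)
resized = cv2.resize(img, (int(w * scale), int(h * scale)))

canvas = np.zeros((size, size, 3), dtype=np.uint8)  # black background
y0 = (size - resized.shape[0]) // 2
x0 = (size - resized.shape[1]) // 2
canvas[y0:y0 + resized.shape[0], x0:x0 + resized.shape[1]] = resized

cv2.imwrite("test_letterboxed.jpg", canvas)  # illustrative output path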
test.jpg
ADDED
(binary image stored via Git LFS)
ztu_somemodelruntime_rknnlite2.py
ADDED
|
@@ -0,0 +1,569 @@
# Module-level constants and functions
from rknnlite.api import RKNNLite
import numpy as np
import os
import warnings
import logging
from typing import List, Dict, Union, Optional

try:
    import onnxruntime as ort
    HAS_ORT = True
except ImportError:
    HAS_ORT = False
    warnings.warn("onnxruntime is not installed; only the RKNN backend is available", ImportWarning)

# Configure logging
logger = logging.getLogger("somemodelruntime_rknnlite2")
logger.setLevel(logging.ERROR)  # only log errors by default
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)

# Mapping from ONNX Runtime log levels to Python logging levels
_LOGGING_LEVEL_MAP = {
    0: logging.DEBUG,     # Verbose
    1: logging.INFO,      # Info
    2: logging.WARNING,   # Warning
    3: logging.ERROR,     # Error
    4: logging.CRITICAL,  # Fatal
}

# Check the environment variable for a log-level override
try:
    env_log_level = os.getenv('ZTU_MODELRT_RKNNL2_LOG_LEVEL')
    if env_log_level is not None:
        log_level = int(env_log_level)
        if log_level in _LOGGING_LEVEL_MAP:
            logger.setLevel(_LOGGING_LEVEL_MAP[log_level])
            logger.info(f"Log level set from environment variable: {log_level}")
        else:
            logger.warning(f"Invalid value for ZTU_MODELRT_RKNNL2_LOG_LEVEL: {log_level}, expected an integer between 0 and 4")
except ValueError:
    logger.warning(f"Invalid value for ZTU_MODELRT_RKNNL2_LOG_LEVEL: {env_log_level}, expected an integer between 0 and 4")
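# Illustrative usage (not part of the original module; the script name is a
# placeholder): set the override before the process starts, e.g.
#   $ ZTU_MODELRT_RKNNL2_LOG_LEVEL=0 python your_app.py   # 0 = Verbose
# or adjust it at runtime with set_default_logger_severity() defined below.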

def set_default_logger_severity(level: int) -> None:
    """
    Sets the default logging severity. 0:Verbose, 1:Info, 2:Warning, 3:Error, 4:Fatal

    Args:
        level: log level (0-4)
    """
    if level not in _LOGGING_LEVEL_MAP:
        raise ValueError(f"Invalid log level: {level}, expected an integer between 0 and 4")
    logger.setLevel(_LOGGING_LEVEL_MAP[level])

def set_default_logger_verbosity(level: int) -> None:
    """
    Sets the default logging verbosity level. To activate the verbose log,
    you need to set the default logging severity to 0:Verbose level.

    Args:
        level: log level (0-4)
    """
    set_default_logger_severity(level)

# Mapping from RKNN tensor types to numpy dtypes
RKNN_DTYPE_MAP = {
    0: np.float32,   # RKNN_TENSOR_FLOAT32
    1: np.float16,   # RKNN_TENSOR_FLOAT16
    2: np.int8,      # RKNN_TENSOR_INT8
    3: np.uint8,     # RKNN_TENSOR_UINT8
    4: np.int16,     # RKNN_TENSOR_INT16
    5: np.uint16,    # RKNN_TENSOR_UINT16
    6: np.int32,     # RKNN_TENSOR_INT32
    7: np.uint32,    # RKNN_TENSOR_UINT32
    8: np.int64,     # RKNN_TENSOR_INT64
    9: bool,         # RKNN_TENSOR_BOOL
    10: np.int8,     # RKNN_TENSOR_INT4 (represented as int8)
}

def get_available_providers() -> List[str]:
    """
    Get the list of available execution providers (placeholder kept for API compatibility)

    Returns:
        list: always ["CPUExecutionProvider", "somemodelruntime_rknnlite2_ExecutionProvider"]
    """
    return ["CPUExecutionProvider", "somemodelruntime_rknnlite2_ExecutionProvider"]


def get_device() -> str:
    """
    Get the current device

    Returns:
        str: the current device
    """
    return "RKNN2"

def get_version_info() -> Dict[str, str]:
    """
    Get version information

    Returns:
        dict: a dictionary with the API and driver version strings
    """
    runtime = RKNNLite()
    version = runtime.get_sdk_version()
    return {
        "api_version": version.split('\n')[2].split(': ')[1].split(' ')[0],
        "driver_version": version.split('\n')[3].split(': ')[1]
    }

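# Illustrative call (the version strings below are placeholders, not real output):
#   >>> get_version_info()
#   {'api_version': '1.6.0', 'driver_version': '0.9.6'}
# The parsing above is positional on the get_sdk_version() banner, so it will
# raise IndexError if a future SDK changes that banner's layout.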
class IOTensor:
    """Wrapper holding input/output tensor information"""
    def __init__(self, name, shape, type=None):
        self.name = name.decode() if isinstance(name, bytes) else name
        self.shape = shape
        self.type = type

    def __str__(self):
        return f"IOTensor(name='{self.name}', shape={self.shape}, type={self.type})"

class SessionOptions:
    """Session options"""
    def __init__(self):
        self.enable_profiling = False   # whether to enable performance profiling
        self.intra_op_num_threads = 1   # number of RKNN "threads"; maps to the RKNN core_mask
        self.log_severity_level = -1    # alternative way to set the log level
        self.log_verbosity_level = -1   # alternative way to set the log level

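# Illustrative setup ("model.rknn" is a placeholder): request all three RK3588
# NPU cores; __init__ below maps intra_op_num_threads=3 to RKNNLite.NPU_CORE_0_1_2.
#   opts = SessionOptions()
#   opts.intra_op_num_threads = 3
#   sess = InferenceSession("model.rknn", sess_options=opts)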
class InferenceSession:
    """
    RKNNLite runtime wrapper with an ONNX Runtime-style API
    """

    def __new__(cls, model_path: str, sess_options: Optional[SessionOptions] = None, **kwargs):
        processed_path = InferenceSession._process_model_path(model_path, sess_options)
        if isinstance(processed_path, str) and processed_path.lower().endswith('.onnx'):
            logger.info("Loading the model with ONNX Runtime")
            if not HAS_ORT:
                raise RuntimeError("onnxruntime is not installed; cannot load an ONNX model")
            return ort.InferenceSession(processed_path, sess_options=sess_options, **kwargs)
        else:
            # Not an ONNX model: let the parent __new__ create an InferenceSession instance
            instance = super().__new__(cls)
            # Remember the processed path
            instance._processed_path = processed_path
            return instance

    def __init__(self, model_path: str, sess_options: Optional[SessionOptions] = None, **kwargs):
        """
        Initialize the runtime and load the model

        Args:
            model_path: model file path (.rknn or .onnx)
            sess_options: session options
            **kwargs: other initialization parameters
        """
        options = sess_options or SessionOptions()

        # Only honor the log levels from SessionOptions when the environment variable is unset
        if os.getenv('ZTU_MODELRT_RKNNL2_LOG_LEVEL') is None:
            if options.log_severity_level != -1:
                set_default_logger_severity(options.log_severity_level)
            if options.log_verbosity_level != -1:
                set_default_logger_verbosity(options.log_verbosity_level)

        # Use the path already processed in __new__
        model_path = getattr(self, '_processed_path', model_path)
        if isinstance(model_path, str) and model_path.lower().endswith('.onnx'):
            # Avoid loading the ONNX model twice
            return

        # ... existing RKNN model loading and initialization code ...
        self.model_path = model_path
        if not os.path.exists(self.model_path):
            logger.error(f"Model file does not exist: {self.model_path}")
            raise FileNotFoundError(f"Model file does not exist: {self.model_path}")

        self.runtime = RKNNLite(verbose=options.enable_profiling)

        logger.debug(f"Loading model: {self.model_path}")
        ret = self.runtime.load_rknn(self.model_path)
        if ret != 0:
            logger.error(f"Failed to load RKNN model: {self.model_path}")
            raise RuntimeError(f'Failed to load RKNN model: {self.model_path}')
        logger.debug("Model loaded successfully")

        if options.intra_op_num_threads == 1:
            core_mask = RKNNLite.NPU_CORE_AUTO
        elif options.intra_op_num_threads == 2:
            core_mask = RKNNLite.NPU_CORE_0_1
        elif options.intra_op_num_threads == 3:
            core_mask = RKNNLite.NPU_CORE_0_1_2
        else:
            raise ValueError(f"Invalid value for intra_op_num_threads: {options.intra_op_num_threads}, must be 1, 2 or 3")

        logger.debug("Initializing the runtime environment")
        ret = self.runtime.init_runtime(core_mask=core_mask)
        if ret != 0:
            logger.error("Failed to initialize the runtime environment")
            raise RuntimeError('Failed to initialize the runtime environment')
        logger.debug("Runtime environment initialized successfully")

        self._init_io_info()
        self.options = options

    def get_performance_info(self) -> Dict[str, float]:
        """
        Get performance information

        Returns:
            dict: a dictionary with performance information
        """
        # Note: the original checked a non-existent `perf_debug` option;
        # SessionOptions exposes this switch as `enable_profiling`.
        if not self.options.enable_profiling:
            raise RuntimeError("Profiling is not enabled; set enable_profiling=True in SessionOptions")

        perf = self.runtime.rknn_runtime.get_run_perf()
        return {
            "run_duration": perf.run_duration / 1000.0  # convert to milliseconds
        }

    def set_core_mask(self, core_mask: int) -> None:
        """
        Set the NPU core usage mode

        Args:
            core_mask: NPU core mask, one of the NPU_CORE_* constants
        """
        ret = self.runtime.rknn_runtime.set_core_mask(core_mask)
        if ret != 0:
            raise RuntimeError("Failed to set the NPU core mode")

    @staticmethod
    def _process_model_path(model_path, sess_options):
        """
        Process the model path; supports .onnx and .rknn files

        Args:
            model_path: model file path
        """
        # For an ONNX file, check whether an RKNN model should be loaded automatically
        if model_path.lower().endswith('.onnx'):
            logger.info("Detected an ONNX model file")

            # Get the list of models to skip for automatic RKNN loading
            skip_models = os.getenv('ZTU_MODELRT_RKNNL2_SKIP', '').strip()
            if skip_models:
                skip_list = [m.strip() for m in skip_models.split(',')]
                # Match against the model file name (without the path)
                model_name = os.path.basename(model_path)
                if model_name.lower() in [m.lower() for m in skip_list]:
                    logger.info(f"Model {model_name} is in the skip list; ONNX Runtime will be used")
                    return model_path

            # Build the corresponding RKNN file path
            rknn_path = os.path.splitext(model_path)[0] + '.rknn'
            if os.path.exists(rknn_path):
                logger.info(f"Found a matching RKNN model; it will be used instead: {rknn_path}")
                return rknn_path
            else:
                logger.info("No matching RKNN model found; ONNX Runtime will be used")
                return model_path

        return model_path

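    # Illustrative (file names are placeholders): with
    #   $ export ZTU_MODELRT_RKNNL2_SKIP="decoder.onnx"
    # InferenceSession("decoder.onnx") stays on ONNX Runtime even if decoder.rknn
    # exists, while InferenceSession("encoder.onnx") transparently switches to
    # encoder.rknn when that file is present next to it.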
    def _convert_nhwc_to_nchw(self, shape):
        """Convert an NHWC-format shape to NCHW format"""
        if len(shape) == 4:
            # NHWC -> NCHW
            n, h, w, c = shape
            return [n, c, h, w]
        return shape

    def _init_io_info(self):
        """Initialize the model's input/output information"""
        runtime = self.runtime.rknn_runtime

        # Get the number of inputs and outputs
        n_input, n_output = runtime.get_in_out_num()

        # Collect input information
        self.input_tensors = []
        for i in range(n_input):
            attr = runtime.get_tensor_attr(i)
            shape = [attr.dims[j] for j in range(attr.n_dims)]
            # Convert 4-D inputs from NHWC to NCHW
            shape = self._convert_nhwc_to_nchw(shape)
            # Resolve the dtype
            dtype = RKNN_DTYPE_MAP.get(attr.type, None)
            tensor = IOTensor(attr.name, shape, dtype)
            self.input_tensors.append(tensor)

        # Collect output information
        self.output_tensors = []
        for i in range(n_output):
            attr = runtime.get_tensor_attr(i, is_output=True)
            shape = runtime.get_output_shape(i)
            # Resolve the dtype
            dtype = RKNN_DTYPE_MAP.get(attr.type, None)
            tensor = IOTensor(attr.name, shape, dtype)
            self.output_tensors.append(tensor)

    def get_inputs(self):
        """
        Get the model's input information

        Returns:
            list: a list of input descriptors
        """
        return self.input_tensors

    def get_outputs(self):
        """
        Get the model's output information

        Returns:
            list: a list of output descriptors
        """
        return self.output_tensors

    def run(self, output_names=None, input_feed=None, data_format="nchw", **kwargs):
        """
        Run model inference

        Args:
            output_names: list of output node names specifying which outputs to return
            input_feed: input data as a dict or a list
            data_format: input data format, "nchw" or "nhwc"
            **kwargs: other runtime parameters

        Returns:
            list: model outputs; if output_names is given, only those outputs are returned
        """
        if input_feed is None:
            logger.error("input_feed must not be None")
            raise ValueError("input_feed must not be None")

        # Prepare the input data
        if isinstance(input_feed, dict):
            # For a dict, arrange the values in the model's input order
            inputs = []
            for tensor in self.input_tensors:
                if tensor.name not in input_feed:
                    raise ValueError(f"Missing input: {tensor.name}")
                inputs.append(input_feed[tensor.name])
        elif isinstance(input_feed, (list, tuple)):
            # For a list, make sure the length matches
            if len(input_feed) != len(self.input_tensors):
                raise ValueError(f"Input count mismatch: expected {len(self.input_tensors)}, got {len(input_feed)}")
            inputs = list(input_feed)
        else:
            logger.error("input_feed must be a dict or a list")
            raise ValueError("input_feed must be a dict or a list")

        # Run inference
        try:
            logger.debug("Starting inference")
            all_outputs = self.runtime.inference(inputs=inputs, data_format=data_format)

            # If no output_names were given, return all outputs
            if output_names is None:
                return all_outputs

            # Select the requested outputs
            output_map = {tensor.name: i for i, tensor in enumerate(self.output_tensors)}
            selected_outputs = []
            for name in output_names:
                if name not in output_map:
                    raise ValueError(f"Output node not found: {name}")
                selected_outputs.append(all_outputs[output_map[name]])

            return selected_outputs

        except Exception as e:
            logger.error(f"Inference failed: {str(e)}")
            raise RuntimeError(f"Inference failed: {str(e)}")

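    # Illustrative calls (tensor names are placeholders for a real model's I/O):
    #   outs = sess.run(None, {"pixel_values": img})   # dict input, all outputs
    #   outs = sess.run(["features"], [img])           # list input, one output by name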
    def close(self):
        """
        Close the session and release resources
        """
        if self.runtime is not None:
            logger.info("Releasing runtime resources")
            self.runtime.release()
            self.runtime = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def end_profiling(self) -> Optional[str]:
        """
        Stub method for ending profiling

        Returns:
            Optional[str]: None
        """
        warnings.warn("end_profiling() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)
        return None

    def get_profiling_start_time_ns(self) -> int:
        """
        Stub method for getting the profiling start time

        Returns:
            int: 0
        """
        warnings.warn("get_profiling_start_time_ns() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)
        return 0

    def get_modelmeta(self) -> Dict[str, str]:
        """
        Stub method for getting model metadata

        Returns:
            Dict[str, str]: an empty dict
        """
        warnings.warn("get_modelmeta() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)
        return {}

    def get_session_options(self) -> SessionOptions:
        """
        Get the session options

        Returns:
            SessionOptions: the current session options
        """
        return self.options

    def get_providers(self) -> List[str]:
        """
        Stub method for getting the providers in use

        Returns:
            List[str]: ["CPUExecutionProvider"]
        """
        warnings.warn("get_providers() is a stub and always returns CPUExecutionProvider", RuntimeWarning, stacklevel=2)
        return ["CPUExecutionProvider"]

    def get_provider_options(self) -> Dict[str, Dict[str, str]]:
        """
        Stub method for getting provider options

        Returns:
            Dict[str, Dict[str, str]]: an empty dict
        """
        warnings.warn("get_provider_options() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)
        return {}

    def get_session_config(self) -> Dict[str, str]:
        """
        Stub method for getting the session config

        Returns:
            Dict[str, str]: an empty dict
        """
        warnings.warn("get_session_config() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)
        return {}

    def get_session_state(self) -> Dict[str, str]:
        """
        Stub method for getting the session state

        Returns:
            Dict[str, str]: an empty dict
        """
        warnings.warn("get_session_state() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)
        return {}

    def set_session_config(self, config: Dict[str, str]) -> None:
        """
        Stub method for setting the session config

        Args:
            config: session config dict
        """
        warnings.warn("set_session_config() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)

    def get_memory_info(self) -> Dict[str, int]:
        """
        Stub method for getting memory usage information

        Returns:
            Dict[str, int]: an empty dict
        """
        warnings.warn("get_memory_info() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)
        return {}

    def set_memory_pattern(self, enable: bool) -> None:
        """
        Stub method for setting the memory pattern

        Args:
            enable: whether to enable the memory pattern
        """
        warnings.warn("set_memory_pattern() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)

    def disable_memory_pattern(self) -> None:
        """
        Stub method for disabling the memory pattern
        """
        warnings.warn("disable_memory_pattern() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)

    def get_optimization_level(self) -> int:
        """
        Stub method for getting the optimization level

        Returns:
            int: 0
        """
        warnings.warn("get_optimization_level() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)
        return 0

    def set_optimization_level(self, level: int) -> None:
        """
        Stub method for setting the optimization level

        Args:
            level: optimization level
        """
        warnings.warn("set_optimization_level() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)

    def get_model_metadata(self) -> Dict[str, str]:
        """
        Stub method for getting model metadata (a different interface from get_modelmeta)

        Returns:
            Dict[str, str]: an empty dict
        """
        warnings.warn("get_model_metadata() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)
        return {}

    def get_model_path(self) -> str:
        """
        Get the model path

        Returns:
            str: the model file path
        """
        return self.model_path

    def get_input_type_info(self) -> List[Dict[str, str]]:
        """
        Stub method for getting input type information

        Returns:
            List[Dict[str, str]]: an empty list
        """
        warnings.warn("get_input_type_info() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)
        return []

    def get_output_type_info(self) -> List[Dict[str, str]]:
        """
        Stub method for getting output type information

        Returns:
            List[Dict[str, str]]: an empty list
        """
        warnings.warn("get_output_type_info() is a stub and provides no real functionality", RuntimeWarning, stacklevel=2)
        return []
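A minimal end-to-end sketch of the module's ONNX Runtime-style API (the file name and dummy input below are placeholders, not artifacts from this upload):

import numpy as np
import ztu_somemodelruntime_rknnlite2 as smrt

opts = smrt.SessionOptions()
opts.intra_op_num_threads = 3  # use NPU cores 0, 1 and 2

# A .onnx path falls back to onnxruntime unless a sibling .rknn file exists.
with smrt.InferenceSession("model.rknn", sess_options=opts) as sess:
    inp = sess.get_inputs()[0]                      # IOTensor with .name/.shape/.type
    x = np.zeros(inp.shape, dtype=inp.type or np.float32)
    outputs = sess.run(None, {inp.name: x})         # list of numpy arrays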