This repository contains the ONNX (Open Neural Network Exchange) version of the BAAI/bge-code-v1 model, optimized for high-performance inference. It is ideal for generating embeddings for code snippets and can be used in a variety of environments thanks to ONNX Runtime.

## Original Model

This model is a conversion of BAAI/bge-code-v1. All credit for the training and architecture goes to the original authors at the Beijing Academy of Artificial Intelligence (BAAI).

## How to Use

Below are examples of how to use this ONNX model in Python, Rust, and with Docker for a production-ready API.

### Usage with Python

You can use the `onnxruntime` and `huggingface_hub` libraries to download and run this model.

**1. Install Dependencies:**

```bash
pip install onnxruntime huggingface_hub tokenizers
```

**2. Python Code:**

```python
import os

import numpy as np
import onnxruntime as ort
from huggingface_hub import snapshot_download
from tokenizers import Tokenizer

class BgeCodeOnnx:
    def __init__(self, repo_id="babybirdprd/bge-code-v1-onnx"):
        # Download all files from the Hub and get the local directory path
        snapshot_dir = snapshot_download(repo_id=repo_id)

        # Load the tokenizer and the ONNX session
        model_path = os.path.join(snapshot_dir, "model.onnx")
        tokenizer_path = os.path.join(snapshot_dir, "tokenizer.json")
        self.tokenizer = Tokenizer.from_file(tokenizer_path)
        self.session = ort.InferenceSession(model_path)

        # Get the expected input names from the model
        self.input_names = [inp.name for inp in self.session.get_inputs()]
        print(f"Model initialized. Expects inputs: {self.input_names}")

    def embed(self, sentences):
        # Tokenize the input sentences
        self.tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=512)
        encoded_input = self.tokenizer.encode_batch(sentences)

        # Build the input dictionary for ONNX Runtime
        ort_inputs = {}
        if "input_ids" in self.input_names:
            ort_inputs['input_ids'] = np.array([e.ids for e in encoded_input], dtype=np.int64)
        if "attention_mask" in self.input_names:
            ort_inputs['attention_mask'] = np.array([e.attention_mask for e in encoded_input], dtype=np.int64)

        # Run inference
        ort_outputs = self.session.run(None, ort_inputs)
        last_hidden_state = ort_outputs[0]

        # Perform pooling (get the [CLS] token embedding)
        pooled_embeddings = last_hidden_state[:, 0]

        # Normalize the embeddings
        norms = np.linalg.norm(pooled_embeddings, axis=1, keepdims=True)
        normalized_embeddings = pooled_embeddings / norms

        return normalized_embeddings


# --- Example Usage ---
if __name__ == '__main__':
    model = BgeCodeOnnx()

    code_snippets = [
        "fn main() { let x = 5; }",
        "struct User { id: u64, name: String }",
    ]

    embeddings = model.embed(code_snippets)

    print("\nEmbeddings generated successfully!")
    for i, snippet in enumerate(code_snippets):
        print(f"\nInput: '{snippet}'")
        print(f"Embedding (first 5 dims): {embeddings[i][:5]}")
```
### Usage with Rust

You can use the `ort` crate in Rust to run this model natively for high performance.

**1. Add Dependencies to `Cargo.toml`:**

```toml
[dependencies]
anyhow = "1.0"
ndarray = "0.15"
ort = "2.0.0"
tokenizers = "0.19.1"
hf-hub = "0.3.3"
tokio = { version = "1", features = ["full"] }
camino = "1.1.7"
```

**2. Rust Code (`src/main.rs`):**

```rust
use anyhow::Result;
use camino::Utf8PathBuf;
use hf_hub::api::sync::Api;
use ndarray::{s, Array2, ArrayView2, Axis};
use ort::{Environment, Session, SessionBuilder, Value};
use tokenizers::Tokenizer;

fn normalize_l2(x: ArrayView2<f32>) -> Array2<f32> {
    let norms = x.map_axis(Axis(1), |row| row.dot(&row).sqrt());
    &x / &norms.into_shape((norms.len(), 1)).unwrap()
}

#[tokio::main]
async fn main() -> Result<()> {
    let environment = Environment::builder().with_name("bge-code-test").build()?;

    println!("Downloading model files...");
    let api = Api::new()?;
    let repo = api.repo(hf_hub::Repo::model(
        "babybirdprd/bge-code-v1-onnx".to_string(),
    ));
    // Download the model and tokenizer files and get their local cache paths
    let model_path = Utf8PathBuf::from_path_buf(repo.get("model.onnx")?).unwrap();
    let tokenizer_path = Utf8PathBuf::from_path_buf(repo.get("tokenizer.json")?).unwrap();
    println!("Model downloaded to: {}", model_path);

    println!("\nLoading tokenizer and creating ORT session...");
    let mut tokenizer = Tokenizer::from_file(&tokenizer_path).unwrap();
    let session = SessionBuilder::new(&environment)?.with_model_from_file(&model_path)?;
    println!("Session created successfully.");
    let sentences = vec![
        "fn main() { let x = 5; }",
        "struct User { id: u64, name: String }",
    ];

    let padding_params = tokenizers::PaddingParams {
        strategy: tokenizers::PaddingStrategy::BatchLongest,
        ..Default::default()
    };
    tokenizer.with_padding(Some(padding_params));

    let tokenized_input = tokenizer.encode_batch(sentences.clone(), true).unwrap();
    let input_ids: Vec<i64> = tokenized_input
        .iter()
        .flat_map(|enc| enc.get_ids().iter().map(|&id| id as i64))
        .collect();
    let attention_mask: Vec<i64> = tokenized_input
        .iter()
        .flat_map(|enc| enc.get_attention_mask().iter().map(|&id| id as i64))
        .collect();
    let batch_size = sentences.len();
    let sequence_length = tokenized_input[0].get_ids().len();

    println!("\nRunning inference...");
    // This model only requires 'input_ids' and 'attention_mask'
    let outputs = session.run(vec![
        Value::from_array(Array2::from_shape_vec((batch_size, sequence_length), input_ids)?.view())?,
        Value::from_array(Array2::from_shape_vec((batch_size, sequence_length), attention_mask)?.view())?,
    ])?;
println!("Post-processing embeddings...");
let last_hidden_state = outputs[0].try_extract_tensor::<f32>()?;
let (batch_size, _seq_len, hidden_dim) = last_hidden_state.dims();
let pooled_embeddings = last_hidden_state.slice(s![.., 0, ..]).into_shape((batch_size, hidden_dim)).unwrap().to_owned();
let normalized_embeddings = normalize_l2(pooled_embeddings.view());
println!("\nπ Rust ORT Test Complete! π");
for (i, sentence) in sentences.iter().enumerate() {
let embedding_slice = normalized_embeddings.slice(s![i, 0..5]).to_vec();
println!("\nInput: '{}'", sentence);
println!("Embedding (first 5 dims): {:?}", embedding_slice);
}
Ok(())
}
```

### Usage with Docker (text-embeddings-inference)

You can serve this model as a high-throughput, OpenAI-compatible API using the `text-embeddings-inference` container.

**1. Run the Docker Container:**

The command below will download the Docker image and the model, then start the server.

```bash
# Define a volume to store the model
export MODEL_DIR=$HOME/bge-code-onnx-model
# Run the container, mounting the volume
docker run --pull always -p 8080:80 -v $MODEL_DIR:/data \
    ghcr.io/huggingface/text-embeddings-inference:main-onnx \
    --model-id babybirdprd/bge-code-v1-onnx
```

**2. Call the API:**

Once the server is running, you can call it using any HTTP client.

```bash
curl http://localhost:8080/embed \
    -X POST \
    -H "Content-Type: application/json" \
    -d '{"inputs": "fn main() { let message = \"Hello, ONNX!\"; }"}'
```