cagataydev's picture
Update README.md
3de5fe7 verified
---
language:
- en
license: mit
library_name: transformers
pipeline_tag: robotics
tags:
- gr00t
- robotics
- nvidia
- embodied-ai
- trained-model
private: true
---
# gr00t-wholettheducksout
This is a GR00T (Generalist Robot 00 Technology) model trained using NVIDIA's GR00T training framework.
## Model Details
- **Model Type**: GR00T Embodied AI Model
- **Training Job**: wholettheducksout_1to1_matched
- **Training Steps**: 200,000
- **Training Duration**: ~23.5 hours
- **Data Configuration**: so100_dualcam
- **Base Model**: nvidia/GR00T-N1.5-3B
## Training Configuration
```json
{
"action_dim": 32,
"action_head_cfg": {
"action_dim": 32,
"action_horizon": 16,
"add_pos_embed": true,
"backbone_embedding_dim": 2048,
"diffusion_model_cfg": {
"attention_head_dim": 48,
"cross_attention_dim": 2048,
"dropout": 0.2,
"final_dropout": true,
"interleave_self_attention": true,
"norm_type": "ada_norm",
"num_attention_heads": 32,
"num_layers": 16,
"output_dim": 1024,
"positional_embeddings": null
},
"hidden_size": 1024,
"input_embedding_dim": 1536,
"max_action_dim": 32,
"max_state_dim": 64,
"model_dtype": "float32",
"noise_beta_alpha": 1.5,
"noise_beta_beta": 1.0,
"noise_s": 0.999,
"num_inference_timesteps": 4,
"num_target_vision_tokens": 32,
"num_timestep_buckets": 1000,
"tune_diffusion_model": true,
"tune_projector": true,
"use_vlln": true,
"vl_self_attention_cfg": {
"attention_head_dim": 64,
"dropout": 0.2,
"final_dropout": true,
"num_attention_heads": 32,
"num_layers": 4,
"positional_embeddings": null
}
},
"action_horizon": 16,
"architectures": [
"GR00T_N1_5"
],
"attn_implementation": null,
"backbone_cfg": {
"eagle_path": "NVEagle/eagle_er-qwen3_1_7B-Siglip2_400M_stage1_5_128gpu_er_v7_1mlp_nops",
"load_bf16": false,
"project_to_dim": null,
"reproject_vision": false,
"select_layer": 12,
"tune_llm": false,
"tune_visual": true,
"use_flash_attention": true
},
"compute_dtype": "bfloat16",
"hidden_size": 2048,
"model_dtype": "float32",
"model_type": "gr00t_n1_5",
"torch_dtype": "bfloat16",
"transformers_version": "4.51.3"
}
```
## Usage
This model can be used with the GR00T inference framework:
```python
# Example usage (adjust based on your specific setup)
from gr00t_inference import GR00TInference
model = GR00TInference(
model_path="path/to/this/model",
embodiment_tag="new_embodiment",
data_config="so100_dualcam"
)
# Use for inference
results = model.infer(your_input_data)
```
## Training Metadata
```json
{
"new_embodiment": {
"statistics": {
"state": {
"single_arm": {
"max": [
72.46653747558594,
62.818336486816406,
99.72752380371094,
99.39103698730469,
-46.26399230957031
],
"min": [
-86.99808502197266,
-99.32088470458984,
-97.72933959960938,
-87.64680480957031,
-65.0611801147461
],
"mean": [
-7.457055568695068,
-25.479028701782227,
32.967071533203125,
35.0267333984375,
-55.26940155029297
],
"std": [
20.533525466918945,
50.98550033569336,
50.28582763671875,
45.0773811340332,
2.7385220527648926
],
"q01": [
-75.78075408935547,
-99.1511001586914,
-95.18619537353516,
-62.41844177246094,
-61.2080192565918
],
"q99": [
33.20586395263672,
55.67232688903806,
99.54586791992188,
99.30404663085938,
-48.86748123168945
]
},
"gripper": {
"max": [
49.49358367919922
],
"min": [
1.3504388332366943
],
"mean": [
11.123491287231445
],
"std": [
10.017578125
],
"q01": [
1.3504388332366943
],
"q99": [
40.64821243286133
]
}
},
"action": {
"single_arm": {
"max": [
73.06226348876953,
62.077701568603516,
99.81908416748047,
100.0,
-46.0078010559082
],
"min": [
-87.29351806640625,
-100.0,
-99.81908416748047,
-91.41742706298828,
-65.25357818603516
],
"mean": [
-7.188200950622559,
-26.144899368286133,
31.129091262817383,
34.6439094543457,
-55.28120803833008
],
"std": [
20.539134979248047,
50.40521240234375,
50.696495056152344,
45.221248626708984,
2.745452642440796
],
"q01": [
-75.47649383544922,
-99.49324035644531,
-96.72727142333984,
-62.808841705322266,
-61.508453369140625
],
"q99": [
33.67217254638672,
54.47635269165039,
99.63817596435547,
99.56653594970703,
-48.920677185058594
]
},
"gripper": {
"max": [
49.88161087036133
],
"min": [
0.23677979409694672
],
"mean": [
9.19546890258789
],
"std": [
10.420595169067383
],
"q01": [
1.262825608253479
],
"q99": [
40.64719772338867
]
}
}
},
"modalities": {
"video": {
"front": {
"resolution": [
640,
480
],
"channels": 3,
"fps": 30.0
},
"wrist": {
"resolution": [
640,
480
],
"channels": 3,
"fps": 30.0
}
},
"state": {
"single_arm": {
"absolute": true,
"rotation_type": null,
"shape": [
5
],
"continuous": true
},
"gripper": {
"absolute": true,
"rotation_type": null,
"shape": [
1
],
"continuous": true
}
},
"action": {
"single_arm": {
"absolute": true,
"rotation_type": null,
"shape": [
5
],
"continuous": true
},
"gripper": {
"absolute": true,
"rotation_type": null,
"shape": [
1
],
"continuous": true
}
}
},
"embodiment_tag": "new_embodiment"
}
}
```
## Files
- `config.json`: Model configuration
- `model-*.safetensors`: Model weights in SafeTensors format
- `model.safetensors.index.json`: Model sharding index
- `experiment_cfg/metadata.json`: Training experiment metadata
## License
This model is released under the MIT license.