diff --git "a/CalibrationExample.ipynb" "b/CalibrationExample.ipynb" new file mode 100644--- /dev/null +++ "b/CalibrationExample.ipynb" @@ -0,0 +1,344 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "a31f3377", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import numpy as np\n", + "import tensorflow_datasets as tfds\n", + "from tqdm import tqdm\n", + "from PIL import Image\n", + "import matplotlib.pyplot as plt\n", + "import mediapy\n", + "from scipy.spatial.transform import Rotation as R" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30d1aaaf", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the DROID subset dataset\n", + "ds = tfds.load(\"droid_100\", data_dir=\"gs://gresearch/robotics\", split=\"train\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ef404b22", + "metadata": {}, + "outputs": [], + "source": [ + "path_to_droid_repo = \".\" # TODO: Replace with the path to your DROID repository\n", + "\n", + "# Load the extrinsics\n", + "cam2base_extrinsics_path = f\"{path_to_droid_repo}/cam2base_extrinsics.json\"\n", + "with open(cam2base_extrinsics_path, \"r\") as f:\n", + " cam2base_extrinsics = json.load(f)\n", + "\n", + "# Load the intrinsics\n", + "intrinsics_path = f\"{path_to_droid_repo}/intrinsics.json\"\n", + "with open(intrinsics_path, \"r\") as f:\n", + " intrinsics = json.load(f)\n", + "\n", + "# Load mapping from episode ID to path, then invert\n", + "episode_id_to_path_path = f\"{path_to_droid_repo}/episode_id_to_path.json\"\n", + "with open(episode_id_to_path_path, \"r\") as f:\n", + " episode_id_to_path = json.load(f)\n", + "episode_path_to_id = {v: k for k, v in episode_id_to_path.items()}\n", + "\n", + "# Load camera serials\n", + "camera_serials_path = f\"{path_to_droid_repo}/camera_serials.json\"\n", + "with open(camera_serials_path, \"r\") as f:\n", + " camera_serials = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a1b27a95", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 25%|██▌ | 25/100 [00:02<00:06, 10.73it/s]\n" + ] + } + ], + "source": [ + "# Iterate through the dataset to find the first episode that has a cam2base extrinsics entry\n", + "eps = []\n", + "for ep in tqdm(ds):\n", + " file_path = ep[\"episode_metadata\"][\"file_path\"].numpy().decode(\"utf-8\")\n", + " recording_folderpath = ep[\"episode_metadata\"][\"recording_folderpath\"].numpy().decode(\"utf-8\")\n", + "\n", + " episode_path = file_path.split(\"r2d2-data-full/\")[1].split(\"/trajectory\")[0]\n", + " if episode_path not in episode_path_to_id:\n", + " continue\n", + " episode_id = episode_path_to_id[episode_path]\n", + " \n", + " if episode_id in cam2base_extrinsics:\n", + " eps.append(ep)\n", + " \n", + " if len(eps) >= 10:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "480b1485", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Camera with calibration data: ext1_cam_serial --> exterior_image_1_left\n" + ] + } + ], + "source": [ + "# Iterate through the extrinsics to find key that is a digit\n", + "# This is the camera serial number, and the corresponding value is the extrinsics\n", + "for k, v in cam2base_extrinsics[episode_id].items():\n", + " if k.isdigit():\n", + " camera_serial = k\n", + " extracted_extrinsics = v\n", + " break\n", + "\n", + "# Also lets us get the intrinsics\n", + "extracted_intrinsics = intrinsics[episode_id][camera_serial]\n", + "\n", + "# Using the camera serial, find the corresponding camera name (which is used to determine\n", + "# which image stream in the episode to use)\n", + "camera_serials_to_name = {v: k for k, v in camera_serials[episode_id].items()}\n", + "calib_camera_name = camera_serials_to_name[camera_serial]\n", + "\n", + "if calib_camera_name == \"ext1_cam_serial\":\n", + " calib_image_name = \"exterior_image_1_left\"\n", + "elif calib_camera_name == \"ext2_cam_serial\":\n", + " calib_image_name = \"exterior_image_2_left\"\n", + "else:\n", + " raise ValueError(f\"Unknown camera name: {calib_camera_name}\")\n", + "\n", + "print(f\"Camera with calibration data: {calib_camera_name} --> {calib_image_name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "51d0468e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[-0.79138441 -0.21481151 0.57233445 0.27362273]\n", + " [-0.61099013 0.30863306 -0.72899705 0.45935 ]\n", + " [-0.02004438 -0.9266076 -0.37549511 0.50548118]\n", + " [ 0. 0. 0. 1. ]]\n" + ] + } + ], + "source": [ + "# Convert the extrinsics to a homogeneous transformation matrix\n", + "pos = extracted_extrinsics[0:3] # translation\n", + "rot_mat = R.from_euler(\"xyz\", extracted_extrinsics[3:6]).as_matrix() # rotation\n", + "\n", + "# Make homogenous transformation matrix\n", + "cam_to_base_extrinsics_matrix = np.eye(4)\n", + "cam_to_base_extrinsics_matrix[:3, :3] = rot_mat\n", + "cam_to_base_extrinsics_matrix[:3, 3] = pos\n", + "\n", + "print(cam_to_base_extrinsics_matrix)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bb7545de", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[522.96893311 0. 634.02514648]\n", + " [ 0. 522.96893311 365.70669556]\n", + " [ 0. 0. 1. ]]\n" + ] + } + ], + "source": [ + "# Convert the intrinsics to a matrix\n", + "fx, cx, fy, cy = extracted_intrinsics[\"cameraMatrix\"]\n", + "intrinsics_matrix = np.array([\n", + " [fx, 0, cx],\n", + " [0, fy, cy],\n", + " [0, 0, 1]\n", + "])\n", + "print(intrinsics_matrix)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bcc54c97", + "metadata": {}, + "outputs": [], + "source": [ + "# Save all observations for the calibrated camera and corresponding gripper positions\n", + "images = []\n", + "cartesian_poses = []\n", + "for step in ep[\"steps\"]:\n", + " image = step[\"observation\"][calib_image_name].numpy()\n", + " images.append(image)\n", + " cartesian_pose = step[\"observation\"][\"cartesian_position\"].numpy()\n", + " cartesian_poses.append(cartesian_pose)\n", + "\n", + "# length images x 6\n", + "cartesian_poses = np.array(cartesian_poses)\n", + "# Remove the rotation and make homogeneous: --> length images x 3 --> length images x 4\n", + "cartesian_homogeneous_positions = cartesian_poses[:, :3]\n", + "cartesian_homogeneous_positions = np.hstack(\n", + " (cartesian_homogeneous_positions, np.ones((cartesian_homogeneous_positions.shape[0], 1)))\n", + ")\n", + "\n", + "# Transpose to support matrix multiplication: --> 4 x length images\n", + "gripper_position_base = cartesian_homogeneous_positions.T" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "95496c61", + "metadata": {}, + "outputs": [], + "source": [ + "# Transform gripper position to camera frame, then remove homogeneous component\n", + "base_to_cam_extrinsics_matrix = np.linalg.inv(cam_to_base_extrinsics_matrix)\n", + "robot_gripper_position_cam = base_to_cam_extrinsics_matrix @ gripper_position_base\n", + "robot_gripper_position_cam = robot_gripper_position_cam[:3] # Now 3 x length images" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e2b66135", + "metadata": {}, + "outputs": [], + "source": [ + "# Finally, use intrinsics to project the gripper position in camera frame into pixel space\n", + "pixel_positions = intrinsics_matrix @ robot_gripper_position_cam[:3]\n", + "pixel_positions = pixel_positions[:2] / pixel_positions[2]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ff8d3c50", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 558/558 [00:04<00:00, 137.22it/s]\n" + ] + } + ], + "source": [ + "# Visualize!\n", + "vis_images = []\n", + "temp_img_path = f\"{path_to_droid_repo}/TEMP.png\"\n", + "\n", + "for i, image in enumerate(tqdm(images)):\n", + " if i % 10 != 0:\n", + " continue\n", + " \n", + " fig, axs = plt.subplots(1)\n", + " x, y = pixel_positions[0, i] / 1280 * 320, pixel_positions[1, i] / 720 * 180 # Scale to match image dimensions\n", + "\n", + " # clip coords\n", + " x = np.clip(x, 0, 320)\n", + " y = np.clip(y, 0, 180)\n", + "\n", + " axs.imshow(image)\n", + " axs.scatter(x, y, c='red', s=20)\n", + " axs.set_xlim(0, 320)\n", + " axs.set_ylim(180, 0) # Invert y-axis to match image\n", + "\n", + " # turn off axes\n", + " axs.axis('off')\n", + "\n", + " # save the figure, then reopen it as PIL image\n", + " plt.savefig(temp_img_path, bbox_inches='tight', pad_inches=0)\n", + " plt.close(fig)\n", + "\n", + " vis_image = Image.open(temp_img_path).convert(\"RGB\")\n", + " vis_images.append(np.array(vis_image))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "72049f59", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Visualize the video\n", + "mediapy.show_video(\n", + " vis_images,\n", + " fps=8\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "steerable-vla-droid", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}