2 changes: 1 addition & 1 deletion source/extensions/omni.isaac.lab/config/extension.toml
@@ -1,7 +1,7 @@
[package]

# Note: Semantic Versioning is used: https://semver.org/
version = "0.24.13"
version = "0.24.16"

# Description
title = "Isaac Lab framework for Robot Learning"
24 changes: 18 additions & 6 deletions source/extensions/omni.isaac.lab/docs/CHANGELOG.rst
@@ -1,24 +1,36 @@
Changelog
---------

0.22.15 (2024-09-20)
0.24.16 (2024-10-03)
~~~~~~~~~~~~~~~~~~~~

Changed
^^^^^^^

* Renamed the observation function :meth:`grab_images` to :meth:`image` to follow the convention of noun-based naming.
* Renamed the function :meth:`convert_perspective_depth_to_orthogonal_depth` to the shorter name
:meth:`omni.isaac.lab.utils.math.orthogonalize_perspective_depth`.


0.24.15 (2024-09-20)
~~~~~~~~~~~~~~~~~~~~

Added
^^^^^

* Added :meth:`grab_images` to be able to use images for an observation term in manager based environments
* Added :meth:`grab_images` to be able to use images for an observation term in manager-based environments.


0.24.14 (2024-09-20)
~~~~~~~~~~~~~~~~~~~~

Added
^^^^^

* Added :meth:`convert_perspective_depth_to_orthogonal_depth`. :meth:`unproject_depth` assumes
that the input depth image is orthogonal. The new :meth:`convert_perspective_depth_to_orthogonal_depth`
can be used to convert a perspective depth image into an orthogonal depth image, so that the point cloud
can be unprojected correctly with :meth:`unproject_depth`.
* Added the method :meth:`convert_perspective_depth_to_orthogonal_depth` to convert perspective depth
images to orthogonal depth images. This is useful for :meth:`~omni.isaac.lab.utils.math.unproject_depth`,
since it expects orthogonal depth images as inputs.


0.24.13 (2024-09-08)
~~~~~~~~~~~~~~~~~~~~
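
As a concrete illustration of the rename noted in 0.24.16, the sketch below shows how an image observation term might be declared after the change. It is a minimal, hypothetical example: the import paths and config classes follow common Isaac Lab conventions and are not taken from this diff.

# Hypothetical sketch of a manager-based observation term using the renamed function.
from omni.isaac.lab.managers import ObservationTermCfg as ObsTerm
from omni.isaac.lab.managers import SceneEntityCfg
import omni.isaac.lab.envs.mdp as mdp

# Before 0.24.16 this would have referenced mdp.grab_images.
camera_rgb = ObsTerm(
    func=mdp.image,
    params={
        "sensor_cfg": SceneEntityCfg("tiled_camera"),
        "data_type": "rgb",
        "normalize": True,
    },
)
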
@@ -182,38 +182,52 @@ def body_incoming_wrench(env: ManagerBasedEnv, asset_cfg: SceneEntityCfg) -> tor
return link_incoming_forces.view(env.num_envs, -1)


def grab_images(
def image(
env: ManagerBasedEnv,
sensor_cfg: SceneEntityCfg = SceneEntityCfg("tiled_camera"),
data_type: str = "rgb",
convert_perspective_to_orthogonal: bool = False,
normalize: bool = True,
) -> torch.Tensor:
"""Grab all of the latest images of a specific datatype produced by a specific camera.
"""Images of a specific datatype from the camera sensor.

If the flag :attr:`normalize` is True, post-processing of the images is performed based on their
data types:

- "rgb": Scales the image to (0, 1) and subtracts with the mean of the current image batch.
- "depth" or "distance_to_camera" or "distance_to_plane": Replaces infinity values with zero.

Args:
env: The environment the cameras are placed within.
sensor_cfg: The desired sensor to read from. Defaults to SceneEntityCfg("tiled_camera").
data_type: The data type to pull from the desired camera. Defaults to "rgb".
convert_perspective_to_orthogonal: Whether to convert perspective
depth images to orthogonal depth images. Defaults to False.
normalize: Set to True to normalize images. Defaults to True.
convert_perspective_to_orthogonal: Whether to orthogonalize perspective depth images.
This is used only when the data type is "distance_to_camera". Defaults to False.
normalize: Whether to normalize the images. This depends on the selected data type.
Defaults to True.

Returns:
The images produced at the last timestep
The images produced at the last time-step
"""
# extract the used quantities (to enable type-hinting)
sensor: TiledCamera | Camera | RayCasterCamera = env.scene.sensors[sensor_cfg.name]

# obtain the input image
images = sensor.data.output[data_type]

# depth image conversion
if (data_type == "distance_to_camera") and convert_perspective_to_orthogonal:
images = math_utils.convert_perspective_depth_to_orthogonal_depth(images, sensor.data.intrinsic_matrices)
images = math_utils.orthogonalize_perspective_depth(images, sensor.data.intrinsic_matrices)

# rgb/depth image normalization
if normalize:
if data_type == "rgb":
images = images / 255
images = images.float() / 255.0
mean_tensor = torch.mean(images, dim=(1, 2), keepdim=True)
images -= mean_tensor
elif "distance_to" in data_type or "depth" in data_type:
images[images == float("inf")] = 0

return images.clone()
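
As a standalone sketch of the "rgb" normalization branch above, with made-up tensor sizes (everything here is illustrative; only torch is required):

import torch

# Fake batch of RGB images from a tiled camera: (num_envs, height, width, channels).
rgb = torch.randint(0, 256, (4, 84, 84, 3), dtype=torch.uint8)

images = rgb.float() / 255.0                          # scale to (0, 1)
mean = torch.mean(images, dim=(1, 2), keepdim=True)   # per-image, per-channel mean
images = images - mean                                # zero-center each image
print(images.shape)  # torch.Size([4, 84, 84, 3])
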


212 changes: 109 additions & 103 deletions source/extensions/omni.isaac.lab/omni/isaac/lab/utils/math.py
@@ -987,115 +987,30 @@ def transform_points(


@torch.jit.script
def unproject_depth(depth: torch.Tensor, intrinsics: torch.Tensor) -> torch.Tensor:
r"""Unproject depth image into a pointcloud. This method assumes that depth
is provided orthogonally relative to the image plane, as opposed to absolutely relative to the camera's
principal point (perspective depth). To unproject a perspective depth image, use
:meth:`convert_perspective_depth_to_orthogonal_depth` to convert
to an orthogonal depth image prior to calling this method. Otherwise, the
created point cloud will be distorted, especially around the edges.
def orthogonalize_perspective_depth(depth: torch.Tensor, intrinsics: torch.Tensor) -> torch.Tensor:
"""Converts perspective depth image to orthogonal depth image.

This function converts depth images into points given the calibration matrix of the camera.

.. math::
p_{3D} = K^{-1} \times [u, v, 1]^T \times d

where :math:`p_{3D}` is the 3D point, :math:`d` is the depth value, :math:`u` and :math:`v` are
the pixel coordinates and :math:`K` is the intrinsic matrix.

If `depth` is a batch of depth images and `intrinsics` is a single intrinsic matrix, the same
calibration matrix is applied to all depth images in the batch.

The function assumes that the width and height are both greater than 1. This makes the function
deal with many possible shapes of depth images and intrinsics matrices.

Args:
depth: The depth measurement. Shape is (H, W) or or (H, W, 1) or (N, H, W) or (N, H, W, 1).
intrinsics: A tensor providing camera's calibration matrix. Shape is (3, 3) or (N, 3, 3).

Returns:
The 3D coordinates of points. Shape is (P, 3) or (N, P, 3).

Raises:
ValueError: When depth is not of shape (H, W) or (H, W, 1) or (N, H, W) or (N, H, W, 1).
ValueError: When intrinsics is not of shape (3, 3) or (N, 3, 3).
"""
depth_batch = depth.clone()
intrinsics_batch = intrinsics.clone()
# check if inputs are batched
is_batched = depth_batch.dim() == 4 or (depth_batch.dim() == 3 and depth_batch.shape[-1] != 1)
# make sure inputs are batched
if depth_batch.dim() == 3 and depth_batch.shape[-1] == 1:
depth_batch = depth_batch.squeeze(dim=2) # (H, W, 1) -> (H, W)
if depth_batch.dim() == 2:
depth_batch = depth_batch[None] # (H, W) -> (1, H, W)
if depth_batch.dim() == 4 and depth_batch.shape[-1] == 1:
depth_batch = depth_batch.squeeze(dim=3) # (N, H, W, 1) -> (N, H, W)
if intrinsics_batch.dim() == 2:
intrinsics_batch = intrinsics_batch[None] # (3, 3) -> (1, 3, 3)
# check shape of inputs
if depth_batch.dim() != 3:
raise ValueError(f"Expected depth images to have dim = 2 or 3 or 4: got shape {depth.shape}")
if intrinsics_batch.dim() != 3:
raise ValueError(f"Expected intrinsics to have shape (3, 3) or (N, 3, 3): got shape {intrinsics.shape}")

# get image height and width
im_height, im_width = depth_batch.shape[1:]
# create image points in homogeneous coordinates (3, H x W)
indices_u = torch.arange(im_width, device=depth.device, dtype=depth.dtype)
indices_v = torch.arange(im_height, device=depth.device, dtype=depth.dtype)
img_indices = torch.stack(torch.meshgrid([indices_u, indices_v], indexing="ij"), dim=0).reshape(2, -1)
pixels = torch.nn.functional.pad(img_indices, (0, 0, 0, 1), mode="constant", value=1.0)
pixels = pixels.unsqueeze(0) # (3, H x W) -> (1, 3, H x W)

# unproject points into 3D space
points = torch.matmul(torch.inverse(intrinsics_batch), pixels) # (N, 3, H x W)
points = points / points[:, -1, :].unsqueeze(1) # normalize by last coordinate
# flatten depth image (N, H, W) -> (N, H x W)
depth_batch = depth_batch.transpose_(1, 2).reshape(depth_batch.shape[0], -1).unsqueeze(2)
depth_batch = depth_batch.expand(-1, -1, 3)
# scale points by depth
points_xyz = points.transpose_(1, 2) * depth_batch # (N, H x W, 3)

# return points in same shape as input
if not is_batched:
points_xyz = points_xyz.squeeze(0)

return points_xyz


@torch.jit.script
def convert_perspective_depth_to_orthogonal_depth(
perspective_depth: torch.Tensor, intrinsics: torch.Tensor
) -> torch.Tensor:
r"""Provided depth image(s) where depth is provided as the distance to the principal
point of the camera (perspective depth), this function converts it so that depth
is provided as the distance to the camera's image plane (orthogonal depth).

This is helpful because `unproject_depth` assumes that depth is expressed in
the orthogonal depth format.

If `perspective_depth` is a batch of depth images and `intrinsics` is a single intrinsic matrix,
the same calibration matrix is applied to all depth images in the batch.
Perspective depth images contain distances measured from the camera's optical center, while
orthogonal depth images contain distances measured from the camera's image plane. This function
uses the camera geometry to convert a perspective depth image into an orthogonal depth image.

The function assumes that the width and height are both greater than 1.

Args:
perspective_depth: The depth measurement obtained with the distance_to_camera replicator.
Shape is (H, W) or or (H, W, 1) or (N, H, W) or (N, H, W, 1).
intrinsics: A tensor providing camera's calibration matrix. Shape is (3, 3) or (N, 3, 3).
depth: The perspective depth images. Shape is (H, W) or (H, W, 1) or (N, H, W) or (N, H, W, 1).
intrinsics: The camera's calibration matrix. If a single matrix is provided, the same
calibration matrix is used across all the depth images in the batch.
Shape is (3, 3) or (N, 3, 3).

Returns:
The depth image as if obtained by the distance_to_image_plane replicator. Shape
matches the input shape of depth
The orthogonal depth images. Shape matches the input shape of depth images.

Raises:
ValueError: When depth is not of shape (H, W) or (H, W, 1) or (N, H, W) or (N, H, W, 1).
ValueError: When intrinsics is not of shape (3, 3) or (N, 3, 3).
"""

# Clone inputs to avoid in-place modifications
perspective_depth_batch = perspective_depth.clone()
perspective_depth_batch = depth.clone()
intrinsics_batch = intrinsics.clone()

# Check if inputs are batched
@@ -1123,7 +1038,7 @@ def convert_perspective_depth_to_orthogonal_depth(

# Validate input shapes
if perspective_depth_batch.dim() != 3:
raise ValueError(f"Expected perspective_depth to have 2, 3, or 4 dimensions; got {perspective_depth.shape}.")
raise ValueError(f"Expected depth images to have 2, 3, or 4 dimensions; got {depth.shape}.")
if intrinsics_batch.dim() != 3:
raise ValueError(f"Expected intrinsics to have shape (3, 3) or (N, 3, 3); got {intrinsics.shape}.")

@@ -1137,8 +1052,8 @@ def convert_perspective_depth_to_orthogonal_depth(
cy = intrinsics_batch[:, 1, 2].view(-1, 1, 1)

# Create meshgrid of pixel coordinates
u_grid = torch.arange(im_width, device=perspective_depth.device, dtype=perspective_depth.dtype)
v_grid = torch.arange(im_height, device=perspective_depth.device, dtype=perspective_depth.dtype)
u_grid = torch.arange(im_width, device=depth.device, dtype=depth.dtype)
v_grid = torch.arange(im_height, device=depth.device, dtype=depth.dtype)
u_grid, v_grid = torch.meshgrid(u_grid, v_grid, indexing="xy")

# Expand the grids for batch processing
@@ -1150,17 +1065,104 @@ def convert_perspective_depth_to_orthogonal_depth(
y_term = ((v_grid - cy) / fy) ** 2

# Calculate the orthogonal (normal) depth
normal_depth = perspective_depth_batch / torch.sqrt(1 + x_term + y_term)
orthogonal_depth = perspective_depth_batch / torch.sqrt(1 + x_term + y_term)

# Restore the last dimension if it was present in the input
if add_last_dim:
normal_depth = normal_depth.unsqueeze(-1)
orthogonal_depth = orthogonal_depth.unsqueeze(-1)

# Return to original shape if input was not batched
if not is_batched:
normal_depth = normal_depth.squeeze(0)
orthogonal_depth = orthogonal_depth.squeeze(0)

return normal_depth
return orthogonal_depth
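
A short usage sketch for this function, with illustrative intrinsics and depth values (assuming the omni.isaac.lab extension is importable):

import torch
import omni.isaac.lab.utils.math as math_utils

# Illustrative pinhole intrinsics for a 320 x 240 image.
intrinsics = torch.tensor([[320.0, 0.0, 160.0],
                           [0.0, 320.0, 120.0],
                           [0.0, 0.0, 1.0]])
# Perspective depth: every pixel is 2 m from the optical center.
perspective_depth = torch.full((240, 320), 2.0)

ortho_depth = math_utils.orthogonalize_perspective_depth(perspective_depth, intrinsics)
# Off-axis pixels map to smaller plane distances: d / sqrt(1 + x_term + y_term).
print(ortho_depth.shape)  # torch.Size([240, 320])
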


@torch.jit.script
def unproject_depth(depth: torch.Tensor, intrinsics: torch.Tensor, is_ortho: bool = True) -> torch.Tensor:
r"""Un-project depth image into a pointcloud.

This function converts orthogonal or perspective depth images into points given the calibration matrix
of the camera. It uses the following transformation based on camera geometry:

.. math::
p_{3D} = K^{-1} \times [u, v, 1]^T \times d

where :math:`p_{3D}` is the 3D point, :math:`d` is the depth value (measured from the image plane),
:math:`u` and :math:`v` are the pixel coordinates and :math:`K` is the intrinsic matrix.

The function assumes that the width and height are both greater than 1. This allows the function
to handle many possible shapes of depth images and intrinsics matrices.

.. note::
If :attr:`is_ortho` is False, the input depth images are first converted to orthogonal depth images
using :meth:`orthogonalize_perspective_depth`.

Args:
depth: The depth measurement. Shape is (H, W) or (H, W, 1) or (N, H, W) or (N, H, W, 1).
intrinsics: The camera's calibration matrix. If a single matrix is provided, the same
calibration matrix is used across all the depth images in the batch.
Shape is (3, 3) or (N, 3, 3).
is_ortho: Whether the input depth image is an orthogonal or a perspective depth image. If True, the
input is treated as *orthogonal* depth, where measurements are taken from the camera's image
plane. If False, it is treated as *perspective* depth, where measurements are taken from the
camera's optical center. Defaults to True.

Returns:
The 3D coordinates of points. Shape is (P, 3) or (N, P, 3).

Raises:
ValueError: When depth is not of shape (H, W) or (H, W, 1) or (N, H, W) or (N, H, W, 1).
ValueError: When intrinsics is not of shape (3, 3) or (N, 3, 3).
"""
# clone inputs to avoid in-place modifications
intrinsics_batch = intrinsics.clone()
# convert depth image to orthogonal if needed
if not is_ortho:
depth_batch = orthogonalize_perspective_depth(depth, intrinsics)
else:
depth_batch = depth.clone()

# check if inputs are batched
is_batched = depth_batch.dim() == 4 or (depth_batch.dim() == 3 and depth_batch.shape[-1] != 1)
# make sure inputs are batched
if depth_batch.dim() == 3 and depth_batch.shape[-1] == 1:
depth_batch = depth_batch.squeeze(dim=2) # (H, W, 1) -> (H, W)
if depth_batch.dim() == 2:
depth_batch = depth_batch[None] # (H, W) -> (1, H, W)
if depth_batch.dim() == 4 and depth_batch.shape[-1] == 1:
depth_batch = depth_batch.squeeze(dim=3) # (N, H, W, 1) -> (N, H, W)
if intrinsics_batch.dim() == 2:
intrinsics_batch = intrinsics_batch[None] # (3, 3) -> (1, 3, 3)
# check shape of inputs
if depth_batch.dim() != 3:
raise ValueError(f"Expected depth images to have dim = 2 or 3 or 4: got shape {depth.shape}")
if intrinsics_batch.dim() != 3:
raise ValueError(f"Expected intrinsics to have shape (3, 3) or (N, 3, 3): got shape {intrinsics.shape}")

# get image height and width
im_height, im_width = depth_batch.shape[1:]
# create image points in homogeneous coordinates (3, H x W)
indices_u = torch.arange(im_width, device=depth.device, dtype=depth.dtype)
indices_v = torch.arange(im_height, device=depth.device, dtype=depth.dtype)
img_indices = torch.stack(torch.meshgrid([indices_u, indices_v], indexing="ij"), dim=0).reshape(2, -1)
pixels = torch.nn.functional.pad(img_indices, (0, 0, 0, 1), mode="constant", value=1.0)
pixels = pixels.unsqueeze(0) # (3, H x W) -> (1, 3, H x W)

# unproject points into 3D space
points = torch.matmul(torch.inverse(intrinsics_batch), pixels) # (N, 3, H x W)
points = points / points[:, -1, :].unsqueeze(1) # normalize by last coordinate
# flatten depth image (N, H, W) -> (N, H x W)
depth_batch = depth_batch.transpose_(1, 2).reshape(depth_batch.shape[0], -1).unsqueeze(2)
depth_batch = depth_batch.expand(-1, -1, 3)
# scale points by depth
points_xyz = points.transpose_(1, 2) * depth_batch # (N, H x W, 3)

# return points in same shape as input
if not is_batched:
points_xyz = points_xyz.squeeze(0)

return points_xyz
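
A usage sketch for the updated unproject_depth, again with illustrative values:

import torch
import omni.isaac.lab.utils.math as math_utils

intrinsics = torch.tensor([[320.0, 0.0, 160.0],
                           [0.0, 320.0, 120.0],
                           [0.0, 0.0, 1.0]])
depth = torch.ones(240, 320)  # orthogonal depth image, 1 m everywhere

# Orthogonal depth can be un-projected directly.
points = math_utils.unproject_depth(depth, intrinsics)
print(points.shape)  # torch.Size([76800, 3]), i.e. (H * W, 3)

# Perspective depth (e.g. "distance_to_camera" outputs) is orthogonalized first.
points_persp = math_utils.unproject_depth(depth, intrinsics, is_ortho=False)
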


@torch.jit.script
@@ -1191,8 +1193,10 @@ def project_points(points: torch.Tensor, intrinsics: torch.Tensor) -> torch.Tens
Returns:
Projected 3D coordinates of points. Shape is (P, 3) or (N, P, 3).
"""
# clone the inputs to avoid in-place operations modifying the original data
points_batch = points.clone()
intrinsics_batch = intrinsics.clone()

# check if inputs are batched
is_batched = points_batch.dim() == 2
# make sure inputs are batched
@@ -1205,12 +1209,14 @@ def project_points(points: torch.Tensor, intrinsics: torch.Tensor) -> torch.Tens
raise ValueError(f"Expected points to have dim = 3: got shape {points.shape}.")
if intrinsics_batch.dim() != 3:
raise ValueError(f"Expected intrinsics to have shape (3, 3) or (N, 3, 3): got shape {intrinsics.shape}.")

# project points into 2D image plane
points_2d = torch.matmul(intrinsics_batch, points_batch.transpose(1, 2))
points_2d = points_2d / points_2d[:, -1, :].unsqueeze(1) # normalize by last coordinate
points_2d = points_2d.transpose_(1, 2) # (N, 3, P) -> (N, P, 3)
# replace last coordinate with depth
points_2d[:, :, -1] = points_batch[:, :, -1]

# return points in same shape as input
if not is_batched:
points_2d = points_2d.squeeze(0) # (1, P, 3) -> (P, 3)
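
Finally, a round-trip sketch combining the two functions (same illustrative values as above):

import torch
import omni.isaac.lab.utils.math as math_utils

intrinsics = torch.tensor([[320.0, 0.0, 160.0],
                           [0.0, 320.0, 120.0],
                           [0.0, 0.0, 1.0]])
depth = torch.ones(240, 320)

points_3d = math_utils.unproject_depth(depth, intrinsics)      # (H * W, 3)
points_2d = math_utils.project_points(points_3d, intrinsics)   # (H * W, 3) as (u, v, d)
# project_points keeps the depth of each point in the last column.
print(torch.allclose(points_2d[:, -1], points_3d[:, -1]))  # True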