From 9e2e0aeddef0cd3e22055f7d4ceb0b1506e24369 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 31 May 2025 01:07:24 +0000
Subject: [PATCH] bump dlpack header to 1.1

---
 cuda_core/cuda/core/experimental/dlpack.h | 42 ++++++++++++++++++++---
 1 file changed, 38 insertions(+), 4 deletions(-)
diff --git a/cuda_core/cuda/core/experimental/dlpack.h b/cuda_core/cuda/core/experimental/dlpack.h
index bcb77949a..0b41961b4 100644
--- a/cuda_core/cuda/core/experimental/dlpack.h
+++ b/cuda_core/cuda/core/experimental/dlpack.h
@@ -19,7 +19,7 @@
 #define DLPACK_MAJOR_VERSION 1
 
 /*! \brief The current minor version of dlpack */
-#define DLPACK_MINOR_VERSION 0
+#define DLPACK_MINOR_VERSION 1
 
 /*! \brief DLPACK_DLL prefix for windows */
 #ifdef _WIN32
@@ -157,6 +157,26 @@ typedef enum {
   kDLComplex = 5U,
   /*! \brief boolean */
   kDLBool = 6U,
+  /*! \brief FP8 data types */
+  kDLFloat8_e3m4 = 7U,
+  kDLFloat8_e4m3 = 8U,
+  kDLFloat8_e4m3b11fnuz = 9U,
+  kDLFloat8_e4m3fn = 10U,
+  kDLFloat8_e4m3fnuz = 11U,
+  kDLFloat8_e5m2 = 12U,
+  kDLFloat8_e5m2fnuz = 13U,
+  kDLFloat8_e8m0fnu = 14U,
+  /*! \brief FP6 data types
+   * Setting bits != 6 is currently unspecified, and the producer must ensure it is set
+   * while the consumer must stop importing if the value is unexpected.
+   */
+  kDLFloat6_e2m3fn = 15U,
+  kDLFloat6_e3m2fn = 16U,
+  /*! \brief FP4 data types
+   * Setting bits != 4 is currently unspecified, and the producer must ensure it is set
+   * while the consumer must stop importing if the value is unexpected.
+   */
+  kDLFloat4_e2m1fn = 17U,
 } DLDataTypeCode;
 
 /*!
@@ -170,6 +190,12 @@ typedef enum {
  *   - int8: type_code = 0, bits = 8, lanes = 1
  *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
  *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
+ *   - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
+ *   - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
+ *   - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
+ *
+ *  When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e.,
+ *  for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element.
  */
 typedef struct {
   /*!
@@ -196,8 +222,8 @@ typedef struct {
    * types. This pointer is always aligned to 256 bytes as in CUDA. The
    * `byte_offset` field should be used to point to the beginning of the data.
    *
-   * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
-   * TVM, perhaps others) do not adhere to this 256 byte aligment requirement
+   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
+   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
    * on CPU/CUDA/ROCm, and always use `byte_offset=0`.  This must be fixed
    * (after which this note will be updated); at the moment it is recommended
    * to not rely on the data pointer being correctly aligned.
@@ -267,7 +293,7 @@ typedef struct DLManagedTensor {
   void (*deleter)(struct DLManagedTensor * self);
 } DLManagedTensor;
 
-// bit masks used in in the DLManagedTensorVersioned
+// bit masks used in the DLManagedTensorVersioned
 
 /*! \brief bit mask to indicate that the tensor is read only. */
 #define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
@@ -280,6 +306,14 @@ typedef struct DLManagedTensor {
  */
 #define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)
 
+/*!
+ * \brief bit mask to indicate that whether a sub-byte type is packed or padded.
+ *
+ * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
+ * be set by the producer to signal that a tensor of sub-byte type is padded.
+ */
+#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
+
 /*!
  * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
  *