From cd547630d92a78b426f5ecfd2a38835829366e21 Mon Sep 17 00:00:00 2001
From: Yaoyao Ding <dingyaoyao.cs@gmail.com>
Date: Sat, 6 Sep 2025 12:57:21 -0400
Subject: [PATCH 1/3] update

---
 ffi/examples/inline_module/main.py    |  13 +--
 ffi/python/tvm_ffi/cpp/load_inline.py | 120 ++++++++++++----------
 ffi/tests/python/test_load_inline.py  | 140 ++++++++++++++++++++++----
 3 files changed, 196 insertions(+), 77 deletions(-)

diff --git a/ffi/examples/inline_module/main.py b/ffi/examples/inline_module/main.py
index 574d55c67824..7a0700b55850 100644
--- a/ffi/examples/inline_module/main.py
+++ b/ffi/examples/inline_module/main.py
@@ -23,8 +23,8 @@
 def main():
     mod: Module = tvm_ffi.cpp.load_inline(
         name="hello",
-        cpp_source=r"""
-            void AddOne(DLTensor* x, DLTensor* y) {
+        cpp_sources=r"""
+            void add_one_cpu(DLTensor* x, DLTensor* y) {
               // implementation of a library function
               TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
               DLDataType f32_dtype{kDLFloat, 32, 1};
@@ -36,8 +36,10 @@ def main():
                 static_cast<float*>(y->data)[i] = static_cast<float*>(x->data)[i] + 1;
               }
             }
+            
+            void add_one_cuda(DLTensor* x, DLTensor* y);
         """,
-        cuda_source=r"""
+        cuda_sources=r"""
             __global__ void AddOneKernel(float* x, float* y, int n) {
               int idx = blockIdx.x * blockDim.x + threadIdx.x;
               if (idx < n) {
@@ -45,7 +47,7 @@ def main():
               }
             }
 
-            void AddOneCUDA(DLTensor* x, DLTensor* y) {
+            void add_one_cuda(DLTensor* x, DLTensor* y) {
               // implementation of a library function
               TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
               DLDataType f32_dtype{kDLFloat, 32, 1};
@@ -67,8 +69,7 @@ def main():
                                                                      static_cast<float*>(y->data), n);
             }
         """,
-        cpp_functions={"add_one_cpu": "AddOne"},
-        cuda_functions={"add_one_cuda": "AddOneCUDA"},
+        functions=["add_one_cpu", "add_one_cuda"],
     )
 
     x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32)
diff --git a/ffi/python/tvm_ffi/cpp/load_inline.py b/ffi/python/tvm_ffi/cpp/load_inline.py
index a9ec1c39977d..f655250aa440 100644
--- a/ffi/python/tvm_ffi/cpp/load_inline.py
+++ b/ffi/python/tvm_ffi/cpp/load_inline.py
@@ -34,8 +34,7 @@
 def _hash_sources(
     cpp_source: str,
     cuda_source: str,
-    cpp_functions: Mapping[str, str],
-    cuda_functions: Mapping[str, str],
+    functions: Sequence[str] | Mapping[str, str],
     extra_cflags: Sequence[str],
     extra_cuda_cflags: Sequence[str],
     extra_ldflags: Sequence[str],
@@ -45,12 +44,13 @@ def _hash_sources(
     m = hashlib.sha256()
     m.update(cpp_source.encode("utf-8"))
     m.update(cuda_source.encode("utf-8"))
-    for name, doc in sorted(cpp_functions.items()):
-        m.update(name.encode("utf-8"))
-        m.update(doc.encode("utf-8"))
-    for name, doc in sorted(cuda_functions.items()):
-        m.update(name.encode("utf-8"))
-        m.update(doc.encode("utf-8"))
+    if isinstance(functions, Mapping):
+        for name in sorted(functions):
+            m.update(name.encode("utf-8"))
+            m.update(functions[name].encode("utf-8"))
+    else:
+        for name in sorted(functions):
+            m.update(name.encode("utf-8"))
     for flag in extra_cflags:
         m.update(flag.encode("utf-8"))
     for flag in extra_cuda_cflags:
@@ -242,8 +242,10 @@ def _decorate_with_tvm_ffi(source: str, functions: Mapping[str, str]) -> str:
         source,
     ]
 
-    for exported_name, func_name_in_source in functions.items():
-        sources.append(f"TVM_FFI_DLL_EXPORT_TYPED_FUNC({exported_name}, {func_name_in_source});")
+    for func_name, func_doc in functions.items():
+        sources.append(f"TVM_FFI_DLL_EXPORT_TYPED_FUNC({func_name}, {func_name});")
+        _ = func_doc  # todo: add support to embed function docstring to the tvm ffi functions.
+
     sources.append("")
 
     return "\n".join(sources)
@@ -252,14 +254,14 @@ def _decorate_with_tvm_ffi(source: str, functions: Mapping[str, str]) -> str:
 def load_inline(
     name: str,
     *,
-    cpp_source: str | None = None,
-    cuda_source: str | None = None,
-    cpp_functions: Mapping[str, str] | None = None,
-    cuda_functions: Mapping[str, str] | None = None,
+    cpp_sources: str | None = None,
+    cuda_sources: str | None = None,
+    functions: Sequence[str] | None = None,
     extra_cflags: Sequence[str] | None = None,
     extra_cuda_cflags: Sequence[str] | None = None,
     extra_ldflags: Sequence[str] | None = None,
     extra_include_paths: Sequence[str] | None = None,
+    build_directory: Optional[str] = None,
 ) -> Module:
     """Compile and load a C++/CUDA tvm ffi module from inline source code.
 
@@ -281,22 +283,24 @@ def load_inline(
     any header from tvm ffi and dlpack in your source code. You can also provide additional include paths via the
     `extra_include_paths` parameter and include custom headers in your source code.
 
-    The compiled shared library is cached in a cache directory to avoid recompilation. The cache directory can be
-    specified via the `TVM_FFI_CACHE_DIR` environment variable. If not specified, the default cache directory is
-    `~/.cache/tvm-ffi`.
+    The compiled shared library is cached in a cache directory to avoid recompilation. The `build_directory` parameter
+    is provided to specify the build directory. If not specified, a default tvm ffi cache directory will be used.
+    The default cache directory can be specified via the `TVM_FFI_CACHE_DIR` environment variable. If not specified,
+    the default cache directory is `~/.cache/tvm-ffi`.
 
     Parameters
     ----------
     name: str
         The name of the tvm ffi module.
-    cpp_source: str, optional
-        The C++ source code.
-    cuda_source: str, optional
-        The CUDA source code.
-    cpp_functions: Mapping[str, str], optional
-        The mapping from the exported function name to the function name in the C++ source code.
-    cuda_functions: Mapping[str, str], optional
-        The mapping from the exported function name to the function name in the CUDA source code.
+    cpp_sources: Sequence[str] | str, optional
+        The C++ source code. It can be a list of sources or a single source.
+    cuda_sources: Sequence[str] | str, optional
+        The CUDA source code. It can be a list of sources or a single source.
+    functions: Mapping[str, str] | Sequence[str] | str, optional
+        The functions in cpp_sources that will be exported to the tvm ffi module. When a mapping is given, the keys
+        are the names of the exported functions, and the values are docstrings for the functions. When a sequence or a
+        single string is given, they are the functions needed to be exported, and the docstrings are set to empty
+        strings. A single function name can also be given as a string.
     extra_cflags: Sequence[str], optional
         The extra compiler flags for C++ compilation.
         The default flags are:
@@ -316,46 +320,58 @@ def load_inline(
         The extra include paths.
         The default include paths are:
         - The include path of tvm ffi
+    build_directory: str, optional
+        The build directory. If not specified, a default tvm ffi cache directory will be used. By default, the
+        cache directory is `~/.cache/tvm-ffi`. You can also set the `TVM_FFI_CACHE_DIR` environment variable to
+        specify the cache directory.
+
     Returns
     -------
     mod: Module
         The loaded tvm ffi module.
     """
-    if cpp_source is None:
-        cpp_source = ""
-    if cuda_source is None:
-        cuda_source = ""
-    if cpp_functions is None:
-        cpp_functions = {}
-    if cuda_functions is None:
-        cuda_functions = {}
+    if cpp_sources is None:
+        cpp_sources = []
+    elif isinstance(cpp_sources, str):
+        cpp_sources = [cpp_sources]
+    cpp_source = "\n".join(cpp_sources)
+    if cuda_sources is None:
+        cuda_sources = []
+    elif isinstance(cuda_sources, str):
+        cuda_sources = [cuda_sources]
+    cuda_source = "\n".join(cuda_sources)
+    with_cuda = len(cuda_sources) > 0
+
     extra_ldflags = extra_ldflags or []
     extra_cflags = extra_cflags or []
     extra_cuda_cflags = extra_cuda_cflags or []
     extra_include_paths = extra_include_paths or []
 
-    # whether we have cuda source in this module
-    with_cuda = len(cuda_source.strip()) > 0
-
     # add function registration code to sources
-    cpp_source = _decorate_with_tvm_ffi(cpp_source, cpp_functions)
-    cuda_source = _decorate_with_tvm_ffi(cuda_source, cuda_functions)
+    if isinstance(functions, str):
+        functions = {functions: ""}
+    elif isinstance(functions, Sequence):
+        functions = {name: "" for name in functions}
+    cpp_source = _decorate_with_tvm_ffi(cpp_source, functions)
+    cuda_source = _decorate_with_tvm_ffi(cuda_source, {})
 
     # determine the cache dir for the built module
-    cache_dir = os.path.join(
-        os.environ.get("TVM_FFI_CACHE_DIR", os.path.expanduser("~/.cache/tvm-ffi"))
-    )
-    source_hash: str = _hash_sources(
-        cpp_source,
-        cuda_source,
-        cpp_functions,
-        cuda_functions,
-        extra_cflags,
-        extra_cuda_cflags,
-        extra_ldflags,
-        extra_include_paths,
-    )
-    build_dir: str = os.path.join(cache_dir, "{}_{}".format(name, source_hash))
+    if build_directory is None:
+        build_directory = os.environ.get(
+            "TVM_FFI_CACHE_DIR", os.path.expanduser("~/.cache/tvm-ffi")
+        )
+        source_hash: str = _hash_sources(
+            cpp_source,
+            cuda_source,
+            functions,
+            extra_cflags,
+            extra_cuda_cflags,
+            extra_ldflags,
+            extra_include_paths,
+        )
+        build_dir: str = os.path.join(build_directory, "{}_{}".format(name, source_hash))
+    else:
+        build_dir = os.path.abspath(build_directory)
     os.makedirs(build_dir, exist_ok=True)
 
     # generate build.ninja
diff --git a/ffi/tests/python/test_load_inline.py b/ffi/tests/python/test_load_inline.py
index bb14ae9792c2..f51eacd5c026 100644
--- a/ffi/tests/python/test_load_inline.py
+++ b/ffi/tests/python/test_load_inline.py
@@ -30,8 +30,8 @@
 def test_load_inline_cpp():
     mod: Module = tvm_ffi.cpp.load_inline(
         name="hello",
-        cpp_source=r"""
-            void AddOne(DLTensor* x, DLTensor* y) {
+        cpp_sources=r"""
+            void add_one_cpu(DLTensor* x, DLTensor* y) {
               // implementation of a library function
               TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
               DLDataType f32_dtype{kDLFloat, 32, 1};
@@ -44,7 +44,7 @@ def test_load_inline_cpp():
               }
             }
         """,
-        cpp_functions={"add_one_cpu": "AddOne"},
+        functions=["add_one_cpu"],
     )
 
     x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32)
@@ -53,11 +53,111 @@ def test_load_inline_cpp():
     numpy.testing.assert_equal(x + 1, y)
 
 
-@pytest.mark.skip(reason="Requires CUDA")
+def test_load_inline_cpp_with_docstrings():
+    mod: Module = tvm_ffi.cpp.load_inline(
+        name="hello",
+        cpp_sources=r"""
+            void add_one_cpu(DLTensor* x, DLTensor* y) {
+              // implementation of a library function
+              TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
+              DLDataType f32_dtype{kDLFloat, 32, 1};
+              TVM_FFI_ICHECK(x->dtype == f32_dtype) << "x must be a float tensor";
+              TVM_FFI_ICHECK(y->ndim == 1) << "y must be a 1D tensor";
+              TVM_FFI_ICHECK(y->dtype == f32_dtype) << "y must be a float tensor";
+              TVM_FFI_ICHECK(x->shape[0] == y->shape[0]) << "x and y must have the same shape";
+              for (int i = 0; i < x->shape[0]; ++i) {
+                static_cast<float*>(y->data)[i] = static_cast<float*>(x->data)[i] + 1;
+              }
+            }
+        """,
+        functions={"add_one_cpu": "add two float32 1D tensors element-wise"},
+    )
+
+    x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32)
+    y = numpy.empty_like(x)
+    mod.add_one_cpu(x, y)
+    numpy.testing.assert_equal(x + 1, y)
+
+
+def test_load_inline_cpp_multiple_sources():
+    mod: Module = tvm_ffi.cpp.load_inline(
+        name="hello",
+        cpp_sources=[
+            r"""
+            void add_one_cpu(DLTensor* x, DLTensor* y) {
+              // implementation of a library function
+              TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
+              DLDataType f32_dtype{kDLFloat, 32, 1};
+              TVM_FFI_ICHECK(x->dtype == f32_dtype) << "x must be a float tensor";
+              TVM_FFI_ICHECK(y->ndim == 1) << "y must be a 1D tensor";
+              TVM_FFI_ICHECK(y->dtype == f32_dtype) << "y must be a float tensor";
+              TVM_FFI_ICHECK(x->shape[0] == y->shape[0]) << "x and y must have the same shape";
+              for (int i = 0; i < x->shape[0]; ++i) {
+                static_cast<float*>(y->data)[i] = static_cast<float*>(x->data)[i] + 1;
+              }
+            }
+        """,
+            r"""
+            void add_two_cpu(DLTensor* x, DLTensor* y) {
+              // implementation of a library function
+              TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
+              DLDataType f32_dtype{kDLFloat, 32, 1};
+              TVM_FFI_ICHECK(x->dtype == f32_dtype) << "x must be a float tensor";
+              TVM_FFI_ICHECK(y->ndim == 1) << "y must be a 1D tensor";
+              TVM_FFI_ICHECK(y->dtype == f32_dtype) << "y must be a float tensor";
+              TVM_FFI_ICHECK(x->shape[0] == y->shape[0]) << "x and y must have the same shape";
+              for (int i = 0; i < x->shape[0]; ++i) {
+                static_cast<float*>(y->data)[i] = static_cast<float*>(x->data)[i] + 2;
+              }
+            }
+        """,
+        ],
+        functions=["add_one_cpu", "add_two_cpu"],
+    )
+
+    x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32)
+    y = numpy.empty_like(x)
+    mod.add_one_cpu(x, y)
+    numpy.testing.assert_equal(x + 1, y)
+
+
+def test_load_inline_cpp_build_dir():
+    mod: Module = tvm_ffi.cpp.load_inline(
+        name="hello",
+        cpp_sources=r"""
+            void add_one_cpu(DLTensor* x, DLTensor* y) {
+              // implementation of a library function
+              TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
+              DLDataType f32_dtype{kDLFloat, 32, 1};
+              TVM_FFI_ICHECK(x->dtype == f32_dtype) << "x must be a float tensor";
+              TVM_FFI_ICHECK(y->ndim == 1) << "y must be a 1D tensor";
+              TVM_FFI_ICHECK(y->dtype == f32_dtype) << "y must be a float tensor";
+              TVM_FFI_ICHECK(x->shape[0] == y->shape[0]) << "x and y must have the same shape";
+              for (int i = 0; i < x->shape[0]; ++i) {
+                static_cast<float*>(y->data)[i] = static_cast<float*>(x->data)[i] + 1;
+              }
+            }
+        """,
+        functions=["add_one_cpu"],
+        build_directory="./build_add_one",
+    )
+
+    x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32)
+    y = numpy.empty_like(x)
+    mod.add_one_cpu(x, y)
+    numpy.testing.assert_equal(x + 1, y)
+
+
+@pytest.mark.skipif(
+    torch is None or not torch.cuda.is_available(), reason="Requires torch and CUDA"
+)
 def test_load_inline_cuda():
     mod: Module = tvm_ffi.cpp.load_inline(
         name="hello",
-        cuda_source=r"""
+        cpp_sources=r"""
+            void add_one_cuda(DLTensor* x, DLTensor* y);
+        """,
+        cuda_sources=r"""
             __global__ void AddOneKernel(float* x, float* y, int n) {
               int idx = blockIdx.x * blockDim.x + threadIdx.x;
               if (idx < n) {
@@ -65,7 +165,7 @@ def test_load_inline_cuda():
               }
             }
 
-            void AddOneCUDA(DLTensor* x, DLTensor* y) {
+            void add_one_cuda(DLTensor* x, DLTensor* y) {
               // implementation of a library function
               TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
               DLDataType f32_dtype{kDLFloat, 32, 1};
@@ -87,7 +187,7 @@ def test_load_inline_cuda():
                                                                      static_cast<float*>(y->data), n);
             }
         """,
-        cuda_functions={"add_one_cuda": "AddOneCUDA"},
+        functions=["add_one_cuda"],
     )
 
     if torch is not None:
@@ -97,12 +197,14 @@ def test_load_inline_cuda():
         torch.testing.assert_close(x_cuda + 1, y_cuda)
 
 
-@pytest.mark.skip(reason="Requires CUDA")
+@pytest.mark.skipif(
+    torch is None or not torch.cuda.is_available(), reason="Requires torch and CUDA"
+)
 def test_load_inline_both():
     mod: Module = tvm_ffi.cpp.load_inline(
         name="hello",
-        cpp_source=r"""
-            void AddOne(DLTensor* x, DLTensor* y) {
+        cpp_sources=r"""
+            void add_one_cpu(DLTensor* x, DLTensor* y) {
               // implementation of a library function
               TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
               DLDataType f32_dtype{kDLFloat, 32, 1};
@@ -114,8 +216,10 @@ def test_load_inline_both():
                 static_cast<float*>(y->data)[i] = static_cast<float*>(x->data)[i] + 1;
               }
             }
+            
+            void add_one_cuda(DLTensor* x, DLTensor* y);
         """,
-        cuda_source=r"""
+        cuda_sources=r"""
             __global__ void AddOneKernel(float* x, float* y, int n) {
               int idx = blockIdx.x * blockDim.x + threadIdx.x;
               if (idx < n) {
@@ -123,7 +227,7 @@ def test_load_inline_both():
               }
             }
 
-            void AddOneCUDA(DLTensor* x, DLTensor* y) {
+            void add_one_cuda(DLTensor* x, DLTensor* y) {
               // implementation of a library function
               TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
               DLDataType f32_dtype{kDLFloat, 32, 1};
@@ -145,8 +249,7 @@ def test_load_inline_both():
                                                                      static_cast<float*>(y->data), n);
             }
         """,
-        cpp_functions={"add_one_cpu": "AddOne"},
-        cuda_functions={"add_one_cuda": "AddOneCUDA"},
+        functions=["add_one_cpu", "add_one_cuda"],
     )
 
     x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32)
@@ -154,8 +257,7 @@ def test_load_inline_both():
     mod.add_one_cpu(x, y)
     numpy.testing.assert_equal(x + 1, y)
 
-    if torch is not None:
-        x_cuda = torch.asarray([1, 2, 3, 4, 5], dtype=torch.float32, device="cuda")
-        y_cuda = torch.empty_like(x_cuda)
-        mod.add_one_cuda(x_cuda, y_cuda)
-        torch.testing.assert_close(x_cuda + 1, y_cuda)
+    x_cuda = torch.asarray([1, 2, 3, 4, 5], dtype=torch.float32, device="cuda")
+    y_cuda = torch.empty_like(x_cuda)
+    mod.add_one_cuda(x_cuda, y_cuda)
+    torch.testing.assert_close(x_cuda + 1, y_cuda)

From 40ff65d9797590e6044fe7784d345169546e1c02 Mon Sep 17 00:00:00 2001
From: Yaoyao Ding <dingyaoyao.cs@gmail.com>
Date: Sat, 6 Sep 2025 13:03:29 -0400
Subject: [PATCH 2/3] update docs

---
 ffi/python/tvm_ffi/cpp/load_inline.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/ffi/python/tvm_ffi/cpp/load_inline.py b/ffi/python/tvm_ffi/cpp/load_inline.py
index f655250aa440..61b3a74fce2c 100644
--- a/ffi/python/tvm_ffi/cpp/load_inline.py
+++ b/ffi/python/tvm_ffi/cpp/load_inline.py
@@ -265,15 +265,15 @@ def load_inline(
 ) -> Module:
     """Compile and load a C++/CUDA tvm ffi module from inline source code.
 
-    This function compiles the given C++ and/or CUDA source code into a shared library. Both cpp_source and cuda_source
-    are compiled to an object file, and then linked together into a shared library. It's possible to only provide
-    cpp_source or cuda_source.
-
-    The `cpp_functions` and `cuda_functions` parameters are used to specify which functions in the source code
-    should be exported to the tvm ffi module. The keys of the mapping are the names of the exported functions, and the
-    values are the names of the functions in the source code. The exported name and the function name in the source code
-    must be different. The exported name must be a valid C identifier while the function name in the source code can
-    contain namespace qualifiers.
+    This function compiles the given C++ and/or CUDA source code into a shared library. Both cpp_sources and
+    cuda_sources are compiled to an object file, and then linked together into a shared library. It's possible to only
+    provide cpp_sources or cuda_sources.
+
+    The `functions` parameter is used to specify which functions in the source code should be exported to the tvm ffi module.
+    It can be a mapping, a sequence, or a single string. When a mapping is given, the keys are the names of the exported
+    functions, and the values are docstrings for the functions. When a sequence or a single string is given, they are the
+    functions needed to be exported, and the docstrings are set to empty strings. A single function name can also be given
+    as a string, indicating that only one function is to be exported.
 
     Extra compiler and linker flags can be provided via the `extra_cflags`, `extra_cuda_cflags`, and `extra_ldflags`
     parameters. The default flags are generally sufficient for most use cases, but you may need to provide additional

From 65381b06fd1ea85579e9679ac08f842045a71936 Mon Sep 17 00:00:00 2001
From: Yaoyao Ding <dingyaoyao.cs@gmail.com>
Date: Sat, 6 Sep 2025 14:14:33 -0400
Subject: [PATCH 3/3] remove whitespace trailing

---
 ffi/examples/inline_module/main.py   | 2 +-
 ffi/tests/python/test_load_inline.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ffi/examples/inline_module/main.py b/ffi/examples/inline_module/main.py
index 7a0700b55850..b55574ae7bab 100644
--- a/ffi/examples/inline_module/main.py
+++ b/ffi/examples/inline_module/main.py
@@ -36,7 +36,7 @@ def main():
                 static_cast<float*>(y->data)[i] = static_cast<float*>(x->data)[i] + 1;
               }
             }
-            
+
             void add_one_cuda(DLTensor* x, DLTensor* y);
         """,
         cuda_sources=r"""
diff --git a/ffi/tests/python/test_load_inline.py b/ffi/tests/python/test_load_inline.py
index f51eacd5c026..f809cede5927 100644
--- a/ffi/tests/python/test_load_inline.py
+++ b/ffi/tests/python/test_load_inline.py
@@ -216,7 +216,7 @@ def test_load_inline_both():
                 static_cast<float*>(y->data)[i] = static_cast<float*>(x->data)[i] + 1;
               }
             }
-            
+
             void add_one_cuda(DLTensor* x, DLTensor* y);
         """,
         cuda_sources=r"""