From cd547630d92a78b426f5ecfd2a38835829366e21 Mon Sep 17 00:00:00 2001 From: Yaoyao Ding Date: Sat, 6 Sep 2025 12:57:21 -0400 Subject: [PATCH 1/3] update --- ffi/examples/inline_module/main.py | 13 +-- ffi/python/tvm_ffi/cpp/load_inline.py | 120 ++++++++++++---------- ffi/tests/python/test_load_inline.py | 140 ++++++++++++++++++++++---- 3 files changed, 196 insertions(+), 77 deletions(-) diff --git a/ffi/examples/inline_module/main.py b/ffi/examples/inline_module/main.py index 574d55c67824..7a0700b55850 100644 --- a/ffi/examples/inline_module/main.py +++ b/ffi/examples/inline_module/main.py @@ -23,8 +23,8 @@ def main(): mod: Module = tvm_ffi.cpp.load_inline( name="hello", - cpp_source=r""" - void AddOne(DLTensor* x, DLTensor* y) { + cpp_sources=r""" + void add_one_cpu(DLTensor* x, DLTensor* y) { // implementation of a library function TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor"; DLDataType f32_dtype{kDLFloat, 32, 1}; @@ -36,8 +36,10 @@ def main(): static_cast(y->data)[i] = static_cast(x->data)[i] + 1; } } + + void add_one_cuda(DLTensor* x, DLTensor* y); """, - cuda_source=r""" + cuda_sources=r""" __global__ void AddOneKernel(float* x, float* y, int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { @@ -45,7 +47,7 @@ def main(): } } - void AddOneCUDA(DLTensor* x, DLTensor* y) { + void add_one_cuda(DLTensor* x, DLTensor* y) { // implementation of a library function TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor"; DLDataType f32_dtype{kDLFloat, 32, 1}; @@ -67,8 +69,7 @@ def main(): static_cast(y->data), n); } """, - cpp_functions={"add_one_cpu": "AddOne"}, - cuda_functions={"add_one_cuda": "AddOneCUDA"}, + functions=["add_one_cpu", "add_one_cuda"], ) x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32) diff --git a/ffi/python/tvm_ffi/cpp/load_inline.py b/ffi/python/tvm_ffi/cpp/load_inline.py index a9ec1c39977d..f655250aa440 100644 --- a/ffi/python/tvm_ffi/cpp/load_inline.py +++ b/ffi/python/tvm_ffi/cpp/load_inline.py @@ -34,8 +34,7 @@ def _hash_sources( cpp_source: str, cuda_source: str, - cpp_functions: Mapping[str, str], - cuda_functions: Mapping[str, str], + functions: Sequence[str] | Mapping[str, str], extra_cflags: Sequence[str], extra_cuda_cflags: Sequence[str], extra_ldflags: Sequence[str], @@ -45,12 +44,13 @@ def _hash_sources( m = hashlib.sha256() m.update(cpp_source.encode("utf-8")) m.update(cuda_source.encode("utf-8")) - for name, doc in sorted(cpp_functions.items()): - m.update(name.encode("utf-8")) - m.update(doc.encode("utf-8")) - for name, doc in sorted(cuda_functions.items()): - m.update(name.encode("utf-8")) - m.update(doc.encode("utf-8")) + if isinstance(functions, Mapping): + for name in sorted(functions): + m.update(name.encode("utf-8")) + m.update(functions[name].encode("utf-8")) + else: + for name in sorted(functions): + m.update(name.encode("utf-8")) for flag in extra_cflags: m.update(flag.encode("utf-8")) for flag in extra_cuda_cflags: @@ -242,8 +242,10 @@ def _decorate_with_tvm_ffi(source: str, functions: Mapping[str, str]) -> str: source, ] - for exported_name, func_name_in_source in functions.items(): - sources.append(f"TVM_FFI_DLL_EXPORT_TYPED_FUNC({exported_name}, {func_name_in_source});") + for func_name, func_doc in functions.items(): + sources.append(f"TVM_FFI_DLL_EXPORT_TYPED_FUNC({func_name}, {func_name});") + _ = func_doc # todo: add support to embed function docstring to the tvm ffi functions. + sources.append("") return "\n".join(sources) @@ -252,14 +254,14 @@ def _decorate_with_tvm_ffi(source: str, functions: Mapping[str, str]) -> str: def load_inline( name: str, *, - cpp_source: str | None = None, - cuda_source: str | None = None, - cpp_functions: Mapping[str, str] | None = None, - cuda_functions: Mapping[str, str] | None = None, + cpp_sources: str | None = None, + cuda_sources: str | None = None, + functions: Sequence[str] | None = None, extra_cflags: Sequence[str] | None = None, extra_cuda_cflags: Sequence[str] | None = None, extra_ldflags: Sequence[str] | None = None, extra_include_paths: Sequence[str] | None = None, + build_directory: Optional[str] = None, ) -> Module: """Compile and load a C++/CUDA tvm ffi module from inline source code. @@ -281,22 +283,24 @@ def load_inline( any header from tvm ffi and dlpack in your source code. You can also provide additional include paths via the `extra_include_paths` parameter and include custom headers in your source code. - The compiled shared library is cached in a cache directory to avoid recompilation. The cache directory can be - specified via the `TVM_FFI_CACHE_DIR` environment variable. If not specified, the default cache directory is - `~/.cache/tvm-ffi`. + The compiled shared library is cached in a cache directory to avoid recompilation. The `build_directory` parameter + is provided to specify the build directory. If not specified, a default tvm ffi cache directory will be used. + The default cache directory can be specified via the `TVM_FFI_CACHE_DIR` environment variable. If not specified, + the default cache directory is `~/.cache/tvm-ffi`. Parameters ---------- name: str The name of the tvm ffi module. - cpp_source: str, optional - The C++ source code. - cuda_source: str, optional - The CUDA source code. - cpp_functions: Mapping[str, str], optional - The mapping from the exported function name to the function name in the C++ source code. - cuda_functions: Mapping[str, str], optional - The mapping from the exported function name to the function name in the CUDA source code. + cpp_sources: Sequence[str] | str, optional + The C++ source code. It can be a list of sources or a single source. + cuda_sources: Sequence[str] | str, optional + The CUDA source code. It can be a list of sources or a single source. + functions: Mapping[str, str] | Sequence[str] | str, optional + The functions in cpp_sources that will be exported to the tvm ffi module. When a mapping is given, the keys + are the names of the exported functions, and the values are docstrings for the functions. When a sequence or a + single string is given, they are the functions needed to be exported, and the docstrings are set to empty + strings. A single function name can also be given as a string. extra_cflags: Sequence[str], optional The extra compiler flags for C++ compilation. The default flags are: @@ -316,46 +320,58 @@ def load_inline( The extra include paths. The default include paths are: - The include path of tvm ffi + build_directory: str, optional + The build directory. If not specified, a default tvm ffi cache directory will be used. By default, the + cache directory is `~/.cache/tvm-ffi`. You can also set the `TVM_FFI_CACHE_DIR` environment variable to + specify the cache directory. + Returns ------- mod: Module The loaded tvm ffi module. """ - if cpp_source is None: - cpp_source = "" - if cuda_source is None: - cuda_source = "" - if cpp_functions is None: - cpp_functions = {} - if cuda_functions is None: - cuda_functions = {} + if cpp_sources is None: + cpp_sources = [] + elif isinstance(cpp_sources, str): + cpp_sources = [cpp_sources] + cpp_source = "\n".join(cpp_sources) + if cuda_sources is None: + cuda_sources = [] + elif isinstance(cuda_sources, str): + cuda_sources = [cuda_sources] + cuda_source = "\n".join(cuda_sources) + with_cuda = len(cuda_sources) > 0 + extra_ldflags = extra_ldflags or [] extra_cflags = extra_cflags or [] extra_cuda_cflags = extra_cuda_cflags or [] extra_include_paths = extra_include_paths or [] - # whether we have cuda source in this module - with_cuda = len(cuda_source.strip()) > 0 - # add function registration code to sources - cpp_source = _decorate_with_tvm_ffi(cpp_source, cpp_functions) - cuda_source = _decorate_with_tvm_ffi(cuda_source, cuda_functions) + if isinstance(functions, str): + functions = {functions: ""} + elif isinstance(functions, Sequence): + functions = {name: "" for name in functions} + cpp_source = _decorate_with_tvm_ffi(cpp_source, functions) + cuda_source = _decorate_with_tvm_ffi(cuda_source, {}) # determine the cache dir for the built module - cache_dir = os.path.join( - os.environ.get("TVM_FFI_CACHE_DIR", os.path.expanduser("~/.cache/tvm-ffi")) - ) - source_hash: str = _hash_sources( - cpp_source, - cuda_source, - cpp_functions, - cuda_functions, - extra_cflags, - extra_cuda_cflags, - extra_ldflags, - extra_include_paths, - ) - build_dir: str = os.path.join(cache_dir, "{}_{}".format(name, source_hash)) + if build_directory is None: + build_directory = os.environ.get( + "TVM_FFI_CACHE_DIR", os.path.expanduser("~/.cache/tvm-ffi") + ) + source_hash: str = _hash_sources( + cpp_source, + cuda_source, + functions, + extra_cflags, + extra_cuda_cflags, + extra_ldflags, + extra_include_paths, + ) + build_dir: str = os.path.join(build_directory, "{}_{}".format(name, source_hash)) + else: + build_dir = os.path.abspath(build_directory) os.makedirs(build_dir, exist_ok=True) # generate build.ninja diff --git a/ffi/tests/python/test_load_inline.py b/ffi/tests/python/test_load_inline.py index bb14ae9792c2..f51eacd5c026 100644 --- a/ffi/tests/python/test_load_inline.py +++ b/ffi/tests/python/test_load_inline.py @@ -30,8 +30,8 @@ def test_load_inline_cpp(): mod: Module = tvm_ffi.cpp.load_inline( name="hello", - cpp_source=r""" - void AddOne(DLTensor* x, DLTensor* y) { + cpp_sources=r""" + void add_one_cpu(DLTensor* x, DLTensor* y) { // implementation of a library function TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor"; DLDataType f32_dtype{kDLFloat, 32, 1}; @@ -44,7 +44,7 @@ def test_load_inline_cpp(): } } """, - cpp_functions={"add_one_cpu": "AddOne"}, + functions=["add_one_cpu"], ) x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32) @@ -53,11 +53,111 @@ def test_load_inline_cpp(): numpy.testing.assert_equal(x + 1, y) -@pytest.mark.skip(reason="Requires CUDA") +def test_load_inline_cpp_with_docstrings(): + mod: Module = tvm_ffi.cpp.load_inline( + name="hello", + cpp_sources=r""" + void add_one_cpu(DLTensor* x, DLTensor* y) { + // implementation of a library function + TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor"; + DLDataType f32_dtype{kDLFloat, 32, 1}; + TVM_FFI_ICHECK(x->dtype == f32_dtype) << "x must be a float tensor"; + TVM_FFI_ICHECK(y->ndim == 1) << "y must be a 1D tensor"; + TVM_FFI_ICHECK(y->dtype == f32_dtype) << "y must be a float tensor"; + TVM_FFI_ICHECK(x->shape[0] == y->shape[0]) << "x and y must have the same shape"; + for (int i = 0; i < x->shape[0]; ++i) { + static_cast(y->data)[i] = static_cast(x->data)[i] + 1; + } + } + """, + functions={"add_one_cpu": "add two float32 1D tensors element-wise"}, + ) + + x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32) + y = numpy.empty_like(x) + mod.add_one_cpu(x, y) + numpy.testing.assert_equal(x + 1, y) + + +def test_load_inline_cpp_multiple_sources(): + mod: Module = tvm_ffi.cpp.load_inline( + name="hello", + cpp_sources=[ + r""" + void add_one_cpu(DLTensor* x, DLTensor* y) { + // implementation of a library function + TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor"; + DLDataType f32_dtype{kDLFloat, 32, 1}; + TVM_FFI_ICHECK(x->dtype == f32_dtype) << "x must be a float tensor"; + TVM_FFI_ICHECK(y->ndim == 1) << "y must be a 1D tensor"; + TVM_FFI_ICHECK(y->dtype == f32_dtype) << "y must be a float tensor"; + TVM_FFI_ICHECK(x->shape[0] == y->shape[0]) << "x and y must have the same shape"; + for (int i = 0; i < x->shape[0]; ++i) { + static_cast(y->data)[i] = static_cast(x->data)[i] + 1; + } + } + """, + r""" + void add_two_cpu(DLTensor* x, DLTensor* y) { + // implementation of a library function + TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor"; + DLDataType f32_dtype{kDLFloat, 32, 1}; + TVM_FFI_ICHECK(x->dtype == f32_dtype) << "x must be a float tensor"; + TVM_FFI_ICHECK(y->ndim == 1) << "y must be a 1D tensor"; + TVM_FFI_ICHECK(y->dtype == f32_dtype) << "y must be a float tensor"; + TVM_FFI_ICHECK(x->shape[0] == y->shape[0]) << "x and y must have the same shape"; + for (int i = 0; i < x->shape[0]; ++i) { + static_cast(y->data)[i] = static_cast(x->data)[i] + 2; + } + } + """, + ], + functions=["add_one_cpu", "add_two_cpu"], + ) + + x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32) + y = numpy.empty_like(x) + mod.add_one_cpu(x, y) + numpy.testing.assert_equal(x + 1, y) + + +def test_load_inline_cpp_build_dir(): + mod: Module = tvm_ffi.cpp.load_inline( + name="hello", + cpp_sources=r""" + void add_one_cpu(DLTensor* x, DLTensor* y) { + // implementation of a library function + TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor"; + DLDataType f32_dtype{kDLFloat, 32, 1}; + TVM_FFI_ICHECK(x->dtype == f32_dtype) << "x must be a float tensor"; + TVM_FFI_ICHECK(y->ndim == 1) << "y must be a 1D tensor"; + TVM_FFI_ICHECK(y->dtype == f32_dtype) << "y must be a float tensor"; + TVM_FFI_ICHECK(x->shape[0] == y->shape[0]) << "x and y must have the same shape"; + for (int i = 0; i < x->shape[0]; ++i) { + static_cast(y->data)[i] = static_cast(x->data)[i] + 1; + } + } + """, + functions=["add_one_cpu"], + build_directory="./build_add_one", + ) + + x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32) + y = numpy.empty_like(x) + mod.add_one_cpu(x, y) + numpy.testing.assert_equal(x + 1, y) + + +@pytest.mark.skipif( + torch is None or not torch.cuda.is_available(), reason="Requires torch and CUDA" +) def test_load_inline_cuda(): mod: Module = tvm_ffi.cpp.load_inline( name="hello", - cuda_source=r""" + cpp_sources=r""" + void add_one_cuda(DLTensor* x, DLTensor* y); + """, + cuda_sources=r""" __global__ void AddOneKernel(float* x, float* y, int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { @@ -65,7 +165,7 @@ def test_load_inline_cuda(): } } - void AddOneCUDA(DLTensor* x, DLTensor* y) { + void add_one_cuda(DLTensor* x, DLTensor* y) { // implementation of a library function TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor"; DLDataType f32_dtype{kDLFloat, 32, 1}; @@ -87,7 +187,7 @@ def test_load_inline_cuda(): static_cast(y->data), n); } """, - cuda_functions={"add_one_cuda": "AddOneCUDA"}, + functions=["add_one_cuda"], ) if torch is not None: @@ -97,12 +197,14 @@ def test_load_inline_cuda(): torch.testing.assert_close(x_cuda + 1, y_cuda) -@pytest.mark.skip(reason="Requires CUDA") +@pytest.mark.skipif( + torch is None or not torch.cuda.is_available(), reason="Requires torch and CUDA" +) def test_load_inline_both(): mod: Module = tvm_ffi.cpp.load_inline( name="hello", - cpp_source=r""" - void AddOne(DLTensor* x, DLTensor* y) { + cpp_sources=r""" + void add_one_cpu(DLTensor* x, DLTensor* y) { // implementation of a library function TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor"; DLDataType f32_dtype{kDLFloat, 32, 1}; @@ -114,8 +216,10 @@ def test_load_inline_both(): static_cast(y->data)[i] = static_cast(x->data)[i] + 1; } } + + void add_one_cuda(DLTensor* x, DLTensor* y); """, - cuda_source=r""" + cuda_sources=r""" __global__ void AddOneKernel(float* x, float* y, int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { @@ -123,7 +227,7 @@ def test_load_inline_both(): } } - void AddOneCUDA(DLTensor* x, DLTensor* y) { + void add_one_cuda(DLTensor* x, DLTensor* y) { // implementation of a library function TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor"; DLDataType f32_dtype{kDLFloat, 32, 1}; @@ -145,8 +249,7 @@ def test_load_inline_both(): static_cast(y->data), n); } """, - cpp_functions={"add_one_cpu": "AddOne"}, - cuda_functions={"add_one_cuda": "AddOneCUDA"}, + functions=["add_one_cpu", "add_one_cuda"], ) x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32) @@ -154,8 +257,7 @@ def test_load_inline_both(): mod.add_one_cpu(x, y) numpy.testing.assert_equal(x + 1, y) - if torch is not None: - x_cuda = torch.asarray([1, 2, 3, 4, 5], dtype=torch.float32, device="cuda") - y_cuda = torch.empty_like(x_cuda) - mod.add_one_cuda(x_cuda, y_cuda) - torch.testing.assert_close(x_cuda + 1, y_cuda) + x_cuda = torch.asarray([1, 2, 3, 4, 5], dtype=torch.float32, device="cuda") + y_cuda = torch.empty_like(x_cuda) + mod.add_one_cuda(x_cuda, y_cuda) + torch.testing.assert_close(x_cuda + 1, y_cuda) From 40ff65d9797590e6044fe7784d345169546e1c02 Mon Sep 17 00:00:00 2001 From: Yaoyao Ding Date: Sat, 6 Sep 2025 13:03:29 -0400 Subject: [PATCH 2/3] update docs --- ffi/python/tvm_ffi/cpp/load_inline.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ffi/python/tvm_ffi/cpp/load_inline.py b/ffi/python/tvm_ffi/cpp/load_inline.py index f655250aa440..61b3a74fce2c 100644 --- a/ffi/python/tvm_ffi/cpp/load_inline.py +++ b/ffi/python/tvm_ffi/cpp/load_inline.py @@ -265,15 +265,15 @@ def load_inline( ) -> Module: """Compile and load a C++/CUDA tvm ffi module from inline source code. - This function compiles the given C++ and/or CUDA source code into a shared library. Both cpp_source and cuda_source - are compiled to an object file, and then linked together into a shared library. It's possible to only provide - cpp_source or cuda_source. - - The `cpp_functions` and `cuda_functions` parameters are used to specify which functions in the source code - should be exported to the tvm ffi module. The keys of the mapping are the names of the exported functions, and the - values are the names of the functions in the source code. The exported name and the function name in the source code - must be different. The exported name must be a valid C identifier while the function name in the source code can - contain namespace qualifiers. + This function compiles the given C++ and/or CUDA source code into a shared library. Both cpp_sources and + cuda_sources are compiled to an object file, and then linked together into a shared library. It's possible to only + provide cpp_sources or cuda_sources. + + The `functions` parameter is used to specify which functions in the source code should be exported to the tvm ffi module. + It can be a mapping, a sequence, or a single string. When a mapping is given, the keys are the names of the exported + functions, and the values are docstrings for the functions. When a sequence or a single string is given, they are the + functions needed to be exported, and the docstrings are set to empty strings. A single function name can also be given + as a string, indicating that only one function is to be exported. Extra compiler and linker flags can be provided via the `extra_cflags`, `extra_cuda_cflags`, and `extra_ldflags` parameters. The default flags are generally sufficient for most use cases, but you may need to provide additional From 65381b06fd1ea85579e9679ac08f842045a71936 Mon Sep 17 00:00:00 2001 From: Yaoyao Ding Date: Sat, 6 Sep 2025 14:14:33 -0400 Subject: [PATCH 3/3] remove whitespace trailing --- ffi/examples/inline_module/main.py | 2 +- ffi/tests/python/test_load_inline.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ffi/examples/inline_module/main.py b/ffi/examples/inline_module/main.py index 7a0700b55850..b55574ae7bab 100644 --- a/ffi/examples/inline_module/main.py +++ b/ffi/examples/inline_module/main.py @@ -36,7 +36,7 @@ def main(): static_cast(y->data)[i] = static_cast(x->data)[i] + 1; } } - + void add_one_cuda(DLTensor* x, DLTensor* y); """, cuda_sources=r""" diff --git a/ffi/tests/python/test_load_inline.py b/ffi/tests/python/test_load_inline.py index f51eacd5c026..f809cede5927 100644 --- a/ffi/tests/python/test_load_inline.py +++ b/ffi/tests/python/test_load_inline.py @@ -216,7 +216,7 @@ def test_load_inline_both(): static_cast(y->data)[i] = static_cast(x->data)[i] + 1; } } - + void add_one_cuda(DLTensor* x, DLTensor* y); """, cuda_sources=r"""