switch the launch argument order #316

Merged (17 commits) on Mar 3, 2025
32 changes: 15 additions & 17 deletions cuda_core/cuda/core/experimental/_launcher.py
@@ -45,9 +45,6 @@ class LaunchConfig:
Group of threads (Thread Block) that will execute on the same
streaming multiprocessor (SM). Threads within a thread block have
access to shared memory and can be explicitly synchronized.
stream : :obj:`~_stream.Stream`
The stream establishing the stream ordering semantic of a
launch.
shmem_size : int, optional
Dynamic shared-memory size per thread block in bytes.
(Default to size 0)
@@ -58,7 +55,6 @@ class LaunchConfig:
grid: Union[tuple, int] = None
cluster: Union[tuple, int] = None
block: Union[tuple, int] = None
stream: Stream = None
shmem_size: Optional[int] = None

def __post_init__(self):
@@ -72,12 +68,6 @@ def __post_init__(self):
if Device().compute_capability < (9, 0):
raise CUDAError("thread block clusters are not supported on devices with compute capability < 9.0")
self.cluster = self._cast_to_3_tuple(self.cluster)
# we handle "stream=None" in the launch API
if self.stream is not None and not isinstance(self.stream, Stream):
try:
self.stream = Stream._init(self.stream)
except Exception as e:
raise ValueError("stream must either be a Stream object or support __cuda_stream__") from e
if self.shmem_size is None:
self.shmem_size = 0

@@ -105,27 +95,35 @@ def _cast_to_3_tuple(self, cfg):
raise ValueError


def launch(kernel, config, *kernel_args):
def launch(stream, config, kernel, *kernel_args):
"""Launches a :obj:`~_module.Kernel`
object with launch-time configuration.

Parameters
----------
kernel : :obj:`~_module.Kernel`
Kernel to launch.
stream : :obj:`~_stream.Stream`
The stream establishing the stream ordering semantic of a
launch.
config : :obj:`~_launcher.LaunchConfig`
Launch configuration, in line with the options provided by the
:obj:`~_launcher.LaunchConfig` dataclass.
kernel : :obj:`~_module.Kernel`
Kernel to launch.
*kernel_args : Any
Variable length argument list that is provided to the
launching kernel.

"""
if stream is None:
raise ValueError("stream cannot be None, stream must either be a Stream object or support __cuda_stream__")
if not isinstance(stream, Stream):
try:
stream = Stream._init(stream)
except Exception as e:
raise ValueError("stream must either be a Stream object or support __cuda_stream__") from e
if not isinstance(kernel, Kernel):
raise ValueError
config = check_or_create_options(LaunchConfig, config, "launch config")
if config.stream is None:
raise CUDAError("stream cannot be None")

# TODO: can we ensure kernel_args is valid/safe to use here?
# TODO: merge with HelperKernelParams?
@@ -141,7 +139,7 @@ def launch(kernel, config, *kernel_args):
drv_cfg = driver.CUlaunchConfig()
drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
drv_cfg.hStream = config.stream.handle
drv_cfg.hStream = stream.handle
drv_cfg.sharedMemBytes = config.shmem_size
attrs = [] # TODO: support more attributes
if config.cluster:
@@ -157,6 +155,6 @@
# TODO: check if config has any unsupported attrs
handle_return(
driver.cuLaunchKernel(
int(kernel._handle), *config.grid, *config.block, config.shmem_size, config.stream._handle, args_ptr, 0
int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream._handle, args_ptr, 0
)
)
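For reference, a minimal end-to-end sketch of the new call pattern after this change. The kernel source and names below are illustrative only; the API calls mirror the updated tests and examples in this PR.

    from cuda.core.experimental import Device, LaunchConfig, Program, launch

    dev = Device()
    dev.set_current()
    stream = dev.create_stream()

    # Compile a trivial kernel (same Program/compile/get_kernel flow as in the tests).
    prog = Program('extern "C" __global__ void noop() {}', "c++")
    ker = prog.compile("cubin").get_kernel("noop")

    # The stream is no longer part of LaunchConfig ...
    config = LaunchConfig(grid=1, block=1)

    # ... it is now the first positional argument of launch().
    launch(stream, config, ker)
    stream.sync()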
2 changes: 2 additions & 0 deletions cuda_core/docs/source/release/0.2.0-notes.rst
@@ -14,6 +14,8 @@ Highlights
Breaking Changes
----------------

- The ``stream`` attribute is removed from :class:`~LaunchConfig`. Instead, the :class:`Stream` object should now be passed directly to :func:`~launch` as an argument.
- The signature of :func:`~launch` has changed: the stream is now the first positional argument, and the new signature is ``(stream, config, kernel, *kernel_args)``.
- Change ``__cuda_stream__`` from attribute to method.
- The :meth:`Program.compile` method no longer accepts the ``options`` argument. Instead, you can optionally pass an instance of :class:`ProgramOptions` to the constructor of :class:`Program`.
- :meth:`Device.properties` now provides attribute getters instead of a dictionary interface.
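To make the launch-related breaking changes above concrete, a before/after sketch of a typical call site (variable names are illustrative, following the updated examples in this PR):

    # before: the stream was carried by the launch configuration
    config = LaunchConfig(grid=grid, block=block, stream=s)
    launch(ker, config, *ker_args)

    # after: the stream is passed directly to launch(), ahead of the config and kernel
    config = LaunchConfig(grid=grid, block=block)
    launch(s, config, ker, *ker_args)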
4 changes: 2 additions & 2 deletions cuda_core/examples/jit_lto_fractal.py
@@ -78,7 +78,7 @@ def __init__(self):
# the problem into 16x16 chunks.
self.grid = (self.width / 16, self.height / 16, 1.0)
self.block = (16, 16, 1)
self.config = LaunchConfig(grid=self.grid, block=self.block, stream=self.stream)
self.config = LaunchConfig(grid=self.grid, block=self.block)

def link(self, user_code, target_type):
if target_type == "ltoir":
@@ -103,7 +103,7 @@ def link(self, user_code, target_type):
return linked_code.get_kernel("main_workflow")

def run(self, kernel):
launch(kernel, self.config, self.buffer.data.ptr)
launch(self.stream, self.config, kernel, self.buffer.data.ptr)
self.stream.sync()

# Return the result as a NumPy array (on host).
8 changes: 4 additions & 4 deletions cuda_core/examples/saxpy.py
@@ -54,11 +54,11 @@
# prepare launch
block = 32
grid = int((size + block - 1) // block)
config = LaunchConfig(grid=grid, block=block, stream=s)
config = LaunchConfig(grid=grid, block=block)
ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)

# launch kernel on stream s
launch(ker, config, *ker_args)
launch(s, config, ker, *ker_args)
s.sync()

# check result
@@ -85,11 +85,11 @@
# prepare launch
block = 64
grid = int((size + block - 1) // block)
config = LaunchConfig(grid=grid, block=block, stream=s)
config = LaunchConfig(grid=grid, block=block)
ker_args = (a, x.data.ptr, y.data.ptr, buf, size)

# launch kernel on stream s
launch(ker, config, *ker_args)
launch(s, config, ker, *ker_args)
s.sync()

# check result
8 changes: 4 additions & 4 deletions cuda_core/examples/simple_multi_gpu_example.py
@@ -78,8 +78,8 @@ def __cuda_stream__(self):
# CUDA streams.
block = 256
grid = (size + block - 1) // block
config0 = LaunchConfig(grid=grid, block=block, stream=stream0)
config1 = LaunchConfig(grid=grid, block=block, stream=stream1)
config0 = LaunchConfig(grid=grid, block=block)
config1 = LaunchConfig(grid=grid, block=block)

# Allocate memory on GPU 0
# Note: This runs on CuPy's current stream for GPU 0
@@ -94,7 +94,7 @@ def __cuda_stream__(self):
stream0.wait(cp_stream0)

# Launch the add kernel on GPU 0 / stream 0
launch(ker_add, config0, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))

# Allocate memory on GPU 1
# Note: This runs on CuPy's current stream for GPU 1.
@@ -108,7 +108,7 @@ def __cuda_stream__(self):
stream1.wait(cp_stream1)

# Launch the subtract kernel on GPU 1 / stream 1
launch(ker_sub, config1, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))

# Synchronize both GPUs and validate the results
dev0.set_current()
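Note that the stream passed to launch() does not have to be a cuda.core Stream: as the coercion branch added in _launcher.py shows, any object supporting the __cuda_stream__ protocol is wrapped via Stream._init(). A minimal adaptor sketch, assuming the protocol is a method returning a (version, raw handle) tuple as used by this example; the class and variable names here are illustrative, not part of the PR:

    class StreamAdaptor:
        """Expose a foreign stream (e.g. a CuPy stream) via the __cuda_stream__ protocol."""

        def __init__(self, obj):
            self.obj = obj

        def __cuda_stream__(self):
            # assumed protocol: (version, raw stream handle); CuPy streams expose .ptr
            return (0, self.obj.ptr)

    # launch(StreamAdaptor(some_cupy_stream), config0, ker_add, ...) would then be
    # accepted and coerced to a Stream internally.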
4 changes: 2 additions & 2 deletions cuda_core/examples/strided_memory_view.py
@@ -124,8 +124,8 @@ def my_func(arr, work_stream):
if view.is_device_accessible:
block = 256
grid = (size + block - 1) // block
config = LaunchConfig(grid=grid, block=block, stream=work_stream)
launch(gpu_ker, config, view.ptr, np.uint64(size))
config = LaunchConfig(grid=grid, block=block)
launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
# Here we're being conservative and synchronize over our work stream,
# assuming we do not know the data stream; if we know then we could
# just order the data stream after the work stream here, e.g.
4 changes: 2 additions & 2 deletions cuda_core/examples/thread_block_cluster.py
@@ -56,10 +56,10 @@
grid = 4
cluster = 2
block = 32
config = LaunchConfig(grid=grid, cluster=cluster, block=block, stream=dev.default_stream)
config = LaunchConfig(grid=grid, cluster=cluster, block=block)

# launch kernel on the default stream
launch(ker, config)
launch(dev.default_stream, config, ker)
dev.sync()

print("done!")
4 changes: 2 additions & 2 deletions cuda_core/examples/vector_add.py
@@ -48,10 +48,10 @@
# prepare launch
block = 256
grid = (size + block - 1) // block
config = LaunchConfig(grid=grid, block=block, stream=s)
config = LaunchConfig(grid=grid, block=block)

# launch kernel on stream s
launch(ker, config, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
s.sync()

# check result
38 changes: 24 additions & 14 deletions cuda_core/tests/test_launcher.py
@@ -8,20 +8,18 @@

import pytest

from cuda.core.experimental import Device, LaunchConfig, Stream
from cuda.core.experimental import Device, LaunchConfig, Program, launch


def test_launch_config_init(init_cuda):
config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=0)
config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), shmem_size=0)
assert config.grid == (1, 1, 1)
assert config.block == (1, 1, 1)
assert config.stream is None
assert config.shmem_size == 0

config = LaunchConfig(grid=(2, 2, 2), block=(2, 2, 2), stream=Device().create_stream(), shmem_size=1024)
config = LaunchConfig(grid=(2, 2, 2), block=(2, 2, 2), shmem_size=1024)
assert config.grid == (2, 2, 2)
assert config.block == (2, 2, 2)
assert isinstance(config.stream, Stream)
assert config.shmem_size == 1024


@@ -51,18 +49,30 @@ def test_launch_config_invalid_values():
LaunchConfig(grid=(1, 1, 1), block=(0, 1))


def test_launch_config_stream(init_cuda):
def test_launch_config_shmem_size():
config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), shmem_size=2048)
assert config.shmem_size == 2048

config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1))
assert config.shmem_size == 0


def test_launch_invalid_values(init_cuda):
code = 'extern "C" __global__ void my_kernel() {}'
program = Program(code, "c++")
mod = program.compile("cubin")

stream = Device().create_stream()
config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=stream, shmem_size=0)
assert config.stream == stream
ker = mod.get_kernel("my_kernel")
config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), shmem_size=0)

with pytest.raises(ValueError):
LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream="invalid_stream", shmem_size=0)
launch(None, ker, config)

with pytest.raises(ValueError):
launch(stream, None, config)

def test_launch_config_shmem_size():
config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=2048)
assert config.shmem_size == 2048
with pytest.raises(ValueError):
launch(stream, ker, None)

config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None)
assert config.shmem_size == 0
launch(stream, config, ker)