From c231958e8fe51f74f8573a4de7be27fd0af6b73a Mon Sep 17 00:00:00 2001
From: Vladislav Zhurba
Date: Wed, 26 Feb 2025 14:20:26 -0800
Subject: [PATCH 1/6] Add Linker example

---
 cuda_core/examples/jit_lto_fractal.py | 274 ++++++++++++++++++++++++++
 1 file changed, 274 insertions(+)
 create mode 100644 cuda_core/examples/jit_lto_fractal.py

diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py
new file mode 100644
index 000000000..b7123e9c3
--- /dev/null
+++ b/cuda_core/examples/jit_lto_fractal.py
@@ -0,0 +1,274 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+# ################################################################################
+#
+# This demo aims to illustrate a couple of takeaways:
+#
+# 1. How to use the JIT LTO feature provided by the Linker class to link multiple objects together
+# 2. That linking allows for libraries to modify workflows dynamically at runtime
+#
+# This demo mimics a relationship between a library and a user. The user's sole responsability is to
+# provide device code that generates art. Where as the library is responsible for all steps involved in
+# setting up the device, launch configurations and arguments as well as linking the provided device code.
+#
+# Two algorithms are implemented:
+# 1. A Mandelbrot set
+# 2. A Julia set
+#
+# The user can choose which algorithm to use at runtime and generate the resulting image.
+#
+# ################################################################################

+import argparse
+import sys
+
+import cupy as cp
+
+from cuda.core.experimental import Device, LaunchConfig, Linker, LinkerOptions, Program, ProgramOptions, launch
+
+
+# ################################################################################
+#
+# This MockLibrary is responsible for all steps involved in launching the device code.
+#
+# The user is responsible for providing the device code that will be linked into the library's workflow.
+# The provided device code must contain a function with the signature `void generate_art(float* Data)`
+class MockLibrary:
+    def __init__(self):
+        # For this mock library, the main workflow is intentially kept simple by limiting itself to only calling the
+        # externally defined generate_art function. More involved libraries have the option of applying pre and post
+        # processing steps before calling user-defined device code. Conversely, these responsabilities can be reversed
+        # such that the library owns the bulk of the workflow while allowing users to provide customized pre/post
+        # processing steps.
+        code_main = r"""
+        extern __device__ void generate_art(float* Data);
+
+        extern "C"
+        __global__
+        void main_workflow(float* Data) {
+            // Preprocessing steps can be called here
+            // ...
+
+            // Call the user-defined device code
+            generate_art(Data);
+
+            // Postprocessing steps can be called here
+            // ...
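+            // (for example, a blur or color-space conversion over Data)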
+        }
+        """
+
+        # Most of the launch configurations can be preemptively done before the user provides their device code
+        # Therefore let's compile our main workflow device code now, and link the remaining pieces at a later time
+        self.arch = "".join(f"{i}" for i in Device().compute_capability)
+        self.program_options = ProgramOptions(std="c++11", arch=f"sm_{self.arch}", relocatable_device_code=True)
+        self.main_object_code = Program(code_main, "c++", options=self.program_options).compile("ptx")
+
+        # Setup device state
+        self.dev = Device()
+        self.dev.set_current()
+        self.stream = self.dev.create_stream()
+
+        # Setup buffer to store our results
+        self.width = 1024
+        self.height = 512
+        self.buffer = cp.empty(self.width * self.height * 4, dtype=cp.float32)
+
+        # Setup the launch configuration such that each thread will be generating one pixel, and subdivide
+        # the problem into 16x16 chunks.
+        self.grid = (self.width / 16, self.height / 16, 1)
+        self.block = (16, 16, 1)
+        self.config = LaunchConfig(grid=self.grid, block=self.block, stream=self.stream)
+
+    def link_and_run_code(self, user_code):
+        # First, user-defined code is compiled into a PTX object code
+        user_object_code = Program(user_code, "c++", options=self.program_options).compile("ptx")
+
+        # Then a Linker is created to link the main object code with the user-defined code
+        linker_options = LinkerOptions(arch=f"sm_{self.arch}")
+        linker = Linker(self.main_object_code, user_object_code, options=linker_options)
+        linked_code = linker.link("cubin")
+
+        # Now we're ready to retrieve the main device function and execute our library's workflow
+        kernel = linked_code.get_kernel("main_workflow")
+        launch(kernel, self.config, self.buffer.data.ptr)
+        self.stream.sync()
+
+        # Return the result buffer as a CUPY array
+        return self.buffer.get().astype(float).reshape(self.height, self.width, 4)
+
+
+# Now let's proceed with code from the user's perspective!
+#
+# ################################################################################
+
+# Simple implementation of Mandelbrot set from Wikipedia
+# http://en.wikipedia.org/wiki/Mandelbrot_set
+#
+# Note that this kernel is meant to be a simple, straight-forward
+# implementation, and so may not represent optimized GPU code.
+code_mandelbrot = r"""
+__device__
+void generate_art(float* Data) {
+    // Which pixel am I?
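+    // (blockIdx/blockDim/threadIdx identify this thread's pixel; the grid is
+    // sized so that gridDim * blockDim equals the image dimensions.)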
+    unsigned DataX = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned DataY = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned Width = gridDim.x * blockDim.x;
+    unsigned Height = gridDim.y * blockDim.y;
+
+    float R, G, B, A;
+
+    // Scale coordinates to (-2.5, 1) and (-1, 1)
+
+    float NormX = (float)DataX / (float)Width;
+    NormX *= 3.5f;
+    NormX -= 2.5f;
+
+    float NormY = (float)DataY / (float)Height;
+    NormY *= 2.0f;
+    NormY -= 1.0f;
+
+    float X0 = NormX;
+    float Y0 = NormY;
+
+    float X = 0.0f;
+    float Y = 0.0f;
+
+    unsigned Iter = 0;
+    unsigned MaxIter = 1000;
+
+    // Iterate
+    while(X*X + Y*Y < 4.0f && Iter < MaxIter) {
+        float XTemp = X*X - Y*Y + X0;
+        Y = 2.0f*X*Y + Y0;
+
+        X = XTemp;
+
+        Iter++;
+    }
+
+    unsigned ColorG = Iter % 50;
+    unsigned ColorB = Iter % 25;
+
+    R = 0.0f;
+    G = (float)ColorG / 50.0f;
+    B = (float)ColorB / 25.0f;
+    A = 1.0f;
+
+    Data[DataY*Width*4+DataX*4+0] = R;
+    Data[DataY*Width*4+DataX*4+1] = G;
+    Data[DataY*Width*4+DataX*4+2] = B;
+    Data[DataY*Width*4+DataX*4+3] = A;
+}
+"""
+
+# Simple implementation of Julia set from Wikipedia
+# http://en.wikipedia.org/wiki/Julia_set
+#
+# Note that this kernel is meant to be a simple, straight-forward
+# implementation, and so may not represent optimized GPU code.
+code_julia = r"""
+__device__
+void generate_art(float* Data) {
+    // Which pixel am I?
+    unsigned DataX = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned DataY = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned Width = gridDim.x * blockDim.x;
+    unsigned Height = gridDim.y * blockDim.y;
+
+    float R, G, B, A;
+
+    // Scale coordinates to (-2, 2) for x and (-1, 1) for y
+    float X = (float)DataX / (float)Width;
+    X *= 4.0f;
+    X -= 2.0f;
+
+    float Y = (float)DataY / (float)Height;
+    Y *= 2.0f;
+    Y -= 1.0f;
+
+    // Julia set uses a fixed constant C
+    float Cx = -0.8f; // Try different values for different patterns
+    float Cy = 0.156f; // Try different values for different patterns
+
+    unsigned Iter = 0;
+    unsigned MaxIter = 1000;
+
+    // Iterate
+    while(X*X + Y*Y < 4.0f && Iter < MaxIter) {
+        float XTemp = X*X - Y*Y + Cx;
+        Y = 2.0f*X*Y + Cy;
+        X = XTemp;
+        Iter++;
+    }
+
+    unsigned ColorG = Iter % 50;
+    unsigned ColorB = Iter % 25;
+
+    R = 0.0f;
+    G = (float)ColorG / 50.0f;
+    B = (float)ColorB / 25.0f;
+    A = 1.0f;
+
+    Data[DataY*Width*4+DataX*4+0] = R;
+    Data[DataY*Width*4+DataX*4+1] = G;
+    Data[DataY*Width*4+DataX*4+2] = B;
+    Data[DataY*Width*4+DataX*4+3] = A;
+}
+"""
+
+# Parse command line arguments
+# Two different kernels are implemented with unique algorithms, and the user can choose which one should be used
+# Both kernels fulfill the signature required by the MockLibrary: `void generate_art(float* Data)`
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--target",
+    "-t",
+    type=str,
+    default="all",
+    choices=["mandelbrot", "julia", "all"],
+    help="Type of visualization to generate (mandelbrot, julia, or all)",
+)
+parser.add_argument(
+    "--display",
+    "-d",
+    action="store_true",
+    help="Display the generated images",
+)
+args = parser.parse_args()
+
+if args.display:
+    try:
+        import matplotlib.pyplot as plt
+    except ImportError:
+        print("this example requires matplotlib installed in order to display the image", file=sys.stderr)
+        sys.exit(0)
+
+result_to_display = []
+lib = MockLibrary()
+
+# Process mandelbrot option
+if args.target in ("mandelbrot", "all"):
+    # The library will compile and link their main kernel with the provided Mandelbrot kernel
+    result = lib.link_and_run_code(code_mandelbrot)
+    result_to_display.append((result, "Mandelbrot"))
+
+# Process julia option
+if args.target in ("julia", "all"):
+    # Likewise, the same library can be configured to instead use the provided Julia kernel
+    result = lib.link_and_run_code(code_julia)
+    result_to_display.append((result, "Julia"))
+
+# Display the generated images if requested
+if args.display:
+    fig = plt.figure()
+    for i, (image, title) in enumerate(result_to_display):
+        axs = fig.add_subplot(len(result_to_display), 1, i + 1)
+        axs.imshow(image)
+        axs.set_title(title)
+        axs.axis("off")
+    plt.show()
+
+print("done!")

From 0282106ab00f434d9cfc9f807cce0d1458eba9db Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 28 Feb 2025 01:38:22 +0000
Subject: [PATCH 2/6] ensure script; add ltoir support; add timing util

---
 cuda_core/examples/jit_lto_fractal.py | 166 +++++++++++++++++---------
 1 file changed, 107 insertions(+), 59 deletions(-)

diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py
index b7123e9c3..a8c32e37f 100644
--- a/cuda_core/examples/jit_lto_fractal.py
+++ b/cuda_core/examples/jit_lto_fractal.py
@@ -81,22 +81,58 @@ def __init__(self):
         self.block = (16, 16, 1)
         self.config = LaunchConfig(grid=self.grid, block=self.block, stream=self.stream)
 
-    def link_and_run_code(self, user_code):
+    def link(self, user_code, target_type):
+        if target_type == "ltoir":
+            program_options = ProgramOptions(std="c++11", arch=f"sm_{self.arch}", link_time_optimization=True)
+            linker_options = LinkerOptions(arch=f"sm_{self.arch}", link_time_optimization=True)
+        elif target_type == "ptx":
+            program_options = self.program_options
+            linker_options = LinkerOptions(arch=f"sm_{self.arch}")
+        else:
+            raise AssertionError
+
         # First, user-defined code is compiled into a PTX object code
-        user_object_code = Program(user_code, "c++", options=self.program_options).compile("ptx")
+        user_object_code = Program(user_code, "c++", options=program_options).compile(target_type)
 
         # Then a Linker is created to link the main object code with the user-defined code
-        linker_options = LinkerOptions(arch=f"sm_{self.arch}")
         linker = Linker(self.main_object_code, user_object_code, options=linker_options)
+
+        # We emit the linked code as cubin
         linked_code = linker.link("cubin")
 
         # Now we're ready to retrieve the main device function and execute our library's workflow
-        kernel = linked_code.get_kernel("main_workflow")
+        return linked_code.get_kernel("main_workflow")
+
+    def run(self, kernel):
+        from cuda.bindings import driver
+
+        e1 = e2 = None
+        N = 10**4
+        N_warmup = 10
+        ave_time = 0.0
+        for i in range(N + N_warmup):
+            if e1 is None:
+                e1 = self.stream.record(options={"enable_timing": True})
+            else:
+                self.stream.record(e1)
+            launch(kernel, self.config, self.buffer.data.ptr)
+            if e2 is None:
+                e2 = self.stream.record(options={"enable_timing": True})
+            else:
+                self.stream.record(e2)
+            e2.sync()
+            if i < N_warmup:
+                continue
+            err, t = driver.cuEventElapsedTime(e1.handle, e2.handle)
+            assert err == 0
+            ave_time += t
+        print("timing: ", ave_time / N)
+
         launch(kernel, self.config, self.buffer.data.ptr)
         self.stream.sync()
 
-        # Return the result buffer as a CUPY array
-        return self.buffer.get().astype(float).reshape(self.height, self.width, 4)
+        # Return the result buffer as a CuPy array
+        return cp.asnumpy(self.buffer).reshape(self.height, self.width, 4)
 
 
 # Now let's proceed with code from the user's perspective!
@@ -219,56 +255,68 @@ def link_and_run_code(self, user_code):
 }
 """
 
-# Parse command line arguments
-# Two different kernels are implemented with unique algorithms, and the user can choose which one should be used
-# Both kernels fulfill the signature required by the MockLibrary: `void generate_art(float* Data)`
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "--target",
-    "-t",
-    type=str,
-    default="all",
-    choices=["mandelbrot", "julia", "all"],
-    help="Type of visualization to generate (mandelbrot, julia, or all)",
-)
-parser.add_argument(
-    "--display",
-    "-d",
-    action="store_true",
-    help="Display the generated images",
-)
-args = parser.parse_args()
-
-if args.display:
-    try:
-        import matplotlib.pyplot as plt
-    except ImportError:
-        print("this example requires matplotlib installed in order to display the image", file=sys.stderr)
-        sys.exit(0)
-
-result_to_display = []
-lib = MockLibrary()
-
-# Process mandelbrot option
-if args.target in ("mandelbrot", "all"):
-    # The library will compile and link their main kernel with the provided Mandelbrot kernel
-    result = lib.link_and_run_code(code_mandelbrot)
-    result_to_display.append((result, "Mandelbrot"))
-
-# Process julia option
-if args.target in ("julia", "all"):
-    # Likewise, the same library can be configured to instead use the provided Julia kernel
-    result = lib.link_and_run_code(code_julia)
-    result_to_display.append((result, "Julia"))
-
-# Display the generated images if requested
-if args.display:
-    fig = plt.figure()
-    for i, (image, title) in enumerate(result_to_display):
-        axs = fig.add_subplot(len(result_to_display), 1, i + 1)
-        axs.imshow(image)
-        axs.set_title(title)
-        axs.axis("off")
-    plt.show()
-
-print("done!")
+
+if __name__ == "__main__":
+    # Parse command line arguments
+    # Two different kernels are implemented with unique algorithms, and the user can choose which one should be used
+    # Both kernels fulfill the signature required by the MockLibrary: `void generate_art(float* Data)`
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--target",
+        "-t",
+        type=str,
+        default="all",
+        choices=["mandelbrot", "julia", "all"],
+        help="Type of visualization to generate",
+    )
+    parser.add_argument(
+        "--format",
+        "-f",
+        type=str,
+        default="ltoir",
+        choices=["ptx", "ltoir"],
+        help="Type of intermediate format for the device functions to be linked",
+    )
+    parser.add_argument(
+        "--display",
+        "-d",
+        action="store_true",
+        help="Display the generated images",
+    )
+    args = parser.parse_args()
+
+    if args.display:
+        try:
+            import matplotlib.pyplot as plt
+        except ImportError:
+            print("this example requires matplotlib installed in order to display the image", file=sys.stderr)
+            sys.exit(0)
+
+    result_to_display = []
+    lib = MockLibrary()
+
+    # Process mandelbrot option
+    if args.target in ("mandelbrot", "all"):
+        # The library will compile and link their main kernel with the provided Mandelbrot kernel
+        kernel = lib.link(code_mandelbrot, args.format)
+        result = lib.run(kernel)
+        result_to_display.append((result, "Mandelbrot"))
+
+    # Process julia option
+    if args.target in ("julia", "all"):
+        # Likewise, the same library can be configured to instead use the provided Julia kernel
+        kernel = lib.link(code_julia, args.format)
+        result = lib.run(kernel)
+        result_to_display.append((result, "Julia"))
+
+    # Display the generated images if requested
+    if args.display:
+        fig = plt.figure()
+        for i, (image, title) in enumerate(result_to_display):
+            axs = fig.add_subplot(len(result_to_display), 1, i + 1)
+            axs.imshow(image)
+            axs.set_title(title)
+            axs.axis("off")
+        plt.show()
+
+    print("done!")

From 68c93652eb4037af9c5755745668800566413689 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 28 Feb 2025 01:39:17 +0000
Subject: [PATCH 3/6] remove timing util

---
 cuda_core/examples/jit_lto_fractal.py | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py
index a8c32e37f..a39616c70 100644
--- a/cuda_core/examples/jit_lto_fractal.py
+++ b/cuda_core/examples/jit_lto_fractal.py
@@ -104,30 +104,6 @@ def link(self, user_code, target_type):
         return linked_code.get_kernel("main_workflow")
 
     def run(self, kernel):
-        from cuda.bindings import driver
-
-        e1 = e2 = None
-        N = 10**4
-        N_warmup = 10
-        ave_time = 0.0
-        for i in range(N + N_warmup):
-            if e1 is None:
-                e1 = self.stream.record(options={"enable_timing": True})
-            else:
-                self.stream.record(e1)
-            launch(kernel, self.config, self.buffer.data.ptr)
-            if e2 is None:
-                e2 = self.stream.record(options={"enable_timing": True})
-            else:
-                self.stream.record(e2)
-            e2.sync()
-            if i < N_warmup:
-                continue
-            err, t = driver.cuEventElapsedTime(e1.handle, e2.handle)
-            assert err == 0
-            ave_time += t
-        print("timing: ", ave_time / N)
-
         launch(kernel, self.config, self.buffer.data.ptr)
         self.stream.sync()
 

From fed4cf72bd59ff5c8c838ab048811887c52dce5e Mon Sep 17 00:00:00 2001
From: Vladislav Zhurba
Date: Fri, 28 Feb 2025 10:40:33 -0800
Subject: [PATCH 4/6] Review comments

---
 cuda_core/examples/jit_lto_fractal.py | 51 +++++++++++++++------------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py
index a39616c70..937fd4898 100644
--- a/cuda_core/examples/jit_lto_fractal.py
+++ b/cuda_core/examples/jit_lto_fractal.py
@@ -9,9 +9,9 @@
 # 1. How to use the JIT LTO feature provided by the Linker class to link multiple objects together
 # 2. That linking allows for libraries to modify workflows dynamically at runtime
 #
-# This demo mimics a relationship between a library and a user. The user's sole responsability is to
-# provide device code that generates art. Where as the library is responsible for all steps involved in
-# setting up the device, launch configurations and arguments as well as linking the provided device code.
+# This demo mimics a relationship between a library and a user. The user's sole responsibility is to
+# provide device code that generates some art. Whereas the library is responsible for all steps involved in
+# setting up the device, launch configurations and arguments, as well as linking the provided device code.
 #
 # Two algorithms are implemented:
 # 1. A Mandelbrot set
@@ -37,9 +37,9 @@
 # The provided device code must contain a function with the signature `void generate_art(float* Data)`
 class MockLibrary:
     def __init__(self):
-        # For this mock library, the main workflow is intentially kept simple by limiting itself to only calling the
+        # For this mock library, the main workflow is intentionally kept simple by limiting itself to only calling the
         # externally defined generate_art function. More involved libraries have the option of applying pre and post
-        # processing steps before calling user-defined device code. Conversely, these responsabilities can be reversed
+        # processing steps before calling user-defined device code. Conversely, these responsibilities can be reversed
         # such that the library owns the bulk of the workflow while allowing users to provide customized pre/post
         # processing steps.
         code_main = r"""
@@ -61,8 +61,7 @@ def __init__(self):
 
         # Most of the launch configurations can be preemptively done before the user provides their device code
         # Therefore let's compile our main workflow device code now, and link the remaining pieces at a later time
-        self.arch = "".join(f"{i}" for i in Device().compute_capability)
-        self.program_options = ProgramOptions(std="c++11", arch=f"sm_{self.arch}", relocatable_device_code=True)
+        self.program_options = ProgramOptions(relocatable_device_code=True)
         self.main_object_code = Program(code_main, "c++", options=self.program_options).compile("ptx")
 
         # Setup device state
@@ -70,24 +69,24 @@ def __init__(self):
         self.dev.set_current()
         self.stream = self.dev.create_stream()
 
-        # Setup buffer to store our results
+        # Setup a buffer to store the RGBA results for the width and height specified
         self.width = 1024
         self.height = 512
         self.buffer = cp.empty(self.width * self.height * 4, dtype=cp.float32)
 
         # Setup the launch configuration such that each thread will be generating one pixel, and subdivide
         # the problem into 16x16 chunks.
-        self.grid = (self.width / 16, self.height / 16, 1)
+        self.grid = (self.width / 16, self.height / 16, 1.0)
         self.block = (16, 16, 1)
         self.config = LaunchConfig(grid=self.grid, block=self.block, stream=self.stream)
 
     def link(self, user_code, target_type):
         if target_type == "ltoir":
-            program_options = ProgramOptions(std="c++11", arch=f"sm_{self.arch}", link_time_optimization=True)
-            linker_options = LinkerOptions(arch=f"sm_{self.arch}", link_time_optimization=True)
+            program_options = ProgramOptions(link_time_optimization=True)
+            linker_options = LinkerOptions(link_time_optimization=True)
         elif target_type == "ptx":
             program_options = self.program_options
-            linker_options = LinkerOptions(arch=f"sm_{self.arch}")
+            linker_options = LinkerOptions()
         else:
             raise AssertionError
 
@@ -119,7 +118,7 @@ def run(self, kernel):
 # http://en.wikipedia.org/wiki/Mandelbrot_set
 #
 # Note that this kernel is meant to be a simple, straight-forward
-# implementation, and so may not represent optimized GPU code.
+# implementation. No attempt is made to optimize this GPU code.
 code_mandelbrot = r"""
 __device__
 void generate_art(float* Data) {
@@ -168,10 +167,11 @@ def run(self, kernel):
     B = (float)ColorB / 25.0f;
     A = 1.0f;
 
-    Data[DataY*Width*4+DataX*4+0] = R;
-    Data[DataY*Width*4+DataX*4+1] = G;
-    Data[DataY*Width*4+DataX*4+2] = B;
-    Data[DataY*Width*4+DataX*4+3] = A;
+    unsigned i = DataY*Width*4+DataX*4;
+    Data[i+0] = R;
+    Data[i+1] = G;
+    Data[i+2] = B;
+    Data[i+3] = A;
 }
 """
 
@@ -179,7 +179,7 @@ def run(self, kernel):
 # http://en.wikipedia.org/wiki/Julia_set
 #
 # Note that this kernel is meant to be a simple, straight-forward
-# implementation, and so may not represent optimized GPU code.
+# implementation. No attempt is made to optimize this GPU code.
 code_julia = r"""
 __device__
 void generate_art(float* Data) {
@@ -224,15 +224,16 @@ def run(self, kernel):
     B = (float)ColorB / 25.0f;
     A = 1.0f;
 
-    Data[DataY*Width*4+DataX*4+0] = R;
-    Data[DataY*Width*4+DataX*4+1] = G;
-    Data[DataY*Width*4+DataX*4+2] = B;
-    Data[DataY*Width*4+DataX*4+3] = A;
+    unsigned i = DataY*Width*4+DataX*4;
+    Data[i+0] = R;
+    Data[i+1] = G;
+    Data[i+2] = B;
+    Data[i+3] = A;
 }
 """
 
 
-if __name__ == "__main__":
+def main():
     # Parse command line arguments
     # Two different kernels are implemented with unique algorithms, and the user can choose which one should be used
     # Both kernels fulfill the signature required by the MockLibrary: `void generate_art(float* Data)`
@@ -296,3 +297,7 @@ def run(self, kernel):
     plt.show()
 
     print("done!")
+
+
+if __name__ == "__main__":
+    main()

From 9bdae0abf697a4addf7cfa8f1c5d138eb0655410 Mon Sep 17 00:00:00 2001
From: Vladislav Zhurba
Date: Fri, 28 Feb 2025 10:45:34 -0800
Subject: [PATCH 5/6] Move done print

---
 cuda_core/examples/jit_lto_fractal.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py
index 937fd4898..43f49e448 100644
--- a/cuda_core/examples/jit_lto_fractal.py
+++ b/cuda_core/examples/jit_lto_fractal.py
@@ -296,8 +296,7 @@ def main():
     plt.show()
 
-    print("done!")
-
 
 if __name__ == "__main__":
     main()
+    print("done!")

From 3097c66b009024c987596ba54a50dad24080c08a Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 28 Feb 2025 19:46:15 -0500
Subject: [PATCH 6/6] Apply suggestions from code review

---
 cuda_core/examples/jit_lto_fractal.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py
index 43f49e448..ff5f4f3f7 100644
--- a/cuda_core/examples/jit_lto_fractal.py
+++ b/cuda_core/examples/jit_lto_fractal.py
@@ -88,7 +88,7 @@ def link(self, user_code, target_type):
             program_options = self.program_options
             linker_options = LinkerOptions()
         else:
-            raise AssertionError
+            raise AssertionError(f"Invalid {target_type=}")
 
         # First, user-defined code is compiled into a PTX object code
         user_object_code = Program(user_code, "c++", options=program_options).compile(target_type)
@@ -106,7 +106,7 @@ def run(self, kernel):
         launch(kernel, self.config, self.buffer.data.ptr)
         self.stream.sync()
 
-        # Return the result buffer as a CuPy array
+        # Return the result as a NumPy array (on host).
        return cp.asnumpy(self.buffer).reshape(self.height, self.width, 4)
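
Taken together, the series lands on the following end-to-end JIT LTO flow. This is a minimal sketch using only the cuda.core.experimental APIs exercised in the patches above; code_main, code_user, and buffer_ptr are placeholders for the library's device code string, the user's device code string, and a device buffer pointer.

    from cuda.core.experimental import Device, LaunchConfig, Linker, LinkerOptions, Program, ProgramOptions, launch

    dev = Device()
    dev.set_current()
    stream = dev.create_stream()

    # Library-side translation unit: PTX with relocatable device code, compiled ahead of time.
    main_obj = Program(code_main, "c++", options=ProgramOptions(relocatable_device_code=True)).compile("ptx")

    # User-side translation unit: LTO IR, compiled at runtime.
    user_obj = Program(code_user, "c++", options=ProgramOptions(link_time_optimization=True)).compile("ltoir")

    # Link both objects into a single cubin and retrieve the kernel.
    linker = Linker(main_obj, user_obj, options=LinkerOptions(link_time_optimization=True))
    kernel = linker.link("cubin").get_kernel("main_workflow")

    # One thread per pixel of a 1024x512 image, in 16x16 blocks.
    config = LaunchConfig(grid=(64, 32, 1), block=(16, 16, 1), stream=stream)
    launch(kernel, config, buffer_ptr)
    stream.sync()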