@@ -54,20 +54,29 @@ import numpy as np
 ```

 Error checking is a fundamental best practice in code development and a code
-example is provided. For brevity, error checking within the example is omitted.
+example is provided.
 In a future release, this may automatically raise exceptions using a Python
 object model.

 ```{code-cell} python
-def ASSERT_DRV(err):
-    if isinstance(err, cuda.CUresult):
-        if err != cuda.CUresult.CUDA_SUCCESS:
-            raise RuntimeError("Cuda Error: {}".format(err))
-    elif isinstance(err, nvrtc.nvrtcResult):
-        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
-            raise RuntimeError("Nvrtc Error: {}".format(err))
+def _cudaGetErrorEnum(error):
+    if isinstance(error, cuda.CUresult):
+        err, name = cuda.cuGetErrorName(error)
+        return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
+    elif isinstance(error, nvrtc.nvrtcResult):
+        return nvrtc.nvrtcGetErrorString(error)[1]
     else:
-        raise RuntimeError("Unknown error type: {}".format(err))
+        raise RuntimeError('Unknown error type: {}'.format(error))
+
+def checkCudaErrors(result):
+    if result[0].value:
+        raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0])))
+    if len(result) == 1:
+        return None
+    elif len(result) == 2:
+        return result[1]
+    else:
+        return result[1:]
 ```
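+
+Each binding returns a tuple whose first element is the status code and whose
+remaining elements are the call's outputs; `checkCudaErrors` simply unpacks
+that tuple. A minimal illustration of the convention, using
+`cuDriverGetVersion` (which is safe to call even before `cuInit`):
+
+```{code-cell} python
+# Raw call: returns (CUresult, int)
+err, version = cuda.cuDriverGetVersion()
+
+# Wrapped call: raises on error and returns the int directly
+version = checkCudaErrors(cuda.cuDriverGetVersion())
+```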

 It’s common practice to write CUDA kernels near the top of a translation unit,
@@ -95,40 +104,43 @@ Go ahead and compile the kernel into PTX. Remember that this is executed at runt
 - Compile the program.
 - Extract PTX from the compiled program.

-In the following code example, compilation is targeting compute capability 75,
-or Turing architecture, with FMAD enabled. If compilation fails, use
-`nvrtcGetProgramLog` to retrieve a compile log for additional information.
+In the following code example, the Driver API is initialized so that the NVIDIA driver
+and GPU are accessible. Next, the GPU is queried for its compute capability. Finally,
+the program is compiled to target the local compute capability architecture with FMAD enabled.

 ```{code-cell} python
+# Initialize CUDA Driver API
+checkCudaErrors(cuda.cuInit(0))
+
+# Retrieve handle for device 0
+cuDevice = checkCudaErrors(cuda.cuDeviceGet(0))
+
+# Derive target architecture for device 0
+major = checkCudaErrors(cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice))
+minor = checkCudaErrors(cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice))
+arch_arg = bytes(f'--gpu-architecture=compute_{major}{minor}', 'ascii')
+
 # Create program
-err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, [], [])
+prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, [], []))

 # Compile program
-opts = [b"--fmad=false", b"--gpu-architecture=compute_75"]
-err, = nvrtc.nvrtcCompileProgram(prog, 2, opts)
+opts = [b"--fmad=false", arch_arg]
+checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, 2, opts))

 # Get PTX from compilation
-err, ptxSize = nvrtc.nvrtcGetPTXSize(prog)
+ptxSize = checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog))
 ptx = b" " * ptxSize
-err, = nvrtc.nvrtcGetPTX(prog, ptx)
+checkCudaErrors(nvrtc.nvrtcGetPTX(prog, ptx))
 ```
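+
+If compilation fails, the compile log can be retrieved for additional
+information. A minimal sketch, following the same size-then-fill pattern used
+for the PTX above:
+
+```{code-cell} python
+# Retrieve the compile log (most useful after a failed compilation)
+logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog))
+log = b" " * logSize
+checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log))
+print(log.decode())
+```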

 Before you can use the PTX or do any work on the GPU, you must create a CUDA
 context. CUDA contexts are analogous to host processes for the device. In the
-following code example, the Driver API is initialized so that the NVIDIA driver
-and GPU are accessible. Next, a handle for compute device 0 is passed to
-`cuCtxCreate` to designate that GPU for context creation. With the context
-created, you can proceed in compiling the CUDA kernel using NVRTC.
+following code example, a handle for compute device 0 is passed to
+`cuCtxCreate` to designate that GPU for context creation.

 ```{code-cell} python
-# Initialize CUDA Driver API
-err, = cuda.cuInit(0)
-
-# Retrieve handle for device 0
-err, cuDevice = cuda.cuDeviceGet(0)
-
 # Create context
-err, context = cuda.cuCtxCreate(0, cuDevice)
+context = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice))
 ```
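+
+The new context becomes current on the calling thread. As a quick sanity
+check, you can ask the Driver API which device the current context is bound
+to; a minimal sketch:
+
+```{code-cell} python
+# Should report the same device handle as cuDevice (device 0)
+ctxDevice = checkCudaErrors(cuda.cuCtxGetDevice())
+print(int(ctxDevice))
+```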

 With a CUDA context created on device 0, load the PTX generated earlier into a
@@ -140,10 +152,8 @@ After loading into the module, extract a specific kernel with
 # Load PTX as module data and retrieve function
 ptx = np.char.array(ptx)
 # Note: Incompatible --gpu-architecture would be detected here
-err, module = cuda.cuModuleLoadData(ptx.ctypes.data)
-ASSERT_DRV(err)
-err, kernel = cuda.cuModuleGetFunction(module, b"saxpy")
-ASSERT_DRV(err)
+module = checkCudaErrors(cuda.cuModuleLoadData(ptx.ctypes.data))
+kernel = checkCudaErrors(cuda.cuModuleGetFunction(module, b"saxpy"))
 ```
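+
+With the kernel handle retrieved, its launch limits can be queried through
+the Driver API. A minimal sketch using `cuFuncGetAttribute` to read the
+kernel's maximum threads per block:
+
+```{code-cell} python
+# Upper bound on block size for this kernel on this device
+maxThreads = checkCudaErrors(cuda.cuFuncGetAttribute(
+    cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel
+))
+print(maxThreads)
+```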

 Next, get all your data prepared and transferred to the GPU. For increased
@@ -175,18 +185,18 @@ Python doesn’t have a natural concept of pointers, yet `cuMemcpyHtoDAsync` exp
 XX.

 ```{code-cell} python
-err, dXclass = cuda.cuMemAlloc(bufferSize)
-err, dYclass = cuda.cuMemAlloc(bufferSize)
-err, dOutclass = cuda.cuMemAlloc(bufferSize)
+dXclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
+dYclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
+dOutclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))

-err, stream = cuda.cuStreamCreate(0)
+stream = checkCudaErrors(cuda.cuStreamCreate(0))

-err, = cuda.cuMemcpyHtoDAsync(
+checkCudaErrors(cuda.cuMemcpyHtoDAsync(
     dXclass, hX.ctypes.data, bufferSize, stream
-)
-err, = cuda.cuMemcpyHtoDAsync(
+))
+checkCudaErrors(cuda.cuMemcpyHtoDAsync(
     dYclass, hY.ctypes.data, bufferSize, stream
-)
+))
 ```
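+
+Allocations fail when device memory is exhausted, so it can be helpful to
+inspect how much memory remains. A minimal sketch using `cuMemGetInfo`:
+
+```{code-cell} python
+# Free and total device memory, in bytes
+free, total = checkCudaErrors(cuda.cuMemGetInfo())
+print(free, total)
+```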

 With data prep and resource allocation finished, the kernel is ready to be
@@ -213,7 +223,7 @@ args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 Now the kernel can be launched:

 ```{code-cell} python
-err, = cuda.cuLaunchKernel(
+checkCudaErrors(cuda.cuLaunchKernel(
     kernel,
     NUM_BLOCKS,  # grid x dim
     1,  # grid y dim
@@ -225,12 +235,12 @@ err, = cuda.cuLaunchKernel(
     stream,  # stream
     args.ctypes.data,  # kernel arguments
     0,  # extra (ignore)
-)
+))

-err, = cuda.cuMemcpyDtoHAsync(
+checkCudaErrors(cuda.cuMemcpyDtoHAsync(
     hOut.ctypes.data, dOutclass, bufferSize, stream
-)
-err, = cuda.cuStreamSynchronize(stream)
+))
+checkCudaErrors(cuda.cuStreamSynchronize(stream))
 ```

 The `cuLaunchKernel` function takes the compiled module kernel and execution
@@ -252,12 +262,12 @@ Perform verification of the data to ensure correctness and finish the code with
 memory cleanup.

 ```{code-cell} python
-err, = cuda.cuStreamDestroy(stream)
-err, = cuda.cuMemFree(dXclass)
-err, = cuda.cuMemFree(dYclass)
-err, = cuda.cuMemFree(dOutclass)
-err, = cuda.cuModuleUnload(module)
-err, = cuda.cuCtxDestroy(context)
+checkCudaErrors(cuda.cuStreamDestroy(stream))
+checkCudaErrors(cuda.cuMemFree(dXclass))
+checkCudaErrors(cuda.cuMemFree(dYclass))
+checkCudaErrors(cuda.cuMemFree(dOutclass))
+checkCudaErrors(cuda.cuModuleUnload(module))
+checkCudaErrors(cuda.cuCtxDestroy(context))
 ```
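+
+Because every resource should be released even if an intermediate call
+raises, a `try`/`finally` block is one idiomatic way to structure the
+teardown. A sketch of the shape such code might take (the `stream2` name is
+purely illustrative):
+
+```{code-cell} python
+stream2 = checkCudaErrors(cuda.cuStreamCreate(0))
+try:
+    pass  # enqueue copies and kernel launches on stream2 here
+finally:
+    checkCudaErrors(cuda.cuStreamDestroy(stream2))
+```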

 ## Performance