@@ -54,20 +54,29 @@ import numpy as np
```

Error checking is a fundamental best practice in code development and a code
- example is provided. For brevity, error checking within the example is omitted.
+ example is provided.
In a future release, this may automatically raise exceptions using a Python
object model.

```{code-cell} python
- def ASSERT_DRV(err):
-     if isinstance(err, cuda.CUresult):
-         if err != cuda.CUresult.CUDA_SUCCESS:
-             raise RuntimeError("Cuda Error: {}".format(err))
-     elif isinstance(err, nvrtc.nvrtcResult):
-         if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
-             raise RuntimeError("Nvrtc Error: {}".format(err))
+ def _cudaGetErrorEnum(error):
+     if isinstance(error, cuda.CUresult):
+         err, name = cuda.cuGetErrorName(error)
+         return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
+     elif isinstance(error, nvrtc.nvrtcResult):
+         return nvrtc.nvrtcGetErrorString(error)[1]
    else:
-         raise RuntimeError("Unknown error type: {}".format(err))
+         raise RuntimeError('Unknown error type: {}'.format(error))
+
+ def checkCudaErrors(result):
+     if result[0].value:
+         raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0])))
+     if len(result) == 1:
+         return None
+     elif len(result) == 2:
+         return result[1]
+     else:
+         return result[1:]
```

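A quick way to see the helper in action (this snippet is illustrative and not part of the original example) is to wrap any binding that returns a status-first tuple, such as `cuda.cuDriverGetVersion`:

```{code-cell} python
# cuDriverGetVersion returns a (CUresult, int) tuple; checkCudaErrors raises
# on a non-zero status and hands back just the integer version number.
driverVersion = checkCudaErrors(cuda.cuDriverGetVersion())
```
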
It’s common practice to write CUDA kernels near the top of a translation unit,
@@ -95,40 +104,43 @@ Go ahead and compile the kernel into PTX. Remember that this is executed at runtime
- Compile the program.
- Extract PTX from the compiled program.

- In the following code example, compilation is targeting compute capability 75,
- or Turing architecture, with FMAD enabled. If compilation fails, use
- `nvrtcGetProgramLog` to retrieve a compile log for additional information .
+ In the following code example, the Driver API is initialized so that the NVIDIA driver
+ and GPU are accessible. Next, the GPU is queried for its compute capability. Finally,
+ the program is compiled to target that compute capability with FMAD enabled.

```{code-cell} python
112
+ # Initialize CUDA Driver API
113
+ checkCudaErrors(cuda.cuInit(0))
114
+
115
+ # Retrieve handle for device 0
116
+ cuDevice = checkCudaErrors(cuda.cuDeviceGet(0))
117
+
118
+ # Derive target architecture for device 0
119
+ major = checkCudaErrors(cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice))
120
+ minor = checkCudaErrors(cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice))
121
+ arch_arg = bytes(f'--gpu-architecture=compute_{major}{minor}', 'ascii')
122
+
103
123
# Create program
- err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, [], [])
+ prog = checkCudaErrors( nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, [], []) )

# Compile program
- opts = [b"--fmad=false", b"--gpu-architecture=compute_75" ]
- err, = nvrtc.nvrtcCompileProgram(prog, 2, opts)
+ opts = [b"--fmad=false", arch_arg ]
+ checkCudaErrors( nvrtc.nvrtcCompileProgram(prog, 2, opts) )

# Get PTX from compilation
- err, ptxSize = nvrtc.nvrtcGetPTXSize(prog)
+ ptxSize = checkCudaErrors( nvrtc.nvrtcGetPTXSize(prog) )
ptx = b" " * ptxSize
- err, = nvrtc.nvrtcGetPTX(prog, ptx)
+ checkCudaErrors( nvrtc.nvrtcGetPTX(prog, ptx) )
```

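If `nvrtcCompileProgram` fails, the compile log can still be retrieved for diagnostics, as the earlier text noted for `nvrtcGetProgramLog`. A minimal sketch (not part of the original example), reusing the same buffer pattern as the PTX retrieval above:

```{code-cell} python
# Retrieve the NVRTC compile log; most useful when compilation reports an error.
logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog))
log = b" " * logSize
checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log))
print(log.decode())
```
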
Before you can use the PTX or do any work on the GPU, you must create a CUDA
context. CUDA contexts are analogous to host processes for the device. In the
- following code example, the Driver API is initialized so that the NVIDIA driver
- and GPU are accessible. Next, a handle for compute device 0 is passed to
- `cuCtxCreate` to designate that GPU for context creation. With the context
- created, you can proceed in compiling the CUDA kernel using NVRTC.
+ following code example, a handle for compute device 0 is passed to
+ `cuCtxCreate` to designate that GPU for context creation.

```{code-cell} python
- # Initialize CUDA Driver API
- err, = cuda.cuInit(0)
-
- # Retrieve handle for device 0
- err, cuDevice = cuda.cuDeviceGet(0)
-
# Create context
- err, context = cuda.cuCtxCreate(0, cuDevice)
+ context = checkCudaErrors( cuda.cuCtxCreate(0, cuDevice) )
```

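As an optional sanity check (not part of the original walkthrough), the context returned by `cuCtxCreate` becomes current on the calling host thread, which `cuCtxGetCurrent` can confirm:

```{code-cell} python
# cuCtxGetCurrent returns the context bound to this host thread; immediately
# after cuCtxCreate it should be the context created above.
currentCtx = checkCudaErrors(cuda.cuCtxGetCurrent())
```
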
With a CUDA context created on device 0, load the PTX generated earlier into a
@@ -140,10 +152,8 @@ After loading into the module, extract a specific kernel with
# Load PTX as module data and retrieve function
ptx = np.char.array(ptx)
# Note: Incompatible --gpu-architecture would be detected here
- err, module = cuda.cuModuleLoadData(ptx.ctypes.data)
- ASSERT_DRV(err)
- err, kernel = cuda.cuModuleGetFunction(module, b"saxpy")
- ASSERT_DRV(err)
+ module = checkCudaErrors(cuda.cuModuleLoadData(ptx.ctypes.data))
+ kernel = checkCudaErrors(cuda.cuModuleGetFunction(module, b"saxpy"))
```

Next, get all your data prepared and transferred to the GPU. For increased
@@ -175,18 +185,18 @@ Python doesn’t have a natural concept of pointers, yet `cuMemcpyHtoDAsync` expects
XX.

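In practice, a NumPy array's `.ctypes.data` attribute supplies the host address as a plain Python integer, which is what these bindings accept. A small illustration with a throwaway array (the name `tmp` exists only for this example):

```{code-cell} python
# .ctypes.data exposes the array's base address as an int usable as a pointer.
tmp = np.zeros(4, dtype=np.float32)
print(hex(tmp.ctypes.data))
```
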
```{code-cell} python
- err, dXclass = cuda.cuMemAlloc(bufferSize)
- err, dYclass = cuda.cuMemAlloc(bufferSize)
- err, dOutclass = cuda.cuMemAlloc(bufferSize)
+ dXclass = checkCudaErrors( cuda.cuMemAlloc(bufferSize) )
+ dYclass = checkCudaErrors( cuda.cuMemAlloc(bufferSize) )
+ dOutclass = checkCudaErrors( cuda.cuMemAlloc(bufferSize) )

- err, stream = cuda.cuStreamCreate(0)
+ stream = checkCudaErrors( cuda.cuStreamCreate(0) )

- err, = cuda.cuMemcpyHtoDAsync(
+ checkCudaErrors( cuda.cuMemcpyHtoDAsync(
    dXclass, hX.ctypes.data, bufferSize, stream
- )
- err, = cuda.cuMemcpyHtoDAsync(
+ ))
+ checkCudaErrors( cuda.cuMemcpyHtoDAsync(
    dYclass, hY.ctypes.data, bufferSize, stream
- )
+ ))
```

With data prep and resource allocation finished, the kernel is ready to be
@@ -213,7 +223,7 @@ args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
Now the kernel can be launched:

```{code-cell} python
- err, = cuda.cuLaunchKernel(
+ checkCudaErrors( cuda.cuLaunchKernel(
    kernel,
    NUM_BLOCKS, # grid x dim
    1, # grid y dim
@@ -225,12 +235,12 @@ err, = cuda.cuLaunchKernel(
    stream, # stream
    args.ctypes.data, # kernel arguments
    0, # extra (ignore)
- )
+ ))

- err, = cuda.cuMemcpyDtoHAsync(
+ checkCudaErrors( cuda.cuMemcpyDtoHAsync(
    hOut.ctypes.data, dOutclass, bufferSize, stream
- )
- err, = cuda.cuStreamSynchronize(stream)
+ ))
+ checkCudaErrors( cuda.cuStreamSynchronize(stream) )
```

The `cuLaunchKernel` function takes the compiled module kernel and execution
@@ -252,12 +262,12 @@ Perform verification of the data to ensure correctness and finish the code with
memory clean up.

```{code-cell} python
- err, = cuda.cuStreamDestroy(stream)
- err, = cuda.cuMemFree(dXclass)
- err, = cuda.cuMemFree(dYclass)
- err, = cuda.cuMemFree(dOutclass)
- err, = cuda.cuModuleUnload(module)
- err, = cuda.cuCtxDestroy(context)
+ checkCudaErrors( cuda.cuStreamDestroy(stream) )
+ checkCudaErrors( cuda.cuMemFree(dXclass) )
+ checkCudaErrors( cuda.cuMemFree(dYclass) )
+ checkCudaErrors( cuda.cuMemFree(dOutclass) )
+ checkCudaErrors( cuda.cuModuleUnload(module) )
+ checkCudaErrors( cuda.cuCtxDestroy(context) )
```

## Performance