Commit cd3ab45

Merge branch 'gh-pages'

2 parents 81e6f86 + 726e116

4 files changed: +206, -177 lines changed

docs/_sources/overview.md.txt

Lines changed: 61 additions & 51 deletions
@@ -54,20 +54,29 @@ import numpy as np
```

Error checking is a fundamental best practice in code development and a code
-example is provided. For brevity, error checking within the example is omitted.
+example is provided.
In a future release, this may automatically raise exceptions using a Python
object model.

```{code-cell} python
-def ASSERT_DRV(err):
-    if isinstance(err, cuda.CUresult):
-        if err != cuda.CUresult.CUDA_SUCCESS:
-            raise RuntimeError("Cuda Error: {}".format(err))
-    elif isinstance(err, nvrtc.nvrtcResult):
-        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
-            raise RuntimeError("Nvrtc Error: {}".format(err))
+def _cudaGetErrorEnum(error):
+    if isinstance(error, cuda.CUresult):
+        err, name = cuda.cuGetErrorName(error)
+        return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
+    elif isinstance(error, nvrtc.nvrtcResult):
+        return nvrtc.nvrtcGetErrorString(error)[1]
    else:
-        raise RuntimeError("Unknown error type: {}".format(err))
+        raise RuntimeError('Unknown error type: {}'.format(error))
+
+def checkCudaErrors(result):
+    if result[0].value:
+        raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0])))
+    if len(result) == 1:
+        return None
+    elif len(result) == 2:
+        return result[1]
+    else:
+        return result[1:]
```
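
The helper added in this hunk relies on a convention of the bindings: every call returns a tuple whose first element is the status code. A minimal usage sketch of the `checkCudaErrors` pattern, assuming the `from cuda import cuda` import style used elsewhere in these docs:

```python
from cuda import cuda

# A status-only call: the result tuple is (CUresult,), so the helper returns None.
checkCudaErrors(cuda.cuInit(0))

# A call with one payload: (CUresult, CUdevice) unwraps to the CUdevice handle.
cuDevice = checkCudaErrors(cuda.cuDeviceGet(0))
```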

It’s common practice to write CUDA kernels near the top of a translation unit,
@@ -95,40 +104,43 @@ Go ahead and compile the kernel into PTX. Remember that this is executed at runt
- Compile the program.
- Extract PTX from the compiled program.

-In the following code example, compilation is targeting compute capability 75,
-or Turing architecture, with FMAD enabled. If compilation fails, use
-`nvrtcGetProgramLog` to retrieve a compile log for additional information.
+In the following code example, the Driver API is initialized so that the NVIDIA driver
+and GPU are accessible. Next, the GPU is queried for its compute capability. Finally,
+the program is compiled to target the local compute capability architecture with FMAD enabled.

```{code-cell} python
+# Initialize CUDA Driver API
+checkCudaErrors(cuda.cuInit(0))
+
+# Retrieve handle for device 0
+cuDevice = checkCudaErrors(cuda.cuDeviceGet(0))
+
+# Derive target architecture for device 0
+major = checkCudaErrors(cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice))
+minor = checkCudaErrors(cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice))
+arch_arg = bytes(f'--gpu-architecture=compute_{major}{minor}', 'ascii')
+
# Create program
-err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, [], [])
+prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, [], []))

# Compile program
-opts = [b"--fmad=false", b"--gpu-architecture=compute_75"]
-err, = nvrtc.nvrtcCompileProgram(prog, 2, opts)
+opts = [b"--fmad=false", arch_arg]
+checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, 2, opts))

# Get PTX from compilation
-err, ptxSize = nvrtc.nvrtcGetPTXSize(prog)
+ptxSize = checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog))
ptx = b" " * ptxSize
-err, = nvrtc.nvrtcGetPTX(prog, ptx)
+checkCudaErrors(nvrtc.nvrtcGetPTX(prog, ptx))
```
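
The deleted paragraph's pointer to `nvrtcGetProgramLog` still applies: if `nvrtcCompileProgram` fails, the compile log explains why. A sketch of retrieving it with the same tuple-unwrapping helper, with buffer handling assumed to mirror the `nvrtcGetPTX` pattern above:

```python
# Sketch: on compile failure, fetch the NVRTC log for diagnostics.
logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog))
log = b" " * logSize
checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log))
print(log.decode())
```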

Before you can use the PTX or do any work on the GPU, you must create a CUDA
context. CUDA contexts are analogous to host processes for the device. In the
-following code example, the Driver API is initialized so that the NVIDIA driver
-and GPU are accessible. Next, a handle for compute device 0 is passed to
-`cuCtxCreate` to designate that GPU for context creation. With the context
-created, you can proceed in compiling the CUDA kernel using NVRTC.
+following code example, a handle for compute device 0 is passed to
+`cuCtxCreate` to designate that GPU for context creation.

```{code-cell} python
-# Initialize CUDA Driver API
-err, = cuda.cuInit(0)
-
-# Retrieve handle for device 0
-err, cuDevice = cuda.cuDeviceGet(0)
-
# Create context
-err, context = cuda.cuCtxCreate(0, cuDevice)
+context = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice))
```
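
Since the context is pinned to device 0, it can be worth confirming which GPU that is. A small sketch, assuming `cuDeviceGetName` follows the same tuple convention as the other bindings here (the 128-byte buffer length is arbitrary):

```python
# Sketch: report the device the context was created on.
name = checkCudaErrors(cuda.cuDeviceGetName(128, cuDevice))
print(name.decode())
```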

With a CUDA context created on device 0, load the PTX generated earlier into a

@@ -140,10 +152,8 @@ After loading into the module, extract a specific kernel with
# Load PTX as module data and retrieve function
ptx = np.char.array(ptx)
# Note: Incompatible --gpu-architecture would be detected here
-err, module = cuda.cuModuleLoadData(ptx.ctypes.data)
-ASSERT_DRV(err)
-err, kernel = cuda.cuModuleGetFunction(module, b"saxpy")
-ASSERT_DRV(err)
+module = checkCudaErrors(cuda.cuModuleLoadData(ptx.ctypes.data))
+kernel = checkCudaErrors(cuda.cuModuleGetFunction(module, b"saxpy"))
```
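
The diff elides the host-side problem setup between these hunks; the names used later (`NUM_BLOCKS`, `bufferSize`, `hX`, `hY`, `hOut`) are defined there. An illustrative sketch of that kind of setup; every value below is an assumption, not the file's actual lines:

```python
import numpy as np

# Illustrative SAXPY problem setup (sizes and values are assumptions).
NUM_THREADS = 512    # threads per block
NUM_BLOCKS = 32768   # blocks per grid

a = np.array([2.0], dtype=np.float32)                    # SAXPY scalar
n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint32)  # element count
bufferSize = n * a.itemsize                              # bytes per vector

hX = np.random.rand(n).astype(dtype=np.float32)  # host input vectors
hY = np.random.rand(n).astype(dtype=np.float32)
hOut = np.zeros(n).astype(dtype=np.float32)      # host output buffer
```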

Next, get all your data prepared and transferred to the GPU. For increased
@@ -175,18 +185,18 @@ Python doesn’t have a natural concept of pointers, yet `cuMemcpyHtoDAsync` exp
XX.

```{code-cell} python
-err, dXclass = cuda.cuMemAlloc(bufferSize)
-err, dYclass = cuda.cuMemAlloc(bufferSize)
-err, dOutclass = cuda.cuMemAlloc(bufferSize)
+dXclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
+dYclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
+dOutclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))

-err, stream = cuda.cuStreamCreate(0)
+stream = checkCudaErrors(cuda.cuStreamCreate(0))

-err, = cuda.cuMemcpyHtoDAsync(
+checkCudaErrors(cuda.cuMemcpyHtoDAsync(
    dXclass, hX.ctypes.data, bufferSize, stream
-)
-err, = cuda.cuMemcpyHtoDAsync(
+))
+checkCudaErrors(cuda.cuMemcpyHtoDAsync(
    dYclass, hY.ctypes.data, bufferSize, stream
-)
+))
```
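
Two details behind this hunk: the host addresses passed to `cuMemcpyHtoDAsync` come from NumPy's ctypes interface, and the next hunk's header shows the line that packs kernel arguments into an array of addresses. A sketch of that elided packing (`a` and `n` come from the setup sketched earlier; wrapping the device handles into `dX`, `dY`, `dOut` is an assumption consistent with the names above):

```python
# Host pointers: NumPy exposes a buffer's address as a plain integer.
print(hex(hX.ctypes.data))

# Device handles become integers, then each argument's address is packed
# into a uint64 array, matching the packing line in the next hunk header.
dX = np.array([int(dXclass)], dtype=np.uint64)
dY = np.array([int(dYclass)], dtype=np.uint64)
dOut = np.array([int(dOutclass)], dtype=np.uint64)

args = [a, dX, dY, dOut, n]
args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
```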

With data prep and resource allocation finished, the kernel is ready to be
@@ -213,7 +223,7 @@ args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
Now the kernel can be launched:

```{code-cell} python
-err, = cuda.cuLaunchKernel(
+checkCudaErrors(cuda.cuLaunchKernel(
    kernel,
    NUM_BLOCKS,  # grid x dim
    1,  # grid y dim
@@ -225,12 +235,12 @@ err, = cuda.cuLaunchKernel(
    stream,  # stream
    args.ctypes.data,  # kernel arguments
    0,  # extra (ignore)
-)
+))

-err, = cuda.cuMemcpyDtoHAsync(
+checkCudaErrors(cuda.cuMemcpyDtoHAsync(
    hOut.ctypes.data, dOutclass, bufferSize, stream
-)
-err, = cuda.cuStreamSynchronize(stream)
+))
+checkCudaErrors(cuda.cuStreamSynchronize(stream))
```
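
The next hunk's header mentions verifying the results before cleanup. A minimal sketch of such a check for SAXPY, reusing the host arrays from the setup sketch (`hZ` and the tolerance check are assumptions):

```python
# Sketch: recompute SAXPY on the host and compare with the device output.
hZ = a * hX + hY
if not np.allclose(hOut, hZ):
    raise ValueError("Error outside tolerance for host-device vectors")
```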

The `cuLaunchKernel` function takes the compiled module kernel and execution
@@ -252,12 +262,12 @@ Perform verification of the data to ensure correctness and finish the code with
memory cleanup.

```{code-cell} python
-err, = cuda.cuStreamDestroy(stream)
-err, = cuda.cuMemFree(dXclass)
-err, = cuda.cuMemFree(dYclass)
-err, = cuda.cuMemFree(dOutclass)
-err, = cuda.cuModuleUnload(module)
-err, = cuda.cuCtxDestroy(context)
+checkCudaErrors(cuda.cuStreamDestroy(stream))
+checkCudaErrors(cuda.cuMemFree(dXclass))
+checkCudaErrors(cuda.cuMemFree(dYclass))
+checkCudaErrors(cuda.cuMemFree(dOutclass))
+checkCudaErrors(cuda.cuModuleUnload(module))
+checkCudaErrors(cuda.cuCtxDestroy(context))
```

## Performance
