Skip to content

Commit bef50a4

Browse files
committed
Rebase to CTK 11.5
1 parent 427c597 commit bef50a4

File tree

103 files changed

+14297
-66841
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+14297
-66841
lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
cuda/_version.py export-subst

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ __pycache__/
1212
.benchmarks/
1313
*.cpp
1414
!cuda/_lib/param_packer.cpp
15+
!cuda/_cuda/loader.cpp
1516

1617
# Distribution / packaging
1718
.Python
@@ -77,6 +78,7 @@ instance/
7778

7879
# Sphinx documentation
7980
docs/_build/
81+
docs_src/_build/
8082

8183
# PyBuilder
8284
.pybuilder/
@@ -142,4 +144,4 @@ dmypy.json
142144
.pytype/
143145

144146
# Cython debug symbols
145-
cython_debug/
147+
cython_debug/

README.md

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,8 @@ Dependencies of the CUDA-Python bindings and some versions that are known to
88
work are as follows:
99

1010
* Driver: Linux (450.80.02 or later), Windows (456.38 or later)
11-
* CUDA Toolkit 11.0 to 11.4 - e.g. 11.4.48
11+
* CUDA Toolkit 11.0 to 11.5
1212
* Cython - e.g. 0.29.21
13-
* Versioneer - e.g. 0.20
1413

1514
### Compilation
1615

@@ -36,20 +35,6 @@ to use the module in-place in your current Python environment (e.g. for testing
3635
of porting other libraries to use the binding).
3736

3837

39-
### Build the Docs
40-
41-
```
42-
conda env create -f docs/environment-docs.yml
43-
conda activate cuda-python-docs
44-
```
45-
Then compile and install `cuda-python` following the steps above.
46-
47-
```
48-
cd docs
49-
make html
50-
open build/html/index.html
51-
```
52-
5338
### Build the Docs
5439

5540
```
@@ -81,7 +66,6 @@ Dependencies of the test execution and some versions that are known to
8166
work are as follows:
8267

8368
* numpy-1.19.5
84-
* numba-0.53.1
8569
* matplotlib-3.3.4
8670
* scipy-1.6.3
8771
* pytest-benchmark-3.4.1
@@ -91,30 +75,29 @@ work are as follows:
9175
You can run the included tests with:
9276

9377
```
94-
pytest
78+
python -m pytest
9579
```
80+
### Benchmark
9681

97-
### Samples
98-
99-
You can run the included tests with:
82+
You can run benchmark-only tests with:
10083

10184
```
102-
pytest examples
85+
python -m pytest --benchmark-only
10386
```
10487

105-
### Benchmark
88+
### Samples
10689

107-
You can run benchmark only tests with:
90+
You can run the included samples with:
10891

10992
```
110-
pytest --benchmark-only
93+
python -m pytest examples
11194
```
11295

11396
## Examples
11497

11598
The included examples are:
11699

117-
- `examples/extra/jit_program.py`: Demonstrates the use of the API to compile and
100+
- `examples/extra/jit_program_test.py`: Demonstrates the use of the API to compile and
118101
launch a kernel on the device. Includes device memory allocation /
119102
deallocation, transfers between host and device, creation and usage of
120103
streams, and context management.

cuda/_cuda/ccuda.pxd

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ cdef CUresult _cuDeviceGetMemPool(CUmemoryPool* pool, CUdevice dev) nogil except
4141

4242
cdef CUresult _cuDeviceGetDefaultMemPool(CUmemoryPool* pool_out, CUdevice dev) nogil except ?CUDA_ERROR_NOT_FOUND
4343

44+
cdef CUresult _cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope) nogil except ?CUDA_ERROR_NOT_FOUND
45+
4446
cdef CUresult _cuDeviceGetProperties(CUdevprop* prop, CUdevice dev) nogil except ?CUDA_ERROR_NOT_FOUND
4547

4648
cdef CUresult _cuDeviceComputeCapability(int* major, int* minor, CUdevice dev) nogil except ?CUDA_ERROR_NOT_FOUND
@@ -704,5 +706,3 @@ cdef CUresult _cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource*
704706
cdef CUresult _cuGetProcAddress(const char* symbol, void** pfn, int cudaVersion, cuuint64_t flags) nogil except ?CUDA_ERROR_NOT_FOUND
705707

706708
cdef CUresult _cuGetExportTable(const void** ppExportTable, const CUuuid* pExportTableId) nogil except ?CUDA_ERROR_NOT_FOUND
707-
708-
cdef CUresult _cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope) nogil except ?CUDA_ERROR_NOT_FOUND

cuda/_cuda/ccuda.pyx

Lines changed: 41 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@
88
IF UNAME_SYSNAME == "Windows":
99
import win32api
1010
import struct
11+
from pywintypes import error
1112
ELSE:
1213
cimport cuda._lib.dlfcn as dlfcn
14+
import sys
15+
cimport cuda._cuda.loader as loader
1316
cdef bint __cuPythonInit = False
1417
cdef void *__cuGetErrorString = NULL
1518
cdef void *__cuGetErrorName = NULL
@@ -28,6 +31,7 @@ cdef void *__cuDeviceGetNvSciSyncAttributes = NULL
2831
cdef void *__cuDeviceSetMemPool = NULL
2932
cdef void *__cuDeviceGetMemPool = NULL
3033
cdef void *__cuDeviceGetDefaultMemPool = NULL
34+
cdef void *__cuFlushGPUDirectRDMAWrites = NULL
3135
cdef void *__cuDeviceGetProperties = NULL
3236
cdef void *__cuDeviceComputeCapability = NULL
3337
cdef void *__cuDevicePrimaryCtxRetain = NULL
@@ -360,29 +364,34 @@ cdef void *__cuGraphicsMapResources = NULL
360364
cdef void *__cuGraphicsUnmapResources = NULL
361365
cdef void *__cuGetProcAddress = NULL
362366
cdef void *__cuGetExportTable = NULL
363-
cdef void *__cuFlushGPUDirectRDMAWrites = NULL
364367

365368
cdef int cuPythonInit() nogil except -1:
366369
global __cuPythonInit
367370
if __cuPythonInit:
368371
return 0
369372
__cuPythonInit = True
370-
IF UNAME_SYSNAME == "Windows":
371-
LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
372-
with gil:
373-
if 8 * struct.calcsize("P") == 32:
374-
try:
375-
handle = win32api.LoadLibraryEx('nvcuda32.dll', 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
376-
except:
377-
handle = win32api.LoadLibraryEx('nvcuda.dll', 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
378-
else:
379-
handle = win32api.LoadLibraryEx('nvcuda.dll', 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
380-
ELSE:
381-
handle = dlfcn.dlopen('libcuda.so', dlfcn.RTLD_NOW)
382-
if (handle == NULL):
383-
with gil:
373+
cdef char libPath[260]
374+
libPath[0] = 0
375+
with gil:
376+
status = loader.getCUDALibraryPath(libPath, sys.maxsize > 2**32)
377+
if status == 0 and len(libPath) != 0:
378+
path = libPath.decode('utf-8')
379+
else:
380+
IF UNAME_SYSNAME == "Windows":
381+
path = 'nvcuda.dll'
382+
ELSE:
383+
path = 'libcuda.so'
384+
385+
IF UNAME_SYSNAME == "Windows":
386+
LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
387+
try:
388+
handle = win32api.LoadLibraryEx(path, 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
389+
except error as e:
390+
raise RuntimeError('Failed to LoadLibraryEx ' + path)
391+
ELSE:
392+
handle = dlfcn.dlopen(bytes(path, encoding='utf-8'), dlfcn.RTLD_NOW)
393+
if (handle == NULL):
384394
raise RuntimeError('Failed to dlopen libcuda.so')
385-
386395
# All Globals
387396
global __cuGetErrorString
388397
global __cuGetErrorName
@@ -401,6 +410,7 @@ cdef int cuPythonInit() nogil except -1:
401410
global __cuDeviceSetMemPool
402411
global __cuDeviceGetMemPool
403412
global __cuDeviceGetDefaultMemPool
413+
global __cuFlushGPUDirectRDMAWrites
404414
global __cuDeviceGetProperties
405415
global __cuDeviceComputeCapability
406416
global __cuDevicePrimaryCtxRetain
@@ -733,7 +743,6 @@ cdef int cuPythonInit() nogil except -1:
733743
global __cuGraphicsUnmapResources
734744
global __cuGetProcAddress
735745
global __cuGetExportTable
736-
global __cuFlushGPUDirectRDMAWrites
737746
# Get latest __cuGetProcAddress
738747
IF UNAME_SYSNAME == "Windows":
739748
with gil:
@@ -763,6 +772,7 @@ cdef int cuPythonInit() nogil except -1:
763772
_cuGetProcAddress('cuDeviceSetMemPool', &__cuDeviceSetMemPool, 11020, 0)
764773
_cuGetProcAddress('cuDeviceGetMemPool', &__cuDeviceGetMemPool, 11020, 0)
765774
_cuGetProcAddress('cuDeviceGetDefaultMemPool', &__cuDeviceGetDefaultMemPool, 11020, 0)
775+
_cuGetProcAddress('cuFlushGPUDirectRDMAWrites', &__cuFlushGPUDirectRDMAWrites, 11030, 0)
766776
_cuGetProcAddress('cuDeviceGetProperties', &__cuDeviceGetProperties, 2000, 0)
767777
_cuGetProcAddress('cuDeviceComputeCapability', &__cuDeviceComputeCapability, 2000, 0)
768778
_cuGetProcAddress('cuDevicePrimaryCtxRetain', &__cuDevicePrimaryCtxRetain, 7000, 0)
@@ -1095,7 +1105,6 @@ cdef int cuPythonInit() nogil except -1:
10951105
_cuGetProcAddress('cuGraphicsUnmapResources', &__cuGraphicsUnmapResources, 3000, 0)
10961106
_cuGetProcAddress('cuGetProcAddress', &__cuGetProcAddress, 11030, 0)
10971107
_cuGetProcAddress('cuGetExportTable', &__cuGetExportTable, 3000, 0)
1098-
_cuGetProcAddress('cuFlushGPUDirectRDMAWrites', &__cuFlushGPUDirectRDMAWrites, 11030, 0)
10991108
return 0
11001109
# dlsym calls
11011110
IF UNAME_SYSNAME == "Windows":
@@ -1168,6 +1177,10 @@ cdef int cuPythonInit() nogil except -1:
11681177
__cuDeviceGetDefaultMemPool = <void*><unsigned long long>win32api.GetProcAddress(handle, 'cuDeviceGetDefaultMemPool')
11691178
except:
11701179
pass
1180+
try:
1181+
__cuFlushGPUDirectRDMAWrites = <void*><unsigned long long>win32api.GetProcAddress(handle, 'cuFlushGPUDirectRDMAWrites')
1182+
except:
1183+
pass
11711184
try:
11721185
__cuDeviceGetProperties = <void*><unsigned long long>win32api.GetProcAddress(handle, 'cuDeviceGetProperties')
11731186
except:
@@ -2496,10 +2509,6 @@ cdef int cuPythonInit() nogil except -1:
24962509
__cuGetExportTable = <void*><unsigned long long>win32api.GetProcAddress(handle, 'cuGetExportTable')
24972510
except:
24982511
pass
2499-
try:
2500-
__cuFlushGPUDirectRDMAWrites = <void*><unsigned long long>win32api.GetProcAddress(handle, 'cuFlushGPUDirectRDMAWrites')
2501-
except:
2502-
pass
25032512
ELSE:
25042513
__cuGetErrorString = dlfcn.dlsym(handle, 'cuGetErrorString')
25052514
__cuGetErrorName = dlfcn.dlsym(handle, 'cuGetErrorName')
@@ -2518,6 +2527,7 @@ cdef int cuPythonInit() nogil except -1:
25182527
__cuDeviceSetMemPool = dlfcn.dlsym(handle, 'cuDeviceSetMemPool')
25192528
__cuDeviceGetMemPool = dlfcn.dlsym(handle, 'cuDeviceGetMemPool')
25202529
__cuDeviceGetDefaultMemPool = dlfcn.dlsym(handle, 'cuDeviceGetDefaultMemPool')
2530+
__cuFlushGPUDirectRDMAWrites = dlfcn.dlsym(handle, 'cuFlushGPUDirectRDMAWrites')
25212531
__cuDeviceGetProperties = dlfcn.dlsym(handle, 'cuDeviceGetProperties')
25222532
__cuDeviceComputeCapability = dlfcn.dlsym(handle, 'cuDeviceComputeCapability')
25232533
__cuDevicePrimaryCtxRetain = dlfcn.dlsym(handle, 'cuDevicePrimaryCtxRetain')
@@ -2850,7 +2860,6 @@ cdef int cuPythonInit() nogil except -1:
28502860
__cuGraphicsUnmapResources = dlfcn.dlsym(handle, 'cuGraphicsUnmapResources')
28512861
__cuGetProcAddress = dlfcn.dlsym(handle, 'cuGetProcAddress')
28522862
__cuGetExportTable = dlfcn.dlsym(handle, 'cuGetExportTable')
2853-
__cuFlushGPUDirectRDMAWrites = dlfcn.dlsym(handle, 'cuFlushGPUDirectRDMAWrites')
28542863

28552864
cdef CUresult _cuGetErrorString(CUresult error, const char** pStr) nogil except ?CUDA_ERROR_NOT_FOUND:
28562865
global __cuGetErrorString
@@ -3005,6 +3014,15 @@ cdef CUresult _cuDeviceGetDefaultMemPool(CUmemoryPool* pool_out, CUdevice dev) n
30053014
err = (<CUresult (*)(CUmemoryPool*, CUdevice) nogil> __cuDeviceGetDefaultMemPool)(pool_out, dev)
30063015
return err
30073016

3017+
cdef CUresult _cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope) nogil except ?CUDA_ERROR_NOT_FOUND:
3018+
global __cuFlushGPUDirectRDMAWrites
3019+
cuPythonInit()
3020+
if __cuFlushGPUDirectRDMAWrites == NULL:
3021+
with gil:
3022+
raise RuntimeError('Function "cuFlushGPUDirectRDMAWrites" not found')
3023+
err = (<CUresult (*)(CUflushGPUDirectRDMAWritesTarget, CUflushGPUDirectRDMAWritesScope) nogil> __cuFlushGPUDirectRDMAWrites)(target, scope)
3024+
return err
3025+
30083026
cdef CUresult _cuDeviceGetProperties(CUdevprop* prop, CUdevice dev) nogil except ?CUDA_ERROR_NOT_FOUND:
30093027
global __cuDeviceGetProperties
30103028
cuPythonInit()
@@ -5992,12 +6010,3 @@ cdef CUresult _cuGetExportTable(const void** ppExportTable, const CUuuid* pExpor
59926010
raise RuntimeError('Function "cuGetExportTable" not found')
59936011
err = (<CUresult (*)(const void**, const CUuuid*) nogil> __cuGetExportTable)(ppExportTable, pExportTableId)
59946012
return err
5995-
5996-
cdef CUresult _cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope) nogil except ?CUDA_ERROR_NOT_FOUND:
5997-
global __cuFlushGPUDirectRDMAWrites
5998-
cuPythonInit()
5999-
if __cuFlushGPUDirectRDMAWrites == NULL:
6000-
with gil:
6001-
raise RuntimeError('Function "cuFlushGPUDirectRDMAWrites" not found')
6002-
err = (<CUresult (*)(CUflushGPUDirectRDMAWritesTarget, CUflushGPUDirectRDMAWritesScope) nogil> __cuFlushGPUDirectRDMAWrites)(target, scope)
6003-
return err

0 commit comments

Comments
 (0)