|
3 | 3 | // RUN: %clangxx -Xclang -no-opaque-pointers -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 -S -Xclang -emit-llvm %s -o -| FileCheck %s
|
4 | 4 | // RUN: %clangxx -Xclang -opaque-pointers -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 -S -Xclang -emit-llvm %s -o -| FileCheck %s --check-prefixes=CHECK-OPAQUE
|
5 | 5 |
|
6 |
| -// IMPORTANT: before updating sm version support beyond sm_86 read the following |
| 6 | +// IMPORTANT: before updating sm version support beyond sm_90 read the following |
7 | 7 | // NOTE!
|
8 | 8 |
|
9 | 9 | // NOTE: Technically the 'wrong' ptx instruction is called by
|
10 | 10 | // joint_matrix_load/joint_matrix_store in this case: notice that the load and
|
11 | 11 | // store instructions use shape m16n16k16, rather than the correct shape
|
12 | 12 | // m16n16k8. The 'wrong' ptx instruction is used because it produces the correct
|
13 |
| -// SASS instructions for all existing supported sm versions: sm_80 and sm_86. |
14 |
| -// The reason for this ptx instruction redundancy is due to the ptx naming |
15 |
| -// convention for the mnk shape triple; however we cannot in principle a priori |
16 |
| -// know that future sm versions will behave in the same way and that this |
17 |
| -// redundancy will continue as future architecture is released. This should be |
18 |
| -// validated before supporting any sm versions beyond sm_86. The reason that we |
19 |
| -// choose to use the m16n16k16 instruction is that it allows the significant |
20 |
| -// advantage of being able to use a portable interface across Intel and Nvidia |
| 13 | +// SASS instructions for all existing sm versions supporting tf32: sm_80, sm_86, |
| 14 | +// sm_87, sm_89, and sm_90. The reason for this ptx instruction redundancy is |
| 15 | +// due to the ptx naming convention for the mnk shape triple; however we cannot |
| 16 | +// in principle a priori know that future sm versions will behave in the same |
| 17 | +// way and that this redundancy will continue as future architectures are
| 18 | +// released. This should be validated before supporting any sm versions beyond |
| 19 | +// sm_90. The reason that we choose to use the m16n16k16 instruction is that it |
| 20 | +// allows us to use a simpler portable interface across Intel and Nvidia |
21 | 21 | // backends.
|
22 | 22 |
|
23 | 23 | #include <sycl/sycl.hpp>
|
|
0 commit comments