Skip to content

Commit 52e7ca9

Browse files
[LLVM][NVPTX] Add support for ldmatrix extensions introduced in PTX 8.6 (#124899)
This commit adds support for the following ldmatrix extensions introduced in PTX 8.6: - Support for m16n16 with b8 type with mandatory transpose - Support for m8n16 with the new source/destination formats (b8x16.b6x16_p32, b8x16.b4x16_p64) The above extensions are only supported on sm_100a, sm_101a, sm_120a. Please refer to the PTX ISA for more information: https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-ldmatrix
1 parent 269c40f commit 52e7ca9

File tree

7 files changed

+176
-15
lines changed

7 files changed

+176
-15
lines changed

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ class WMMA_REGS<string Geom, string Frag, string PtxEltType> {
7272
string frag = Frag;
7373
string ptx_elt_type = PtxEltType;
7474
string gft = Geom#":"#Frag#":"#ptx_elt_type;
75+
string gf = Geom#":"#Frag;
7576
string ft = frag#":"#ptx_elt_type;
7677
list<LLVMType> regs = !cond(
7778
// mma fp ops use smaller fragments than wmma fp ops
@@ -214,9 +215,19 @@ class WMMA_REGS<string Geom, string Frag, string PtxEltType> {
214215
!eq(gft,"m16n8k256:d:s32") : !listsplat(llvm_i32_ty, 4),
215216

216217
// ldmatrix b16 -> s32 @ m8n8
217-
!eq(gft,"m8n8:x1:b16") : !listsplat(llvm_i32_ty, 1),
218-
!eq(gft,"m8n8:x2:b16") : !listsplat(llvm_i32_ty, 2),
219-
!eq(gft,"m8n8:x4:b16") : !listsplat(llvm_i32_ty, 4),
218+
!eq(gf,"m8n8:x1") : !listsplat(llvm_i32_ty, 1),
219+
!eq(gf,"m8n8:x2") : !listsplat(llvm_i32_ty, 2),
220+
!eq(gf,"m8n8:x4") : !listsplat(llvm_i32_ty, 4),
221+
222+
// ldmatrix b8, b8x16.b6x16_p32, b8x16.b4x16_p64 -> s32 @ m16n16
223+
!eq(gf,"m16n16:x1") : !listsplat(llvm_i32_ty, 2),
224+
!eq(gf,"m16n16:x2") : !listsplat(llvm_i32_ty, 4),
225+
226+
// ldmatrix b8x16.b6x16_p32, b8x16.b4x16_p64 -> s32 @ m8n16
227+
!eq(gf,"m8n16:x1") : !listsplat(llvm_i32_ty, 1),
228+
!eq(gf,"m8n16:x2") : !listsplat(llvm_i32_ty, 2),
229+
!eq(gf,"m8n16:x4") : !listsplat(llvm_i32_ty, 4),
230+
220231
);
221232
}
222233

@@ -421,7 +432,16 @@ class NVVM_MMA_OPS {
421432

422433
list<WMMA_REGS> ldmatrix_b16_ops = LDMATRIX_OPS<
423434
["m8n8"], ["x1", "x2", "x4"], ["b16"]>.ret;
424-
list<WMMA_REGS> all_ldmatrix_ops = ldmatrix_b16_ops;
435+
436+
list<WMMA_REGS> ldmatrix_geom_m16n16_ops = LDMATRIX_OPS<
437+
["m16n16"], ["x1", "x2"], ["b8", "b8x16.b6x16_p32", "b8x16.b4x16_p64"]>.ret;
438+
439+
list<WMMA_REGS> ldmatrix_geom_m8n16_ops = LDMATRIX_OPS<
440+
["m8n16"], ["x1", "x2", "x4"], ["b8x16.b6x16_p32", "b8x16.b4x16_p64"]>.ret;
441+
442+
list<WMMA_REGS> all_ldmatrix_ops = !listconcat(ldmatrix_b16_ops,
443+
ldmatrix_geom_m16n16_ops,
444+
ldmatrix_geom_m8n16_ops);
425445
}
426446

427447
def NVVM_MMA_OPS : NVVM_MMA_OPS;
@@ -546,13 +566,18 @@ class NVVM_MMA_SUPPORTED<list<WMMA_REGS> frags, string layout_a, string layout_b
546566
// if NVVM_LDMATRIX_SUPPORTED<...>.ret then
547567
// def : FOO<>; // The record will only be defined for supported ops.
548568
//
549-
class NVVM_LDMATRIX_SUPPORTED<WMMA_REGS frag> {
569+
class NVVM_LDMATRIX_SUPPORTED<WMMA_REGS frag, bit trans> {
550570
string g = frag.geom;
551571
string t = frag.ptx_elt_type;
552572

553573
bit ret = !cond(
554-
// Only currently support m8n8 and b16
555574
!and(!eq(g, "m8n8"), !eq(t, "b16")): true,
575+
!and(!eq(g, "m16n16"), !eq(t, "b8"), !eq(trans, 1)): true,
576+
!and(!eq(g, "m16n16"), !eq(t, "b8x16.b6x16_p32"), !eq(trans, 1)): true,
577+
!and(!eq(g, "m16n16"), !eq(t, "b8x16.b4x16_p64"), !eq(trans, 1)): true,
578+
!and(!eq(g, "m8n16"), !eq(t, "b8"), !eq(trans, 0)): true,
579+
!and(!eq(g, "m8n16"), !eq(t, "b8x16.b6x16_p32"), !eq(trans, 0)): true,
580+
!and(!eq(g, "m8n16"), !eq(t, "b8x16.b4x16_p64"), !eq(trans, 0)): true,
556581
true: false
557582
);
558583
}
@@ -4983,7 +5008,7 @@ class NVVM_LDMATRIX<WMMA_REGS Frag, int Transposed>
49835008

49845009
foreach transposed = [0, 1] in {
49855010
foreach frag = NVVM_MMA_OPS.all_ldmatrix_ops in {
4986-
if NVVM_LDMATRIX_SUPPORTED<frag>.ret then {
5011+
if NVVM_LDMATRIX_SUPPORTED<frag, transposed>.ret then {
49875012
def LDMATRIX_NAME<frag, transposed>.record
49885013
: NVVM_LDMATRIX<frag, transposed>;
49895014
}

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3681,7 +3681,12 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
36813681
case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
36823682
case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
36833683
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3684-
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
3684+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:
3685+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:
3686+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:
3687+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:
3688+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:
3689+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {
36853690
Info.opc = ISD::INTRINSIC_W_CHAIN;
36863691
Info.memVT = MVT::v4i32;
36873692
Info.ptrVal = I.getArgOperand(0);
@@ -3721,7 +3726,9 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
37213726
case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
37223727
case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
37233728
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
3724-
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
3729+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:
3730+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:
3731+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {
37253732
Info.opc = ISD::INTRINSIC_W_CHAIN;
37263733
Info.memVT = MVT::i32;
37273734
Info.ptrVal = I.getArgOperand(0);
@@ -3817,7 +3824,12 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
38173824
case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
38183825
case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
38193826
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
3820-
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
3827+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:
3828+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:
3829+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:
3830+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:
3831+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:
3832+
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {
38213833
Info.opc = ISD::INTRINSIC_W_CHAIN;
38223834
Info.memVT = MVT::v2i32;
38233835
Info.ptrVal = I.getArgOperand(0);

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7052,6 +7052,9 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
70527052
!eq(ptx_elt_type, "tf32") : Int32Regs,
70537053
!eq(ptx_elt_type, "s32") : Int32Regs,
70547054
!eq(ptx_elt_type, "b16") : Int32Regs,
7055+
!eq(ptx_elt_type, "b8") : Int32Regs,
7056+
!eq(ptx_elt_type, "b8x16.b6x16_p32") : Int32Regs,
7057+
!eq(ptx_elt_type, "b8x16.b4x16_p64") : Int32Regs,
70557058
!eq(ptx_elt_type, "s8") : Int32Regs,
70567059
!eq(ptx_elt_type, "u8") : Int32Regs,
70577060
!eq(ptx_elt_type, "s4") : Int32Regs,
@@ -7139,7 +7142,27 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
71397142

71407143
!and(!eq(op,"ldmatrix"),
71417144
!eq(ptx_elt_type,"b16"),
7142-
!eq(geom, "m8n8")) : [hasSM<75>, hasPTX<65>]);
7145+
!eq(geom, "m8n8")) : [hasSM<75>, hasPTX<65>],
7146+
7147+
!and(!eq(op,"ldmatrix"),
7148+
!eq(ptx_elt_type,"b8"),
7149+
!eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
7150+
7151+
!and(!eq(op,"ldmatrix"),
7152+
!eq(ptx_elt_type,"b8x16.b6x16_p32"),
7153+
!eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
7154+
7155+
!and(!eq(op,"ldmatrix"),
7156+
!eq(ptx_elt_type,"b8x16.b4x16_p64"),
7157+
!eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
7158+
7159+
!and(!eq(op,"ldmatrix"),
7160+
!eq(ptx_elt_type,"b8x16.b6x16_p32"),
7161+
!eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
7162+
7163+
!and(!eq(op,"ldmatrix"),
7164+
!eq(ptx_elt_type,"b8x16.b4x16_p64"),
7165+
!eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>]);
71437166

71447167
// template DAGs for instruction inputs/output.
71457168
dag Outs = !dag(outs, ptx_regs, reg_names);
@@ -7414,7 +7437,7 @@ defset list<WMMA_INSTR> LDMATRIXs = {
74147437
foreach transposed = [false, true] in {
74157438
foreach space = [".shared", ""] in {
74167439
foreach frag = NVVM_MMA_OPS.all_ldmatrix_ops in
7417-
if NVVM_LDMATRIX_SUPPORTED<frag>.ret then
7440+
if NVVM_LDMATRIX_SUPPORTED<frag, transposed>.ret then
74187441
def : LDMATRIX<WMMA_REGINFO<frag, "ldmatrix">, transposed, space>;
74197442
} // space
74207443
} // transposed
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Check all variants of instructions supported by PTX86 on SM100a
2+
# RUN: %python %s --ptx=86 --gpu-arch=100 --aa > %t-ptx86-sm_100a.ll
3+
# RUN: FileCheck %t-ptx86-sm_100a.ll < %t-ptx86-sm_100a.ll \
4+
# RUN: --check-prefixes=PTX86LDMATRIX-DAG
5+
# RUN: FileCheck %t-ptx86-sm_100a.ll < %t-ptx86-sm_100a.ll \
6+
# RUN: --check-prefixes=PTX86LDMATRIX-DAG
7+
# RUN: llc < %t-ptx86-sm_100a.ll -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 \
8+
# RUN: | FileCheck %t-ptx86-sm_100a.ll
9+
# RUN: %if ptxas-12.7 %{ \
10+
# RUN: llc < %t-ptx86-sm_100a.ll -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 \
11+
# RUN: | %ptxas-verify -arch=sm_100a \
12+
# RUN: %}
13+
14+
import wmma
15+
16+
wmma.main()
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Check all variants of instructions supported by PTX86 on SM101a
2+
# RUN: %python %s --ptx=86 --gpu-arch=101 --aa > %t-ptx86-sm_101a.ll
3+
# RUN: FileCheck %t-ptx86-sm_101a.ll < %t-ptx86-sm_101a.ll \
4+
# RUN: --check-prefixes=PTX86LDMATRIX-DAG
5+
# RUN: FileCheck %t-ptx86-sm_101a.ll < %t-ptx86-sm_101a.ll \
6+
# RUN: --check-prefixes=PTX86LDMATRIX-DAG
7+
# RUN: llc < %t-ptx86-sm_101a.ll -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 \
8+
# RUN: | FileCheck %t-ptx86-sm_101a.ll
9+
# RUN: %if ptxas-12.7 %{ \
10+
# RUN: llc < %t-ptx86-sm_101a.ll -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 \
11+
# RUN: | %ptxas-verify -arch=sm_101a \
12+
# RUN: %}
13+
14+
import wmma
15+
16+
wmma.main()
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Check all variants of instructions supported by PTX86 on SM120a
2+
# RUN: %python %s --ptx=86 --gpu-arch=120 --aa > %t-ptx86-sm_120a.ll
3+
# RUN: FileCheck %t-ptx86-sm_120a.ll < %t-ptx86-sm_120a.ll \
4+
# RUN: --check-prefixes=PTX86LDMATRIX-DAG
5+
# RUN: FileCheck %t-ptx86-sm_120a.ll < %t-ptx86-sm_120a.ll \
6+
# RUN: --check-prefixes=PTX86LDMATRIX-DAG
7+
# RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \
8+
# RUN: | FileCheck %t-ptx86-sm_120a.ll
9+
# RUN: %if ptxas-12.7 %{ \
10+
# RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \
11+
# RUN: | %ptxas-verify -arch=sm_120a \
12+
# RUN: %}
13+
14+
import wmma
15+
16+
wmma.main()

llvm/test/CodeGen/NVPTX/wmma.py

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ def __init__(self, ptx_type):
1919
"f64": "double",
2020
"s32": "i32",
2121
"b16": "i32",
22+
"b8": "i32",
23+
"b8x16.b6x16_p32": "i32",
24+
"b8x16.b4x16_p64": "i32",
2225
"s8": "i32",
2326
"u8": "i32",
2427
"s4": "i32",
@@ -161,6 +164,18 @@ def __init__(self, geom, frag, ptx_elt_type):
161164
"m8n8:x1:b16": 1,
162165
"m8n8:x2:b16": 2,
163166
"m8n8:x4:b16": 4,
167+
"m16n16:x1:b8": 2,
168+
"m16n16:x2:b8": 4,
169+
"m16n16:x1:b8x16.b6x16_p32": 2,
170+
"m16n16:x2:b8x16.b6x16_p32": 4,
171+
"m16n16:x1:b8x16.b4x16_p64": 2,
172+
"m16n16:x2:b8x16.b4x16_p64": 4,
173+
"m8n16:x1:b8x16.b6x16_p32": 1,
174+
"m8n16:x2:b8x16.b6x16_p32": 2,
175+
"m8n16:x4:b8x16.b6x16_p32": 4,
176+
"m8n16:x1:b8x16.b4x16_p64": 1,
177+
"m8n16:x2:b8x16.b4x16_p64": 2,
178+
"m8n16:x4:b8x16.b4x16_p64": 4,
164179
}.get(
165180
"%s:%s:%s" % (geom, frag, ptx_elt_type),
166181
{
@@ -289,7 +304,15 @@ def get_ldst_ops(kind):
289304

290305

291306
def get_ldmatrix_ops():
292-
return make_ldmatrix_ops(["m8n8"], ["x1", "x2", "x4"], ["b16"])
307+
return (
308+
make_ldmatrix_ops(["m8n8"], ["x1", "x2", "x4"], ["b16"])
309+
+ make_ldmatrix_ops(
310+
["m16n16"], ["x1", "x2"], ["b8", "b8x16.b6x16_p32", "b8x16.b4x16_p64"]
311+
)
312+
+ make_ldmatrix_ops(
313+
["m8n16"], ["x1", "x2", "x4"], ["b8x16.b6x16_p32", "b8x16.b4x16_p64"]
314+
)
315+
)
293316

294317

295318
def is_wmma_geom_supported(geom):
@@ -330,9 +353,22 @@ def is_mma_geom_supported(geom):
330353
def is_ldmatrix_geom_supported(geom):
331354
if geom in ["m8n8"]:
332355
return ptx_version >= 65 and gpu_arch >= 75
356+
elif geom in ["m16n16"]:
357+
return ptx_version >= 86 and gpu_arch >= 100 and aa
358+
elif geom in ["m8n16"]:
359+
return ptx_version >= 86 and gpu_arch >= 100 and aa
333360
assert False # Unexpected geometry.
334361

335362

363+
def is_ldmatrix_trans_supported(geom, trans):
364+
if geom in ["m8n8"]:
365+
return True
366+
elif geom in ["m16n16"]:
367+
return trans == ".trans"
368+
elif geom in ["m8n16"]:
369+
return trans == ""
370+
assert False # Unexpected geometry.
371+
336372
def is_type_supported(ptx_type):
337373
if ptx_type in ["s8", "u8", "s32"]:
338374
return ptx_version >= 63 and gpu_arch >= 72
@@ -417,10 +453,11 @@ def is_ldst_variant_supported(frag, layout):
417453
return True
418454

419455

420-
def is_ldmatrix_variant_supported(frag):
456+
def is_ldmatrix_variant_supported(frag, trans):
421457
if not (
422458
is_type_supported(frag.mma_type.ptx_type)
423459
and is_ldmatrix_geom_supported(frag.geom)
460+
and is_ldmatrix_trans_supported(frag.geom, trans)
424461
):
425462
return False
426463
return frag.frag in ["x1", "x2", "x4"]
@@ -653,7 +690,7 @@ def gen_ldmatrix_tests():
653690
["", ".shared"],
654691
["", ".trans"],
655692
):
656-
if not is_ldmatrix_variant_supported(frag):
693+
if not is_ldmatrix_variant_supported(frag, trans):
657694
continue
658695

659696
params = {
@@ -944,6 +981,19 @@ def gen_check_unsupported_ops(items):
944981
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16
945982
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16
946983
984+
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m16n16.x1.trans.shared.b8
985+
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m16n16.x2.trans.shared.b8
986+
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m16n16.x1.trans.b8x16.b6x16_p32
987+
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m16n16.x1.trans.b8x16.b4x16_p64
988+
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m16n16.x2.trans.b8x16.b6x16_p32
989+
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m16n16.x2.trans.b8x16.b4x16_p64
990+
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x1.b8x16.b6x16_p32
991+
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x1.b8x16.b4x16_p64
992+
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x2.b8x16.b6x16_p32
993+
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x2.b8x16.b4x16_p64
994+
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x4.b8x16.b6x16_p32
995+
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x4.b8x16.b4x16_p64
996+
947997
; PTX71MMA-DAG: mma.m8n8k4.row.col.f64
948998
; PTX71MMA-DAG: mma.m16n8k4.row.col.tf32
949999
; PTX71MMA-DAG: mma.m16n8k8.row.col.tf32
@@ -997,13 +1047,16 @@ def gen_tests():
9971047
def main():
9981048
global ptx_version
9991049
global gpu_arch
1050+
global aa
10001051
parser = argparse.ArgumentParser()
10011052
parser.add_argument("--ptx", type=int, default=60)
10021053
parser.add_argument("--gpu-arch", type=int, default=70)
1054+
parser.add_argument("--aa", action="store_true")
10031055
args = parser.parse_args()
10041056

10051057
ptx_version = args.ptx
10061058
gpu_arch = args.gpu_arch
1059+
aa = args.aa
10071060

10081061
gen_tests()
10091062

0 commit comments

Comments
 (0)