@@ -130,6 +130,51 @@ func.func @tma_load_5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier
130
130
return
131
131
}
132
132
133
// CHECK-LABEL: @tma_load_multicast1d
func.func @tma_load_multicast1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %p : i1) {
  // 1d TMA load with multicast: $4 is the single coordinate, $3 the multicast mask (h = 16-bit reg).
  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4} ], [$2], $3;", "r,l,r,h,r"
  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box[%crd0] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32
  // Predicated form: the i1 predicate is the last operand ($5, b constraint) and guards the instruction.
  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4} ], [$2], $3;", "r,l,r,h,r,b"
  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box[%crd0], predicate = %p : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i1
  return
}
141
+
142
// CHECK-LABEL: @tma_load_multicast2d
func.func @tma_load_multicast2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %p : i1) {
  // 2d TMA load with multicast: coordinates occupy $4-$5.
  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5} ], [$2], $3;", "r,l,r,h,r,r"
  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box[%crd0,%crd1] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32
  // Predicated form: predicate is $6, appended with the b constraint.
  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5} ], [$2], $3;", "r,l,r,h,r,r,b"
  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box[%crd0,%crd1], predicate = %p : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i1
  return
}
150
+
151
// CHECK-LABEL: @tma_load_multicast3d
func.func @tma_load_multicast3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
  // 3d TMA load with multicast: coordinates occupy $4-$6.
  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5, $6} ], [$2], $3;", "r,l,r,h,r,r,r"
  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box[%crd0,%crd1,%crd2] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i32
  // Predicated form: predicate is $7, appended with the b constraint.
  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5, $6} ], [$2], $3;", "r,l,r,h,r,r,r,b"
  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box[%crd0,%crd1,%crd2], predicate = %p : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i32, i1
  return
}
159
+
160
// CHECK-LABEL: @tma_load_multicast4d
func.func @tma_load_multicast4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
  // 4d TMA load with multicast: coordinates occupy $4-$7.
  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5, $6, $7} ], [$2], $3;", "r,l,r,h,r,r,r,r"
  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i32, i32
  // Predicated form: was copy-pasted from the 3d test (dropped %crd3, wrong ".3d"/"@$7");
  // for 4d the predicate is $8 and all four coordinates must be passed.
  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5, $6, $7} ], [$2], $3;", "r,l,r,h,r,r,r,r,b"
  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box[%crd0,%crd1,%crd2,%crd3], predicate = %p : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i32, i32, i1
  return
}
168
+
169
// CHECK-LABEL: @tma_load_multicast5d
func.func @tma_load_multicast5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
  // 5d TMA load with multicast: coordinates occupy $4-$8.
  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5, $6, $7, $8} ], [$2], $3;", "r,l,r,h,r,r,r,r,r"
  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i32, i32, i32
  // Predicated form: predicate is $9, appended with the b constraint.
  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5, $6, $7, $8} ], [$2], $3;", "r,l,r,h,r,r,r,r,r,b"
  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box[%crd0,%crd1,%crd2,%crd3,%crd4], predicate = %p : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i32, i32, i32, i1
  return
}
177
+
133
178
// CHECK-LABEL: @tma_store_1d
134
179
func.func @tma_store_1d (%tmaDescriptor: !llvm.ptr , %src : !llvm.ptr <3 >, %crd0: i32 , %p : i1 ) {
135
180
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [$0, {$2} ], [$1];", "l,r,r"
0 commit comments