@@ -2138,6 +2138,150 @@ def NVVM_CpAsyncBulkTensorReduceOp :
2138
2138
}];
2139
2139
}
2140
2140
2141
def NVVM_CpAsyncBulkGlobalToSharedClusterOp :
  NVVM_Op<"cp.async.bulk.shared.cluster.global", [AttrSizedOperandSegments]> {
  let summary = "Async bulk copy from global memory to Shared cluster memory";
  let description = [{
    Initiates an asynchronous copy operation from global memory to cluster's
    shared memory.

    The `multicastMask` operand is optional. When it is present, the Op copies
    data from global memory to shared memory of multiple CTAs in the cluster.
    Operand `multicastMask` specifies the destination CTAs in the cluster such
    that each bit position in the 16-bit `multicastMask` operand corresponds to
    the `nvvm.read.ptx.sreg.ctaid` of the destination CTA.

    The `l2CacheHint` operand is optional, and it is used to specify cache
    eviction policy that may be used during the memory access.
    [For more information, see PTX ISA]
    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
  }];

  let arguments = (ins
    LLVM_PointerShared:$dstMem,
    LLVM_PointerGlobal:$srcMem,
    LLVM_PointerShared:$mbar,
    I32:$size,
    Optional<I16>:$multicastMask,
    Optional<I64>:$l2CacheHint);

  let assemblyFormat = [{
    $dstMem `,` $srcMem `,` $mbar `,` $size
    (`multicast_mask` `=` $multicastMask^ )?
    (`l2_cache_hint` `=` $l2CacheHint^ )?
    attr-dict `:` type($dstMem) `,` type($srcMem)
  }];

  string llvmBuilder = [{
    // Intrinsic operand order (note: mbar precedes src, unlike the op's
    // argument order):
    //   dst, mbar, src, size,
    //   multicast_mask, cache_hint,
    //   i1 flag for multicast_mask,
    //   i1 flag for cache_hint
    llvm::SmallVector<llvm::Value *> translatedOperands;
    translatedOperands.push_back($dstMem);
    translatedOperands.push_back($mbar);
    translatedOperands.push_back($srcMem);
    translatedOperands.push_back($size);

    // Multicast, if available. When absent, a zero placeholder is passed and
    // the trailing i1 flag tells the intrinsic to ignore it.
    llvm::LLVMContext &ctx = moduleTranslation.getLLVMContext();
    auto *i16Unused = llvm::ConstantInt::get(llvm::Type::getInt16Ty(ctx), 0);
    bool isMulticast = static_cast<bool>(op.getMulticastMask());
    translatedOperands.push_back(isMulticast ? $multicastMask : i16Unused);

    // Cache hint, if available; same placeholder-plus-flag scheme as above.
    auto *i64Unused = llvm::ConstantInt::get(llvm::Type::getInt64Ty(ctx), 0);
    bool isCacheHint = static_cast<bool>(op.getL2CacheHint());
    translatedOperands.push_back(isCacheHint ? $l2CacheHint : i64Unused);

    // Flag arguments for multicast and cachehint
    translatedOperands.push_back(builder.getInt1(isMulticast));
    translatedOperands.push_back(builder.getInt1(isCacheHint));

    createIntrinsicCall(builder,
        llvm::Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster,
        translatedOperands);
  }];
}
2206
+
2207
def NVVM_CpAsyncBulkSharedCTAToSharedClusterOp :
  NVVM_Op<"cp.async.bulk.shared.cluster.shared.cta"> {
  let summary = "Async bulk copy from Shared CTA memory to Shared cluster memory";
  let description = [{
    Initiates an asynchronous copy operation from Shared CTA memory to Shared
    cluster memory.

    [For more information, see PTX ISA]
    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
  }];

  let arguments = (ins
    LLVM_PointerShared:$dstMem,
    LLVM_PointerShared:$srcMem,
    LLVM_PointerShared:$mbar,
    I32:$size);

  let assemblyFormat = [{
    $dstMem `,` $srcMem `,` $mbar `,` $size
    attr-dict `:` type($dstMem) `,` type($srcMem)
  }];

  string llvmBuilder = [{
    // Intrinsic operand order is (dst, mbar, src, size) — the mbarrier
    // pointer comes before the source, unlike the op's argument order.
    createIntrinsicCall(builder,
        llvm::Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster,
        {$dstMem, $mbar, $srcMem, $size});
  }];
}
2235
+
2236
def NVVM_CpAsyncBulkSharedCTAToGlobalOp :
  NVVM_Op<"cp.async.bulk.global.shared.cta"> {
  let summary = "Async bulk copy from Shared CTA memory to Global memory";
  let description = [{
    Initiates an asynchronous copy operation from Shared CTA memory to
    global memory.

    The `l2CacheHint` operand is optional, and it is used to specify cache
    eviction policy that may be used during the memory access.
    [For more information, see PTX ISA]
    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
  }];

  let arguments = (ins
    LLVM_PointerGlobal:$dstMem,
    LLVM_PointerShared:$srcMem,
    I32:$size,
    Optional<I64>:$l2CacheHint);

  let assemblyFormat = [{
    $dstMem `,` $srcMem `,` $size
    (`l2_cache_hint` `=` $l2CacheHint^ )?
    attr-dict `:` type($dstMem) `,` type($srcMem)
  }];

  string llvmBuilder = [{
    // Intrinsic operand order:
    //   dst, src, size, cache_hint,
    //   i1 flag for cache_hint
    llvm::SmallVector<llvm::Value *> translatedOperands;
    translatedOperands.push_back($dstMem);
    translatedOperands.push_back($srcMem);
    translatedOperands.push_back($size);

    // Cache hint, if available. When absent, a zero placeholder is passed
    // and the trailing i1 flag tells the intrinsic to ignore it.
    llvm::LLVMContext &ctx = moduleTranslation.getLLVMContext();
    auto *i64Unused = llvm::ConstantInt::get(llvm::Type::getInt64Ty(ctx), 0);
    bool isCacheHint = static_cast<bool>(op.getL2CacheHint());
    translatedOperands.push_back(isCacheHint ? $l2CacheHint : i64Unused);

    // Flag argument for cachehint
    translatedOperands.push_back(builder.getInt1(isCacheHint));

    createIntrinsicCall(builder,
        llvm::Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global,
        translatedOperands);
  }];
}
2284
+
2141
2285
//===----------------------------------------------------------------------===//
2142
2286
// NVVM Wgmma Ops
2143
2287
//===----------------------------------------------------------------------===//
0 commit comments