[mlir][GPU] Plumb range information through the NVVM lowerings #107659
@@ -29,6 +29,7 @@
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -209,7 +210,15 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
ConversionPatternRewriter &rewriter) const override {
auto loc = op->getLoc();
MLIRContext *context = rewriter.getContext();
Value newOp = rewriter.create<NVVM::LaneIdOp>(loc, rewriter.getI32Type());
LLVM::ConstantRangeAttr bounds = nullptr;
if (std::optional<APInt> upperBound = op.getUpperBound())
  bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
      /*bitWidth=*/32, /*lower=*/0, upperBound->getZExtValue());
else
  bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
      /*bitWidth=*/32, /*lower=*/0, /*upper=*/kWarpSize);
Value newOp =
    rewriter.create<NVVM::LaneIdOp>(loc, rewriter.getI32Type(), bounds);
Review comment: Can we always use […]
Reply: Done, thanks for a good observation about the default.

// Truncate or extend the result depending on the index bitwidth specified
// by the LLVMTypeConverter options.
const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
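For illustration, here is a minimal sketch (not taken from the patch; the module and function names are invented) of what this pattern is expected to produce, mirroring the test checks added later in this PR: with no explicit bound the warp size is used as the default upper end of the range, and an explicit `upper_bound` is forwarded into the `range` attribute.

```mlir
gpu.module @lane_id_bounds_sketch {
  func.func @lanes() -> (index, index) {
    // No bound on the op: the default range covers a full warp.
    // Expected: nvvm.read.ptx.sreg.laneid range <i32, 0, 32> : i32
    %0 = gpu.lane_id
    // Explicit bound supplied on the op: forwarded as the range upper end.
    // Expected: nvvm.read.ptx.sreg.laneid range <i32, 0, 16> : i32
    %1 = gpu.lane_id upper_bound 16
    return %0, %1 : index, index
  }
}
```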
@@ -340,27 +349,40 @@ void mlir::populateGpuSubgroupReduceOpLoweringPattern(

void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns) {
using gpu::index_lowering::IndexKind;
using gpu::index_lowering::IntrType;
populateWithGenerated(patterns);
patterns.add<GPUPrintfOpToVPrintfLowering>(converter);
patterns.add<
gpu::index_lowering::OpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>>(
converter, IndexKind::Block, IntrType::Id);
patterns.add<
gpu::index_lowering::OpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
NVVM::BlockDimYOp, NVVM::BlockDimZOp>>(
converter, IndexKind::Block, IntrType::Dim);
patterns.add<
gpu::index_lowering::OpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp,
NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>,
gpu::index_lowering::OpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
gpu::index_lowering::OpLowering<
gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>,
gpu::index_lowering::OpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
gpu::index_lowering::OpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
gpu::index_lowering::OpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
NVVM::GridDimYOp, NVVM::GridDimZOp>,
GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(converter);
NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>>(
converter, IndexKind::Other, IntrType::Id);
patterns.add<gpu::index_lowering::OpLowering<
gpu::ClusterDimOp, NVVM::ClusterDimXOp, NVVM::ClusterDimYOp,
NVVM::ClusterDimZOp>>(converter, IndexKind::Other, IntrType::Dim);
patterns.add<gpu::index_lowering::OpLowering<
gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>>(
converter, IndexKind::Other, IntrType::Id);
patterns.add<gpu::index_lowering::OpLowering<
gpu::ClusterDimOp, NVVM::ClusterDimXOp, NVVM::ClusterDimYOp,
NVVM::ClusterDimZOp>>(converter, IndexKind::Other, IntrType::Dim);
patterns.add<gpu::index_lowering::OpLowering<
gpu::BlockIdOp, NVVM::BlockIdXOp, NVVM::BlockIdYOp, NVVM::BlockIdZOp>>(
converter, IndexKind::Block, IntrType::Id);
patterns.add<gpu::index_lowering::OpLowering<
gpu::GridDimOp, NVVM::GridDimXOp, NVVM::GridDimYOp, NVVM::GridDimZOp>>(
converter, IndexKind::Grid, IntrType::Dim);
patterns.add<GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(
converter);

patterns.add<GPUDynamicSharedMemoryOpLowering>(
converter, NVVM::kSharedMemoryAlignmentBit);
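As a rough sketch of what the `IntrType::Id` versus `IntrType::Dim` classification means for the emitted ranges (mirroring the explicit-bound test checks further below; the module and function names here are invented): the same `upper_bound 32` yields `range <i32, 0, 32>` for a thread id but `range <i32, 1, 33>` for a block dimension, since a dimension is never 0.

```mlir
gpu.module @id_vs_dim_sketch {
  func.func @bounds() -> (index, index) {
    // Id-style register (IntrType::Id).
    // Expected: nvvm.read.ptx.sreg.tid.x range <i32, 0, 32> : i32
    %id = gpu.thread_id x upper_bound 32
    // Dim-style register (IntrType::Dim); the lower end starts at 1.
    // Expected: nvvm.read.ptx.sreg.ntid.x range <i32, 1, 33> : i32
    %dim = gpu.block_dim x upper_bound 32
    return %id, %dim : index, index
  }
}
```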
@@ -50,7 +50,7 @@ gpu.module @test_module_0 {
%gDimZ = gpu.grid_dim z

// CHECK: = nvvm.read.ptx.sreg.laneid : i32
// CHECK: = nvvm.read.ptx.sreg.laneid range <i32, 0, 32> : i32
// CHECK: = llvm.sext %{{.*}} : i32 to i64
%laneId = gpu.lane_id
@@ -699,9 +699,21 @@
}

gpu.module @test_module_33 {
// CHECK-LABEL: func @kernel_with_block_size()
// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = array<i32: 128, 1, 1>}
gpu.func @kernel_with_block_size() kernel attributes {known_block_size = array<i32: 128, 1, 1>} {
// CHECK-LABEL: func @kernel_with_block_size(
// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 32, 4, 2>, nvvm.kernel, nvvm.maxntid = array<i32: 32, 4, 2>}
gpu.func @kernel_with_block_size(%arg0: !llvm.ptr) kernel attributes {known_block_size = array<i32: 32, 4, 2>} {
Review comment: If I remember correctly, you added known_block_size to func.func, so I am wondering: will this PR work for func.func?
Reply: Yep, that code works generally.

// CHECK: = nvvm.read.ptx.sreg.tid.x range <i32, 0, 32> : i32
%0 = gpu.thread_id x
// CHECK: = nvvm.read.ptx.sreg.tid.y range <i32, 0, 4> : i32
%1 = gpu.thread_id y
// CHECK: = nvvm.read.ptx.sreg.tid.z range <i32, 0, 2> : i32
%2 = gpu.thread_id z

// Fake usage to prevent dead code elimination
%3 = arith.addi %0, %1 : index
%4 = arith.addi %3, %2 : index
%5 = arith.index_cast %4 : index to i64
llvm.store %5, %arg0 : i64, !llvm.ptr
gpu.return
}
}
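On the func.func question above, a hedged sketch of what "works generally" might look like. Assumptions not covered by the checks in this diff: the module and function names are invented, and the block size on a non-gpu.func function is assumed to be carried by the discardable `gpu.known_block_size` attribute rather than the inherent `known_block_size` of gpu.func.

```mlir
gpu.module @plain_func_sketch {
  // Assumption: `gpu.known_block_size` is the spelling used on function ops
  // other than gpu.func.
  func.func @helper() -> index attributes {gpu.known_block_size = array<i32: 64, 1, 1>} {
    // Expected under that assumption: nvvm.read.ptx.sreg.tid.x range <i32, 0, 64> : i32
    %0 = gpu.thread_id x
    return %0 : index
  }
}
```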
@@ -917,6 +929,20 @@ gpu.module @test_module_48 {
}
}

gpu.module @test_module_49 {
// CHECK-LABEL: func @explicit_id_bounds()
func.func @explicit_id_bounds() -> (index, index, index) {
// CHECK: = nvvm.read.ptx.sreg.tid.x range <i32, 0, 32> : i32
%0 = gpu.thread_id x upper_bound 32
// CHECK: = nvvm.read.ptx.sreg.ntid.x range <i32, 1, 33> : i32
%1 = gpu.block_dim x upper_bound 32

// CHECK: = nvvm.read.ptx.sreg.laneid range <i32, 0, 16> : i32
%2 = gpu.lane_id upper_bound 16

return %0, %1, %2 : index, index, index
}
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
%gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module
Review comment: Who is setting the upper bound? I might be missing something.
Reply: User code - I'll have some tests shortly.
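To make the "user code" answer concrete, a small hedged example (names invented) of a kernel author supplying the bound by hand because they know only part of a warp participates; per the test above, the annotation flows straight into the range attribute on the NVVM op.

```mlir
gpu.module @user_supplied_bound {
  gpu.func @partial_warp_kernel() kernel {
    // The author asserts at most 16 lanes are live here.
    // Expected: nvvm.read.ptx.sreg.laneid range <i32, 0, 16> : i32
    %lane = gpu.lane_id upper_bound 16
    gpu.return
  }
}
```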