-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[NVPTX] enforce signed 32 bit type for immediate offset #99682
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-nvptx Author: Alex MacLean (AlexMaclean) Changes: The NVPTX ISA states that an immOff must be a signed 32-bit integer (https://docs.nvidia.com/cuda/parallel-thread-execution/#addresses-as-operands): > `[reg+immOff]` — a sum of register `reg` containing a byte address plus a constant integer byte offset (signed, 32-bit). Currently we do not consider this constraint, meaning that in some edge cases we generate invalid PTX when a value is offset by a very large immediate. Full diff: https://github.com/llvm/llvm-project/pull/99682.diff 4 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 371ec8596ef63..9fcacae9f7d45 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -3914,8 +3914,13 @@ bool NVPTXDAGToDAGISel::SelectADDRri_imp(
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
else
Base = Addr.getOperand(0);
- Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
- mvt);
+
+ // Offset is a signed i32 constant in PTX [register+offset] address mode
+ if (!CN->getAPIntValue().isSignedIntN(32))
+ return false;
+
+ Offset = CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(OpNode),
+ MVT::i32);
return true;
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bc23998455a68..00293d9d67401 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5167,9 +5167,12 @@ bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
// - [areg+immoff]
// - [immAddr]
- if (AM.BaseGV) {
+ // immoff is signed 32-bit
+ if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
+ return false;
+
+ if (AM.BaseGV)
return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
- }
switch (AM.Scale) {
case 0: // "r", "r+i" or "i" is allowed
diff --git a/llvm/test/CodeGen/NVPTX/addr-mode.ll b/llvm/test/CodeGen/NVPTX/addr-mode.ll
new file mode 100644
index 0000000000000..a6a085c0e2e33
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/addr-mode.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_addr_mode_i64(ptr %x) {
+; CHECK-LABEL: test_addr_mode_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i64_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1+-4];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+ %addr = getelementptr i32, ptr %x, i64 -1
+ %res = load i32, ptr %addr
+ ret i32 %res
+}
+
+define i32 @test_addr_mode_i32(ptr %x) {
+; CHECK-LABEL: test_addr_mode_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i32_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1+-4];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+ %addr = getelementptr i32, ptr %x, i32 -1
+ %res = load i32, ptr %addr
+ ret i32 %res
+}
+
+define i32 @test_addr_mode_i16(ptr %x) {
+; CHECK-LABEL: test_addr_mode_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i16_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1+-4];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+ %addr = getelementptr i32, ptr %x, i16 -1
+ %res = load i32, ptr %addr
+ ret i32 %res
+}
+
+define i32 @test_addr_mode_i8(ptr %x) {
+; CHECK-LABEL: test_addr_mode_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i8_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1+-4];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+ %addr = getelementptr i32, ptr %x, i8 -1
+ %res = load i32, ptr %addr
+ ret i32 %res
+}
+
+define i32 @test_addr_mode_i64_large(ptr %x) {
+; CHECK-LABEL: test_addr_mode_i64_large(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i64_large_param_0];
+; CHECK-NEXT: add.s64 %rd2, %rd1, 17179869172;
+; CHECK-NEXT: ld.u32 %r1, [%rd2];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+ %addr = getelementptr i32, ptr %x, i64 4294967293
+ %res = load i32, ptr %addr
+ ret i32 %res
+}
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
index c915b9a5e59ac..16e9e5eeb6143 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
@@ -397,3 +397,27 @@ entry:
%ptr2 = getelementptr inbounds %struct0, ptr %ptr, i65 1, i32 3, i64 %idx, i32 1
ret ptr %ptr2
}
+
+; Do not extract large constant offset that cannot be folded in to PTX
+; addressing mode
+define void @large_offset(ptr %out, i32 %in) {
+; CHECK-LABEL: define void @large_offset(
+; CHECK-SAME: ptr [[OUT:%.*]], i32 [[IN:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[TMP0]], 536870912
+; CHECK-NEXT: [[IDX:%.*]] = zext nneg i32 [[ADD]] to i64
+; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IDX]]
+; CHECK-NEXT: store i32 [[IN]], ptr [[GETELEM]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+ %add = add nuw nsw i32 %0, 536870912
+ %idx = zext nneg i32 %add to i64
+ %getElem = getelementptr inbounds i32, ptr %out, i64 %idx
+ store i32 %in, ptr %getElem, align 4
+ ret void
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
So, it's not that we want the offset to always be a 32-bit integer, but rather that we need the value to fit in a 32-bit integer. E.g. it would be fine if someone passed an i64 offset whose value fits within the signed 32-bit range. |
Summary: The NVPTX ISA states that an immOff must fit in a signed 32-bit integer (https://docs.nvidia.com/cuda/parallel-thread-execution/#addresses-as-operands): > `[reg+immOff]` > > a sum of register `reg` containing a byte address plus a constant > integer byte offset (signed, 32-bit). > > `[var+immOff]` > > a sum of address of addressable variable `var` containing a byte > address plus a constant integer byte offset (signed, 32-bit). Currently we do not consider this constraint, meaning that in some edge cases we generate invalid PTX when a value is offset by a very large immediate. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251391
The NVPTX ISA states that an immOff must fit in a signed 32-bit integer (https://docs.nvidia.com/cuda/parallel-thread-execution/#addresses-as-operands):
Currently we do not consider this constraint, meaning that in some edge cases we generate invalid PTX when a value is offset by a very large immediate.