-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU: Simplify demanded bits on readlane/writeline index arguments #117963
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Simplify demanded bits on readlane/writeline index arguments #117963
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesThe main goal is to fold away wave64 code when compiled for wave32. Full diff: https://github.com/llvm/llvm-project/pull/117963.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 18a09c39a06387..a0bb3e181ac526 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -450,6 +450,37 @@ static bool isTriviallyUniform(const Use &U) {
return false;
}
+/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
+///
+/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
+bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
+ IntrinsicInst &II,
+ unsigned LaneArgIdx) const {
+ unsigned MaskBits = ST->isWaveSizeKnown() && ST->isWave32() ? 5 : 6;
+ APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
+
+ KnownBits Known(32);
+ if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
+ return true;
+
+ if (!Known.isConstant())
+ return false;
+
+ // Unlike the DAG version, SimplifyDemandedBits does not change
+ // constants. Make sure we clamp these down. Out of bounds indexes may appear
+ // in wave64 code compiled for wave32.
+
+ Value *LaneArg = II.getArgOperand(LaneArgIdx);
+ Constant *MaskedConst =
+ ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
+ if (MaskedConst != LaneArg) {
+ II.getOperandUse(LaneArgIdx).set(MaskedConst);
+ return true;
+ }
+
+ return false;
+}
+
std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
Intrinsic::ID IID = II.getIntrinsicID();
@@ -1092,7 +1123,17 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
const Use &Src = II.getArgOperandUse(0);
if (isTriviallyUniform(Src))
return IC.replaceInstUsesWith(II, Src.get());
- break;
+
+ if (IID == Intrinsic::amdgcn_readlane &&
+ simplifyDemandedLaneMaskArg(IC, II, 1))
+ return &II;
+
+ return std::nullopt;
+ }
+ case Intrinsic::amdgcn_writelane: {
+ if (simplifyDemandedLaneMaskArg(IC, II, 1))
+ return &II;
+ return std::nullopt;
}
case Intrinsic::amdgcn_trig_preop: {
// The intrinsic is declared with name mangling, but currently the
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 10956861650ab3..585f38fc02c29c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -220,6 +220,10 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
const Value *Op1, InstCombiner &IC) const;
+
+ bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
+ unsigned LaneAgIdx) const;
+
std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const;
std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll b/llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll
index b686f447b8d3c9..327d68bdf550e4 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll
@@ -18,30 +18,45 @@ define i32 @readlane_31(i32 %arg) #0 {
}
define i32 @readlane_32(i32 %arg) #0 {
-; CHECK-LABEL: define i32 @readlane_32(
-; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 32)
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @readlane_32(
+; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 32)
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @readlane_32(
+; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 0)
+; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 32)
ret i32 %res
}
define i32 @readlane_33(i32 %arg) #0 {
-; CHECK-LABEL: define i32 @readlane_33(
-; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 33)
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @readlane_33(
+; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 33)
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @readlane_33(
+; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 1)
+; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 33)
ret i32 %res
}
define i32 @readlane_63(i32 %arg) #0 {
-; CHECK-LABEL: define i32 @readlane_63(
-; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 63)
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @readlane_63(
+; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 63)
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @readlane_63(
+; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 31)
+; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 63)
ret i32 %res
@@ -50,7 +65,7 @@ define i32 @readlane_63(i32 %arg) #0 {
define i32 @readlane_64(i32 %arg) #0 {
; CHECK-LABEL: define i32 @readlane_64(
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 64)
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 0)
; CHECK-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 64)
@@ -58,11 +73,16 @@ define i32 @readlane_64(i32 %arg) #0 {
}
define i32 @readlane_and_31(i32 %arg, i32 %idx) #0 {
-; CHECK-LABEL: define i32 @readlane_and_31(
-; CHECK-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]])
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @readlane_and_31(
+; WAVE64-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]])
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @readlane_and_31(
+; WAVE32-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX]])
+; WAVE32-NEXT: ret i32 [[RES]]
;
%idx.clamp = and i32 %idx, 31
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 %idx.clamp)
@@ -72,8 +92,7 @@ define i32 @readlane_and_31(i32 %arg, i32 %idx) #0 {
define i32 @readlane_and_63(i32 %arg, i32 %idx) #0 {
; CHECK-LABEL: define i32 @readlane_and_63(
; CHECK-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 63
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]])
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX]])
; CHECK-NEXT: ret i32 [[RES]]
;
%idx.clamp = and i32 %idx, 63
@@ -92,10 +111,15 @@ define i32 @readlane_poison(i32 %arg) #0 {
}
define float @readlane_f32_63(float %arg) #0 {
-; CHECK-LABEL: define float @readlane_f32_63(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 63)
-; CHECK-NEXT: ret float [[RES]]
+; WAVE64-LABEL: define float @readlane_f32_63(
+; WAVE64-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 63)
+; WAVE64-NEXT: ret float [[RES]]
+;
+; WAVE32-LABEL: define float @readlane_f32_63(
+; WAVE32-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 31)
+; WAVE32-NEXT: ret float [[RES]]
;
%res = call float @llvm.amdgcn.readlane.f32(float %arg, i32 63)
ret float %res
@@ -116,30 +140,45 @@ define i32 @writelane_31(i32 %arg0, i32 %arg1) #0 {
}
define i32 @writelane_32(i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: define i32 @writelane_32(
-; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 32, i32 [[ARG1]])
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @writelane_32(
+; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 32, i32 [[ARG1]])
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @writelane_32(
+; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 0, i32 [[ARG1]])
+; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 32, i32 %arg1)
ret i32 %res
}
define i32 @writelane_33(i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: define i32 @writelane_33(
-; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 33, i32 [[ARG1]])
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @writelane_33(
+; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 33, i32 [[ARG1]])
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @writelane_33(
+; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 1, i32 [[ARG1]])
+; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 33, i32 %arg1)
ret i32 %res
}
define i32 @writelane_63(i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: define i32 @writelane_63(
-; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 63, i32 [[ARG1]])
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @writelane_63(
+; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 63, i32 [[ARG1]])
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @writelane_63(
+; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 31, i32 [[ARG1]])
+; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 63, i32 %arg1)
ret i32 %res
@@ -148,7 +187,7 @@ define i32 @writelane_63(i32 %arg0, i32 %arg1) #0 {
define i32 @writelane_64(i32 %arg0, i32 %arg1) #0 {
; CHECK-LABEL: define i32 @writelane_64(
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 64, i32 [[ARG1]])
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 0, i32 [[ARG1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 64, i32 %arg1)
@@ -156,11 +195,16 @@ define i32 @writelane_64(i32 %arg0, i32 %arg1) #0 {
}
define i32 @writelane_and_31(i32 %arg0, i32 %arg1, i32 %idx) #0 {
-; CHECK-LABEL: define i32 @writelane_and_31(
-; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]])
-; CHECK-NEXT: ret i32 [[RES]]
+; WAVE64-LABEL: define i32 @writelane_and_31(
+; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
+; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]])
+; WAVE64-NEXT: ret i32 [[RES]]
+;
+; WAVE32-LABEL: define i32 @writelane_and_31(
+; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX]], i32 [[ARG1]])
+; WAVE32-NEXT: ret i32 [[RES]]
;
%idx.clamp = and i32 %idx, 31
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 %idx.clamp, i32 %arg1)
@@ -170,8 +214,7 @@ define i32 @writelane_and_31(i32 %arg0, i32 %arg1, i32 %idx) #0 {
define i32 @writelane_and_63(i32 %arg0, i32 %arg1, i32 %idx) #0 {
; CHECK-LABEL: define i32 @writelane_and_63(
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 63
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]])
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX]], i32 [[ARG1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%idx.clamp = and i32 %idx, 63
@@ -190,16 +233,18 @@ define i32 @writelane_poison(i32 %arg0, i32 %arg1) #0 {
}
define float @writelane_f32_63(float %arg0, float %arg1) #0 {
-; CHECK-LABEL: define float @writelane_f32_63(
-; CHECK-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 63, float [[ARG1]])
-; CHECK-NEXT: ret float [[RES]]
+; WAVE64-LABEL: define float @writelane_f32_63(
+; WAVE64-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE64-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 63, float [[ARG1]])
+; WAVE64-NEXT: ret float [[RES]]
+;
+; WAVE32-LABEL: define float @writelane_f32_63(
+; WAVE32-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
+; WAVE32-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 31, float [[ARG1]])
+; WAVE32-NEXT: ret float [[RES]]
;
%res = call float @llvm.amdgcn.writelane.f32(float %arg0, i32 63, float %arg1)
ret float %res
}
attributes #0 = { nounwind }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; WAVE32: {{.*}}
-; WAVE64: {{.*}}
|
Should mention that in the comment in the code. It was not clear to me why you would want to clamp constants. |
3363529
to
bd8f0b9
Compare
4c11c64
to
5a617f8
Compare
The main goal is to fold away wave64 code when compiled for wave32. If we have out of bounds indexing, these will now clamp down to a low bit which may CSE with the operations on the low half of the wave.
5a617f8
to
45498a1
Compare
ping |
bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC, | ||
IntrinsicInst &II, | ||
unsigned LaneArgIdx) const { | ||
unsigned MaskBits = ST->getWavefrontSizeLog2(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I haven't been following closely, but isn't wave size a tri-state thing now, 32 or 64 or unknown? What does getWavefrontSizeLog2 return if it's unknown?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is and is not a tri-state at the same time. The high level wavesize queries, like getWavefrontSize(Log2) will return 64(6) in the unknown case, which is safe in this context. We have the additional query to check if the wavesize is known precise, and isn't the maximum
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is and is not a tri-state at the same time.
:(
64(6) in the unknown case, which is safe in this context
Agreed.
The main goal is to fold away wave64 code when compiled for wave32.
If we have out of bounds indexing, these will now clamp down to
a low bit which may CSE with the operations on the low half of the
wave.