Skip to content

AMDGPU: Simplify demanded bits on readlane/writelane index arguments #117963

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,37 @@ static bool isTriviallyUniform(const Use &U) {
return false;
}

/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
///
/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
///
/// \param IC InstCombiner used to run SimplifyDemandedBits on the operand.
/// \param II The intrinsic call whose lane index operand is simplified.
/// \param LaneArgIdx Argument index of the lane index operand within \p II.
/// \returns true if the operand was simplified or replaced, false otherwise.
bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
IntrinsicInst &II,
unsigned LaneArgIdx) const {
// Number of low index bits that are demanded, taken from the subtarget
// wavefront size (log2).
unsigned MaskBits = ST->getWavefrontSizeLog2();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't been following closely, but isn't wave size a tri-state thing now, 32 or 64 or unknown? What does getWavefrontSizeLog2 return if it's unknown?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is and is not a tri-state at the same time. The high level wavesize queries, like getWavefrontSize(Log2) will return 64(6) in the unknown case, which is safe in this context. We have the additional query to check if the wavesize is known precise, and isn't the maximum

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is and is not a tri-state at the same time.

:(

64(6) in the unknown case, which is safe in this context

Agreed.

// 32-bit mask with only the low MaskBits bits set.
APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));

KnownBits Known(32);
// Let InstCombine simplify the operand under the demanded mask; if it
// reports a change, we are done for this visit.
if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
return true;

// The manual fixup below only applies when the operand value is a known
// constant.
if (!Known.isConstant())
return false;

// Out of bounds indexes may appear in wave64 code compiled for wave32.
// Unlike the DAG version, SimplifyDemandedBits does not change constants, so
// manually fix it up.

Value *LaneArg = II.getArgOperand(LaneArgIdx);
// Build the constant with all non-demanded bits cleared.
Constant *MaskedConst =
ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
// Rewrite the operand only when the masked constant differs from the value
// currently used, so an unchanged call is not reported as modified.
if (MaskedConst != LaneArg) {
II.getOperandUse(LaneArgIdx).set(MaskedConst);
return true;
}

return false;
}

std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
Intrinsic::ID IID = II.getIntrinsicID();
Expand Down Expand Up @@ -1092,7 +1123,17 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
const Use &Src = II.getArgOperandUse(0);
if (isTriviallyUniform(Src))
return IC.replaceInstUsesWith(II, Src.get());
break;

if (IID == Intrinsic::amdgcn_readlane &&
simplifyDemandedLaneMaskArg(IC, II, 1))
return &II;

return std::nullopt;
}
case Intrinsic::amdgcn_writelane: {
if (simplifyDemandedLaneMaskArg(IC, II, 1))
return &II;
return std::nullopt;
}
case Intrinsic::amdgcn_trig_preop: {
// The intrinsic is declared with name mangling, but currently the
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,10 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {

bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
const Value *Op1, InstCombiner &IC) const;

/// Simplify the lane index operand of the intrinsic call \p II.
///
/// \param IC InstCombiner driving the transform.
/// \param II Intrinsic call whose lane index operand is examined.
/// \param LaneArgIdx Argument index of the lane index operand within \p II.
/// \returns true if the operand was changed.
bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
unsigned LaneArgIdx) const;

std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const;
std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,45 @@ define i32 @readlane_31(i32 %arg) #0 {
}

; readlane with constant lane index 32. The WAVE32 checks expect the index
; rewritten to 0; the WAVE64 checks expect it left as 32.
define i32 @readlane_32(i32 %arg) #0 {
; CHECK-LABEL: define i32 @readlane_32(
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 32)
; CHECK-NEXT: ret i32 [[RES]]
; WAVE64-LABEL: define i32 @readlane_32(
; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 32)
; WAVE64-NEXT: ret i32 [[RES]]
;
; WAVE32-LABEL: define i32 @readlane_32(
; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 0)
; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 32)
ret i32 %res
}

; readlane with constant lane index 33. The WAVE32 checks expect the index
; rewritten to 1; the WAVE64 checks expect it left as 33.
define i32 @readlane_33(i32 %arg) #0 {
; CHECK-LABEL: define i32 @readlane_33(
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 33)
; CHECK-NEXT: ret i32 [[RES]]
; WAVE64-LABEL: define i32 @readlane_33(
; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 33)
; WAVE64-NEXT: ret i32 [[RES]]
;
; WAVE32-LABEL: define i32 @readlane_33(
; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 1)
; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 33)
ret i32 %res
}

define i32 @readlane_63(i32 %arg) #0 {
; CHECK-LABEL: define i32 @readlane_63(
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 63)
; CHECK-NEXT: ret i32 [[RES]]
; WAVE64-LABEL: define i32 @readlane_63(
; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 63)
; WAVE64-NEXT: ret i32 [[RES]]
;
; WAVE32-LABEL: define i32 @readlane_63(
; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 31)
; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 63)
ret i32 %res
Expand All @@ -50,19 +65,24 @@ define i32 @readlane_63(i32 %arg) #0 {
define i32 @readlane_64(i32 %arg) #0 {
; CHECK-LABEL: define i32 @readlane_64(
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 64)
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 0)
; CHECK-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 64)
ret i32 %res
}

define i32 @readlane_and_31(i32 %arg, i32 %idx) #0 {
; CHECK-LABEL: define i32 @readlane_and_31(
; CHECK-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]])
; CHECK-NEXT: ret i32 [[RES]]
; WAVE64-LABEL: define i32 @readlane_and_31(
; WAVE64-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
; WAVE64-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]])
; WAVE64-NEXT: ret i32 [[RES]]
;
; WAVE32-LABEL: define i32 @readlane_and_31(
; WAVE32-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX]])
; WAVE32-NEXT: ret i32 [[RES]]
;
%idx.clamp = and i32 %idx, 31
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 %idx.clamp)
Expand All @@ -72,8 +92,7 @@ define i32 @readlane_and_31(i32 %arg, i32 %idx) #0 {
define i32 @readlane_and_63(i32 %arg, i32 %idx) #0 {
; CHECK-LABEL: define i32 @readlane_and_63(
; CHECK-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 63
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]])
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX]])
; CHECK-NEXT: ret i32 [[RES]]
;
%idx.clamp = and i32 %idx, 63
Expand All @@ -92,10 +111,15 @@ define i32 @readlane_poison(i32 %arg) #0 {
}

define float @readlane_f32_63(float %arg) #0 {
; CHECK-LABEL: define float @readlane_f32_63(
; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 63)
; CHECK-NEXT: ret float [[RES]]
; WAVE64-LABEL: define float @readlane_f32_63(
; WAVE64-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
; WAVE64-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 63)
; WAVE64-NEXT: ret float [[RES]]
;
; WAVE32-LABEL: define float @readlane_f32_63(
; WAVE32-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
; WAVE32-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 31)
; WAVE32-NEXT: ret float [[RES]]
;
%res = call float @llvm.amdgcn.readlane.f32(float %arg, i32 63)
ret float %res
Expand All @@ -116,30 +140,45 @@ define i32 @writelane_31(i32 %arg0, i32 %arg1) #0 {
}

; writelane with constant lane index 32. The WAVE32 checks expect the index
; rewritten to 0; the WAVE64 checks expect it left as 32.
define i32 @writelane_32(i32 %arg0, i32 %arg1) #0 {
; CHECK-LABEL: define i32 @writelane_32(
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 32, i32 [[ARG1]])
; CHECK-NEXT: ret i32 [[RES]]
; WAVE64-LABEL: define i32 @writelane_32(
; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 32, i32 [[ARG1]])
; WAVE64-NEXT: ret i32 [[RES]]
;
; WAVE32-LABEL: define i32 @writelane_32(
; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 0, i32 [[ARG1]])
; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 32, i32 %arg1)
ret i32 %res
}

; writelane with constant lane index 33. The WAVE32 checks expect the index
; rewritten to 1; the WAVE64 checks expect it left as 33.
define i32 @writelane_33(i32 %arg0, i32 %arg1) #0 {
; CHECK-LABEL: define i32 @writelane_33(
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 33, i32 [[ARG1]])
; CHECK-NEXT: ret i32 [[RES]]
; WAVE64-LABEL: define i32 @writelane_33(
; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 33, i32 [[ARG1]])
; WAVE64-NEXT: ret i32 [[RES]]
;
; WAVE32-LABEL: define i32 @writelane_33(
; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 1, i32 [[ARG1]])
; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 33, i32 %arg1)
ret i32 %res
}

define i32 @writelane_63(i32 %arg0, i32 %arg1) #0 {
; CHECK-LABEL: define i32 @writelane_63(
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 63, i32 [[ARG1]])
; CHECK-NEXT: ret i32 [[RES]]
; WAVE64-LABEL: define i32 @writelane_63(
; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 63, i32 [[ARG1]])
; WAVE64-NEXT: ret i32 [[RES]]
;
; WAVE32-LABEL: define i32 @writelane_63(
; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 31, i32 [[ARG1]])
; WAVE32-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 63, i32 %arg1)
ret i32 %res
Expand All @@ -148,19 +187,24 @@ define i32 @writelane_63(i32 %arg0, i32 %arg1) #0 {
define i32 @writelane_64(i32 %arg0, i32 %arg1) #0 {
; CHECK-LABEL: define i32 @writelane_64(
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 64, i32 [[ARG1]])
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 0, i32 [[ARG1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 64, i32 %arg1)
ret i32 %res
}

define i32 @writelane_and_31(i32 %arg0, i32 %arg1, i32 %idx) #0 {
; CHECK-LABEL: define i32 @writelane_and_31(
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]])
; CHECK-NEXT: ret i32 [[RES]]
; WAVE64-LABEL: define i32 @writelane_and_31(
; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
; WAVE64-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]])
; WAVE64-NEXT: ret i32 [[RES]]
;
; WAVE32-LABEL: define i32 @writelane_and_31(
; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX]], i32 [[ARG1]])
; WAVE32-NEXT: ret i32 [[RES]]
;
%idx.clamp = and i32 %idx, 31
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 %idx.clamp, i32 %arg1)
Expand All @@ -170,8 +214,7 @@ define i32 @writelane_and_31(i32 %arg0, i32 %arg1, i32 %idx) #0 {
define i32 @writelane_and_63(i32 %arg0, i32 %arg1, i32 %idx) #0 {
; CHECK-LABEL: define i32 @writelane_and_63(
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 63
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]])
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX]], i32 [[ARG1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%idx.clamp = and i32 %idx, 63
Expand All @@ -190,16 +233,18 @@ define i32 @writelane_poison(i32 %arg0, i32 %arg1) #0 {
}

; Float-typed writelane with constant lane index 63. The WAVE32 checks expect
; the index rewritten to 31; the WAVE64 checks expect it left as 63.
define float @writelane_f32_63(float %arg0, float %arg1) #0 {
; CHECK-LABEL: define float @writelane_f32_63(
; CHECK-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 63, float [[ARG1]])
; CHECK-NEXT: ret float [[RES]]
; WAVE64-LABEL: define float @writelane_f32_63(
; WAVE64-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
; WAVE64-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 63, float [[ARG1]])
; WAVE64-NEXT: ret float [[RES]]
;
; WAVE32-LABEL: define float @writelane_f32_63(
; WAVE32-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
; WAVE32-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 31, float [[ARG1]])
; WAVE32-NEXT: ret float [[RES]]
;
%res = call float @llvm.amdgcn.writelane.f32(float %arg0, i32 63, float %arg1)
ret float %res
}

attributes #0 = { nounwind }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; WAVE32: {{.*}}
; WAVE64: {{.*}}
Loading