Skip to content

Commit 35ef4c9

Browse files
committed
[AMDGPU][GlobalISel] Legalize G_ABS
Legalize and select G_ABS so that we can use the llvm.abs intrinsic.
Differential Revision: https://reviews.llvm.org/D102391
1 parent 93a0581 commit 35ef4c9

File tree

6 files changed

+223
-20
lines changed

6 files changed

+223
-20
lines changed

llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,8 @@ class LegalizerHelper {
397397
LegalizeResult lowerSMULH_UMULH(MachineInstr &MI);
398398
LegalizeResult lowerSelect(MachineInstr &MI);
399399
LegalizeResult lowerDIVREM(MachineInstr &MI);
400+
LegalizeResult lowerAbsToAddXor(MachineInstr &MI);
401+
LegalizeResult lowerAbsToMaxNeg(MachineInstr &MI);
400402
};
401403

402404
/// Helper function that creates a libcall to the given \p Name using the given

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2015,6 +2015,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
20152015
Observer.changedInstr(MI);
20162016
return Legalized;
20172017

2018+
case TargetOpcode::G_ABS:
2019+
Observer.changingInstr(MI);
2020+
widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2021+
widenScalarDst(MI, WideTy);
2022+
Observer.changedInstr(MI);
2023+
return Legalized;
2024+
20182025
case TargetOpcode::G_ADD:
20192026
case TargetOpcode::G_AND:
20202027
case TargetOpcode::G_MUL:
@@ -3200,22 +3207,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
32003207
case G_SSHLSAT:
32013208
case G_USHLSAT:
32023209
return lowerShlSat(MI);
3203-
case G_ABS: {
3204-
// Expand %res = G_ABS %a into:
3205-
// %v1 = G_ASHR %a, scalar_size-1
3206-
// %v2 = G_ADD %a, %v1
3207-
// %res = G_XOR %v2, %v1
3208-
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3209-
Register OpReg = MI.getOperand(1).getReg();
3210-
auto ShiftAmt =
3211-
MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
3212-
auto Shift =
3213-
MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
3214-
auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
3215-
MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
3216-
MI.eraseFromParent();
3217-
return Legalized;
3218-
}
3210+
case G_ABS:
3211+
return lowerAbsToAddXor(MI);
32193212
case G_SELECT:
32203213
return lowerSelect(MI);
32213214
case G_SDIVREM:
@@ -4160,6 +4153,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
41604153
case G_SMAX:
41614154
case G_UMIN:
41624155
case G_UMAX:
4156+
case G_ABS:
41634157
case G_FMINNUM:
41644158
case G_FMAXNUM:
41654159
case G_FMINNUM_IEEE:
@@ -7010,3 +7004,35 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
70107004
MI.eraseFromParent();
70117005
return Legalized;
70127006
}
7007+
7008+
/// Lower a G_ABS instruction into a shift/add/xor sequence.
///
/// \p MI is the G_ABS to replace; operand 0 is its result and operand 1 its
/// source. The replacement instructions are emitted through MIRBuilder and
/// \p MI is erased afterwards. Always returns Legalized.
LegalizerHelper::LegalizeResult
7009+
LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7010+
// Expand %res = G_ABS %a into:
7011+
// %v1 = G_ASHR %a, scalar_size-1
7012+
// %v2 = G_ADD %a, %v1
7013+
// %res = G_XOR %v2, %v1
//
// Shifting right arithmetically by scalar_size-1 replicates the sign bit, so
// %v1 is all-ones for a negative %a and zero otherwise. Adding and then
// xor-ing with %v1 therefore negates %a only when it is negative.
7014+
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7015+
Register OpReg = MI.getOperand(1).getReg();
7016+
// The shift amount is built with the destination type, so every
// intermediate value shares DstTy.
auto ShiftAmt =
7017+
MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7018+
auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7019+
auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7020+
// The final xor writes straight into the original result register.
MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7021+
MI.eraseFromParent();
7022+
return Legalized;
7023+
}
7024+
7025+
/// Lower a G_ABS instruction into a subtract-from-zero followed by a signed
/// maximum.
///
/// \p MI is the G_ABS to replace; operand 0 is its result and operand 1 its
/// source. The replacement instructions are emitted through MIRBuilder and
/// \p MI is erased afterwards. Always returns Legalized.
LegalizerHelper::LegalizeResult
7026+
LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7027+
// Expand %res = G_ABS %a into:
7028+
// %v1 = G_CONSTANT 0
7029+
// %v2 = G_SUB %v1, %a
7030+
// %res = G_SMAX %a, %v2
//
// %v2 is the negation of %a, so the signed maximum of the two picks
// whichever is non-negative.
7031+
Register SrcReg = MI.getOperand(1).getReg();
7032+
// The type of the intermediate values is taken from the source operand.
LLT Ty = MRI.getType(SrcReg);
7033+
auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7034+
auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7035+
// The G_SMAX defines the original result operand directly.
MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7036+
MI.eraseFromParent();
7037+
return Legalized;
7038+
}

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -966,7 +966,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
966966
.scalarize(0);
967967

968968
if (ST.hasVOP3PInsts()) {
969-
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
969+
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
970970
.legalFor({S32, S16, V2S16})
971971
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
972972
.clampMaxNumElements(0, S16, 2)
@@ -975,7 +975,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
975975
.scalarize(0)
976976
.lower();
977977
} else {
978-
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
978+
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
979979
.legalFor({S32, S16})
980980
.widenScalarToNextPow2(0)
981981
.minScalar(0, S16)
@@ -994,7 +994,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
994994
.scalarize(0)
995995
.lower();
996996

997-
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
997+
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
998998
.legalFor({S32})
999999
.minScalar(0, S32)
10001000
.widenScalarToNextPow2(0)

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2324,6 +2324,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
23242324
MI.eraseFromParent();
23252325
return;
23262326
}
2327+
case AMDGPU::G_ABS: {
2328+
Register SrcReg = MI.getOperand(1).getReg();
2329+
const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2330+
2331+
// There is no VALU abs instruction so we need to replace it with a sub and
2332+
// max combination.
2333+
if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2334+
MachineFunction *MF = MI.getParent()->getParent();
2335+
ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2336+
MachineIRBuilder B(MI, Apply);
2337+
LegalizerHelper Helper(*MF, Apply, B);
2338+
2339+
if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2340+
llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2341+
return;
2342+
}
2343+
LLVM_FALLTHROUGH;
2344+
}
23272345
case AMDGPU::G_ADD:
23282346
case AMDGPU::G_SUB:
23292347
case AMDGPU::G_MUL:
@@ -3508,6 +3526,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
35083526
case AMDGPU::G_SMAX:
35093527
case AMDGPU::G_UMIN:
35103528
case AMDGPU::G_UMAX:
3529+
case AMDGPU::G_ABS:
35113530
case AMDGPU::G_SHUFFLE_VECTOR:
35123531
if (isSALUMapping(MI))
35133532
return getDefaultMappingSOP(MI);

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,9 @@ def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">;
294294
} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9
295295

296296
let Defs = [SCC] in {
297-
def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
297+
// NOTE(review): the second argument is presumably the selection pattern list
// accepted by SOP1_32 (mapping a 32-bit `abs` of $src0 to $sdst) -- confirm
// against the SOP1_32 class definition, which is not shown here.
def S_ABS_I32 : SOP1_32 <"s_abs_i32",
298+
[(set i32:$sdst, (abs i32:$src0))]
299+
>;
298300
} // End Defs = [SCC]
299301

300302
let SubtargetPredicate = HasVGPRIndexMode in {
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6
3+
; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8
4+
5+
declare i16 @llvm.abs.i16(i16, i1)
6+
declare i32 @llvm.abs.i32(i32, i1)
7+
declare i64 @llvm.abs.i64(i64, i1)
8+
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
9+
10+
define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
11+
; GFX-LABEL: abs_sgpr_i16:
12+
; GFX: ; %bb.0:
13+
; GFX-NEXT: s_sext_i32_i16 s0, s0
14+
; GFX-NEXT: s_abs_i32 s0, s0
15+
; GFX-NEXT: ; return to shader part epilog
16+
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
17+
ret i16 %res
18+
}
19+
20+
define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
21+
; GFX-LABEL: abs_sgpr_i32:
22+
; GFX: ; %bb.0:
23+
; GFX-NEXT: s_abs_i32 s0, s0
24+
; GFX-NEXT: ; return to shader part epilog
25+
%res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
26+
ret i32 %res
27+
}
28+
29+
define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
30+
; GFX-LABEL: abs_sgpr_i64:
31+
; GFX: ; %bb.0:
32+
; GFX-NEXT: s_ashr_i32 s2, s1, 31
33+
; GFX-NEXT: s_add_u32 s0, s0, s2
34+
; GFX-NEXT: s_cselect_b32 s4, 1, 0
35+
; GFX-NEXT: s_and_b32 s4, s4, 1
36+
; GFX-NEXT: s_cmp_lg_u32 s4, 0
37+
; GFX-NEXT: s_mov_b32 s3, s2
38+
; GFX-NEXT: s_addc_u32 s1, s1, s2
39+
; GFX-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
40+
; GFX-NEXT: ; return to shader part epilog
41+
%res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
42+
ret i64 %res
43+
}
44+
45+
define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
46+
; GFX-LABEL: abs_sgpr_v4i32:
47+
; GFX: ; %bb.0:
48+
; GFX-NEXT: s_abs_i32 s0, s0
49+
; GFX-NEXT: s_abs_i32 s1, s1
50+
; GFX-NEXT: s_abs_i32 s2, s2
51+
; GFX-NEXT: s_abs_i32 s3, s3
52+
; GFX-NEXT: ; return to shader part epilog
53+
%res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
54+
ret <4 x i32> %res
55+
}
56+
57+
define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) {
58+
; GFX6-LABEL: abs_vgpr_i16:
59+
; GFX6: ; %bb.0:
60+
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
61+
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
62+
; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
63+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
64+
; GFX6-NEXT: ; return to shader part epilog
65+
;
66+
; GFX8-LABEL: abs_vgpr_i16:
67+
; GFX8: ; %bb.0:
68+
; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
69+
; GFX8-NEXT: v_max_i16_e32 v0, v0, v1
70+
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
71+
; GFX8-NEXT: ; return to shader part epilog
72+
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
73+
ret i16 %res
74+
}
75+
76+
define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) {
77+
; GFX6-LABEL: abs_vgpr_i32:
78+
; GFX6: ; %bb.0:
79+
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
80+
; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
81+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
82+
; GFX6-NEXT: ; return to shader part epilog
83+
;
84+
; GFX8-LABEL: abs_vgpr_i32:
85+
; GFX8: ; %bb.0:
86+
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
87+
; GFX8-NEXT: v_max_i32_e32 v0, v0, v1
88+
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
89+
; GFX8-NEXT: ; return to shader part epilog
90+
%res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
91+
ret i32 %res
92+
}
93+
94+
define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
95+
; GFX6-LABEL: abs_vgpr_i64:
96+
; GFX6: ; %bb.0:
97+
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
98+
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
99+
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
100+
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
101+
; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
102+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
103+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
104+
; GFX6-NEXT: ; return to shader part epilog
105+
;
106+
; GFX8-LABEL: abs_vgpr_i64:
107+
; GFX8: ; %bb.0:
108+
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
109+
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
110+
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
111+
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
112+
; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2
113+
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
114+
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
115+
; GFX8-NEXT: ; return to shader part epilog
116+
%res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
117+
ret i64 %res
118+
}
119+
120+
define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
121+
; GFX6-LABEL: abs_vgpr_v4i32:
122+
; GFX6: ; %bb.0:
123+
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
124+
; GFX6-NEXT: v_max_i32_e32 v0, v0, v4
125+
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
126+
; GFX6-NEXT: v_max_i32_e32 v1, v1, v4
127+
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
128+
; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
129+
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
130+
; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
131+
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
132+
; GFX6-NEXT: v_readfirstlane_b32 s1, v1
133+
; GFX6-NEXT: v_readfirstlane_b32 s2, v2
134+
; GFX6-NEXT: v_readfirstlane_b32 s3, v3
135+
; GFX6-NEXT: ; return to shader part epilog
136+
;
137+
; GFX8-LABEL: abs_vgpr_v4i32:
138+
; GFX8: ; %bb.0:
139+
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
140+
; GFX8-NEXT: v_max_i32_e32 v0, v0, v4
141+
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1
142+
; GFX8-NEXT: v_max_i32_e32 v1, v1, v4
143+
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v2
144+
; GFX8-NEXT: v_max_i32_e32 v2, v2, v4
145+
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3
146+
; GFX8-NEXT: v_max_i32_e32 v3, v3, v4
147+
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
148+
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
149+
; GFX8-NEXT: v_readfirstlane_b32 s2, v2
150+
; GFX8-NEXT: v_readfirstlane_b32 s3, v3
151+
; GFX8-NEXT: ; return to shader part epilog
152+
%res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
153+
ret <4 x i32> %res
154+
}

0 commit comments

Comments
 (0)