
Commit d62d15e

[RISCV] Undo unprofitable zext of icmp combine (#134306)
InstCombine will combine this zext of an icmp where the source has a single bit set to a lshr plus trunc (`InstCombinerImpl::transformZExtICmp`):

```llvm
define <vscale x 1 x i8> @f(<vscale x 1 x i64> %x) {
  %1 = and <vscale x 1 x i64> %x, splat (i64 8)
  %2 = icmp ne <vscale x 1 x i64> %1, splat (i64 0)
  %3 = zext <vscale x 1 x i1> %2 to <vscale x 1 x i8>
  ret <vscale x 1 x i8> %3
}
```

```llvm
define <vscale x 1 x i8> @reverse_zexticmp_i64(<vscale x 1 x i64> %x) {
  %1 = trunc <vscale x 1 x i64> %x to <vscale x 1 x i8>
  %2 = lshr <vscale x 1 x i8> %1, splat (i8 2)
  %3 = and <vscale x 1 x i8> %2, splat (i8 1)
  ret <vscale x 1 x i8> %3
}
```

In a loop, this ends up being unprofitable for RISC-V because the codegen now goes from:

```asm
f:                                      # @f
    .cfi_startproc
# %bb.0:
    vsetvli a0, zero, e64, m1, ta, ma
    vand.vi v8, v8, 8
    vmsne.vi v0, v8, 0
    vsetvli zero, zero, e8, mf8, ta, ma
    vmv.v.i v8, 0
    vmerge.vim v8, v8, 1, v0
    ret
```

to a series of narrowing vnsrl.wis:

```asm
f:                                      # @f
    .cfi_startproc
# %bb.0:
    vsetvli a0, zero, e64, m1, ta, ma
    vand.vi v8, v8, 8
    vsetvli zero, zero, e32, mf2, ta, ma
    vnsrl.wi v8, v8, 3
    vsetvli zero, zero, e16, mf4, ta, ma
    vnsrl.wi v8, v8, 0
    vsetvli zero, zero, e8, mf8, ta, ma
    vnsrl.wi v8, v8, 0
    ret
```

In the original form, the vmv.v.i is loop invariant and is hoisted out, and the vmerge.vim usually gets folded away into a masked instruction, so you usually just end up with a vsetvli + vmsne.vi. The truncate requires multiple instructions, introduces a vtype toggle for each one, and is measurably slower on the BPI-F3.

This reverses the transform in RISCVISelLowering for truncations greater than twice the bitwidth, i.e. it keeps single vnsrl.wis.

Fixes #132245
1 parent 30f2e92 commit d62d15e
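For intuition, the original form and the combined form compute the same per-element value: testing bit 3 of the wide input (the `and %x, splat (i64 8)` plus `icmp ne`) is equivalent to truncating to i8 first and then extracting bit 3, which is what the vnsrl.wi-by-3 sequence in the second asm listing does. A minimal standalone C++ check of that scalar identity, illustrative only and not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t x = 0; x < (1u << 16); ++x) {
    // Original form: zext (icmp ne (and x, 8), 0).
    uint8_t zext_of_icmp = (x & 8) != 0;
    // InstCombined form: take bit 3 of the value truncated to i8.
    uint8_t lshr_of_trunc = (static_cast<uint8_t>(x) >> 3) & 1;
    assert(zext_of_icmp == lshr_of_trunc);
  }
  return 0;
}
```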

File tree

2 files changed: +153 −0 lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 67 additions & 0 deletions
@@ -15040,6 +15040,70 @@ static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
   return combineTruncSelectToSMaxUSat(N, DAG);
 }
 
+// InstCombinerImpl::transformZExtICmp will narrow a zext of an icmp with a
+// truncation. But RVV doesn't have truncation instructions for more than twice
+// the bitwidth.
+//
+// E.g. trunc <vscale x 1 x i64> %x to <vscale x 1 x i8> will generate:
+//
+//   vsetvli a0, zero, e32, m2, ta, ma
+//   vnsrl.wi v12, v8, 0
+//   vsetvli zero, zero, e16, m1, ta, ma
+//   vnsrl.wi v8, v12, 0
+//   vsetvli zero, zero, e8, mf2, ta, ma
+//   vnsrl.wi v8, v8, 0
+//
+// So reverse the combine so we generate an vmseq/vmsne again:
+//
+// and (lshr (trunc X), ShAmt), 1
+// -->
+// zext (icmp ne (and X, (1 << ShAmt)), 0)
+//
+// and (lshr (not (trunc X)), ShAmt), 1
+// -->
+// zext (icmp eq (and X, (1 << ShAmt)), 0)
+static SDValue reverseZExtICmpCombine(SDNode *N, SelectionDAG &DAG,
+                                      const RISCVSubtarget &Subtarget) {
+  using namespace SDPatternMatch;
+  SDLoc DL(N);
+
+  if (!Subtarget.hasVInstructions())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  APInt ShAmt;
+  SDValue Inner;
+  if (!sd_match(N, m_And(m_OneUse(m_Srl(m_Value(Inner), m_ConstInt(ShAmt))),
+                         m_One())))
+    return SDValue();
+
+  SDValue X;
+  bool IsNot;
+  if (sd_match(Inner, m_Not(m_Trunc(m_Value(X)))))
+    IsNot = true;
+  else if (sd_match(Inner, m_Trunc(m_Value(X))))
+    IsNot = false;
+  else
+    return SDValue();
+
+  EVT WideVT = X.getValueType();
+  if (VT.getScalarSizeInBits() >= WideVT.getScalarSizeInBits() / 2)
+    return SDValue();
+
+  SDValue Res =
+      DAG.getNode(ISD::AND, DL, WideVT, X,
+                  DAG.getConstant(1 << ShAmt.getZExtValue(), DL, WideVT));
+  Res = DAG.getSetCC(DL,
+                     EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                      WideVT.getVectorElementCount()),
+                     Res, DAG.getConstant(0, DL, WideVT),
+                     IsNot ? ISD::SETEQ : ISD::SETNE);
+  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Res);
+}
+
 // Combines two comparison operation and logic operation to one selection
 // operation(min, max) and logic operation. Returns new constructed Node if
 // conditions for optimization are satisfied.
@@ -15067,6 +15131,9 @@ static SDValue performANDCombine(SDNode *N,
     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, And);
   }
 
+  if (SDValue V = reverseZExtICmpCombine(N, DAG, Subtarget))
+    return V;
+
   if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
     return V;
   if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
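As a scalar model of the two rewrites performed by reverseZExtICmpCombine (plain C++, illustrative only; `ShAmt = 2` and the u64/u8 widths stand in for the wide and narrow vector element types):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const unsigned ShAmt = 2; // must index a bit that survives the truncation
  for (uint64_t X = 0; X < (1u << 16); ++X) {
    // and (lshr (trunc X), ShAmt), 1  ->  zext (icmp ne (and X, 1 << ShAmt), 0)
    uint8_t TruncForm = (static_cast<uint8_t>(X) >> ShAmt) & 1;
    uint8_t ICmpNeForm = (X & (uint64_t{1} << ShAmt)) != 0;
    assert(TruncForm == ICmpNeForm);

    // and (lshr (not (trunc X)), ShAmt), 1  ->  zext (icmp eq (and X, 1 << ShAmt), 0)
    uint8_t NotTruncForm =
        (static_cast<uint8_t>(~static_cast<uint8_t>(X)) >> ShAmt) & 1;
    uint8_t ICmpEqForm = (X & (uint64_t{1} << ShAmt)) == 0;
    assert(NotTruncForm == ICmpEqForm);
  }
  return 0;
}
```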
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s
+
+; Test that we reverse InstCombinerImpl::transformZExtICmp when unprofitable
+
+define <vscale x 1 x i8> @reverse_zexticmp_i16(<vscale x 1 x i16> %x) {
+; CHECK-LABEL: reverse_zexticmp_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vi v8, v8, 1
+; CHECK-NEXT:    ret
+  %1 = trunc <vscale x 1 x i16> %x to <vscale x 1 x i8>
+  %2 = lshr <vscale x 1 x i8> %1, splat (i8 2)
+  %3 = and <vscale x 1 x i8> %2, splat (i8 1)
+  ret <vscale x 1 x i8> %3
+}
+
+define <vscale x 1 x i8> @reverse_zexticmp_i32(<vscale x 1 x i32> %x) {
+; CHECK-LABEL: reverse_zexticmp_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vand.vi v8, v8, 4
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    ret
+  %1 = trunc <vscale x 1 x i32> %x to <vscale x 1 x i8>
+  %2 = lshr <vscale x 1 x i8> %1, splat (i8 2)
+  %3 = and <vscale x 1 x i8> %2, splat (i8 1)
+  ret <vscale x 1 x i8> %3
+}
+
+define <vscale x 1 x i8> @reverse_zexticmp_neg_i32(<vscale x 1 x i32> %x) {
+; CHECK-LABEL: reverse_zexticmp_neg_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vand.vi v8, v8, 4
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    ret
+  %1 = trunc <vscale x 1 x i32> %x to <vscale x 1 x i8>
+  %2 = xor <vscale x 1 x i8> %1, splat (i8 -1)
+  %3 = lshr <vscale x 1 x i8> %2, splat (i8 2)
+  %4 = and <vscale x 1 x i8> %3, splat (i8 1)
+  ret <vscale x 1 x i8> %4
+}
+
+define <vscale x 1 x i8> @reverse_zexticmp_i64(<vscale x 1 x i64> %x) {
+; CHECK-LABEL: reverse_zexticmp_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vand.vi v8, v8, 4
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    ret
+  %1 = trunc <vscale x 1 x i64> %x to <vscale x 1 x i8>
+  %2 = lshr <vscale x 1 x i8> %1, splat (i8 2)
+  %3 = and <vscale x 1 x i8> %2, splat (i8 1)
+  ret <vscale x 1 x i8> %3
+}
+
+define <vscale x 1 x i8> @reverse_zexticmp_neg_i64(<vscale x 1 x i64> %x) {
+; CHECK-LABEL: reverse_zexticmp_neg_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vand.vi v8, v8, 4
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    ret
+  %1 = trunc <vscale x 1 x i64> %x to <vscale x 1 x i8>
+  %2 = xor <vscale x 1 x i8> %1, splat (i8 -1)
+  %3 = lshr <vscale x 1 x i8> %2, splat (i8 2)
+  %4 = and <vscale x 1 x i8> %3, splat (i8 1)
+  ret <vscale x 1 x i8> %4
+}
+
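Note how reverse_zexticmp_i16 above keeps the trunc/lshr/and form (a single vnsrl.wi) while the i32 and i64 cases are rewritten to vmsne/vmseq: the combine only fires when the truncation would take more than one narrowing step. A tiny illustrative helper mirroring that size check (hypothetical name, not part of the patch):

```cpp
// Mirrors `VT.getScalarSizeInBits() >= WideVT.getScalarSizeInBits() / 2` in
// reverseZExtICmpCombine: only reverse when one vnsrl.wi cannot do the truncation.
static bool shouldReverseZExtICmp(unsigned NarrowBits, unsigned WideBits) {
  return NarrowBits < WideBits / 2;
}

// shouldReverseZExtICmp(8, 16) == false -> keep the single vnsrl.wi lowering
// shouldReverseZExtICmp(8, 32) == true  -> rewrite to vmsne.vi/vmseq.vi + vmerge
// shouldReverseZExtICmp(8, 64) == true  -> rewrite to vmsne.vi/vmseq.vi + vmerge
```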
