Skip to content

Commit d5521d1

Browse files
[DAG] Reducing instructions by better legalization handling of AVGFLOORU for illegal data types (#99913)
**Issue:** rust-lang/rust#124790 **Previous PR:** #99614 https://rust.godbolt.org/z/T7eKP3Tvo **Aarch64:** https://alive2.llvm.org/ce/z/dqr2Kg **x86:** https://alive2.llvm.org/ce/z/ze88Hw cc: @RKSimon @topperc
1 parent 2402b32 commit d5521d1

File tree

5 files changed

+329
-60
lines changed

5 files changed

+329
-60
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9379,6 +9379,26 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const {
93799379
}
93809380
}
93819381

9382+
// avgflooru(lhs, rhs) -> or(lshr(add(lhs, rhs),1),shl(overflow, typesize-1))
9383+
if (Opc == ISD::AVGFLOORU && VT.isScalarInteger() && !isTypeLegal(VT)) {
9384+
SDValue UAddWithOverflow =
9385+
DAG.getNode(ISD::UADDO, dl, DAG.getVTList(VT, MVT::i1), {RHS, LHS});
9386+
9387+
SDValue Sum = UAddWithOverflow.getValue(0);
9388+
SDValue Overflow = UAddWithOverflow.getValue(1);
9389+
9390+
// Right shift the sum by 1
9391+
SDValue One = DAG.getShiftAmountConstant(1, VT, dl);
9392+
SDValue LShrVal = DAG.getNode(ISD::SRL, dl, VT, Sum, One);
9393+
9394+
SDValue ZeroExtOverflow = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Overflow);
9395+
SDValue OverflowShl =
9396+
DAG.getNode(ISD::SHL, dl, VT, ZeroExtOverflow,
9397+
DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT));
9398+
9399+
return DAG.getNode(ISD::OR, dl, VT, LShrVal, OverflowShl);
9400+
}
9401+
93829402
// avgceils(lhs, rhs) -> sub(or(lhs,rhs),ashr(xor(lhs,rhs),1))
93839403
// avgceilu(lhs, rhs) -> sub(or(lhs,rhs),lshr(xor(lhs,rhs),1))
93849404
// avgfloors(lhs, rhs) -> add(and(lhs,rhs),ashr(xor(lhs,rhs),1))
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
3+
4+
define i128 @avgflooru_i128(i128 %x, i128 %y) {
5+
; CHECK-LABEL: avgflooru_i128:
6+
; CHECK: // %bb.0: // %start
7+
; CHECK-NEXT: adds x8, x0, x2
8+
; CHECK-NEXT: adcs x9, x1, x3
9+
; CHECK-NEXT: cset w10, hs
10+
; CHECK-NEXT: extr x0, x9, x8, #1
11+
; CHECK-NEXT: extr x1, x10, x9, #1
12+
; CHECK-NEXT: ret
13+
start:
14+
%xor = xor i128 %y, %x
15+
%lshr = lshr i128 %xor, 1
16+
%and = and i128 %y, %x
17+
%add = add i128 %lshr, %and
18+
ret i128 %add
19+
}
20+
21+
declare void @use(i8)
22+
23+
define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind {
24+
; CHECK-LABEL: avgflooru_i128_multi_use:
25+
; CHECK: // %bb.0: // %start
26+
; CHECK-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill
27+
; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
28+
; CHECK-NEXT: eor x23, x3, x1
29+
; CHECK-NEXT: eor x24, x2, x0
30+
; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
31+
; CHECK-NEXT: mov x21, x1
32+
; CHECK-NEXT: mov x22, x0
33+
; CHECK-NEXT: mov x0, x24
34+
; CHECK-NEXT: mov x1, x23
35+
; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
36+
; CHECK-NEXT: mov x19, x3
37+
; CHECK-NEXT: mov x20, x2
38+
; CHECK-NEXT: bl use
39+
; CHECK-NEXT: extr x0, x23, x24, #1
40+
; CHECK-NEXT: lsr x1, x23, #1
41+
; CHECK-NEXT: bl use
42+
; CHECK-NEXT: adds x8, x22, x20
43+
; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
44+
; CHECK-NEXT: adcs x9, x21, x19
45+
; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
46+
; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
47+
; CHECK-NEXT: cset w10, hs
48+
; CHECK-NEXT: extr x0, x9, x8, #1
49+
; CHECK-NEXT: extr x1, x10, x9, #1
50+
; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload
51+
; CHECK-NEXT: ret
52+
start:
53+
%xor = xor i128 %y, %x
54+
call void @use(i128 %xor)
55+
%lshr = lshr i128 %xor, 1
56+
call void @use(i128 %lshr)
57+
%and = and i128 %y, %x
58+
%add = add i128 %lshr, %and
59+
ret i128 %add
60+
}
61+
62+
; the `avgflooru_i128_negative` test shouldn't combine because it's not
63+
; an avgflooru operation, which is what we're targeting
64+
65+
define i128 @avgflooru_i128_negative(i128 %x, i128 %y) {
66+
; CHECK-LABEL: avgflooru_i128_negative:
67+
; CHECK: // %bb.0: // %start
68+
; CHECK-NEXT: mvn x8, x0
69+
; CHECK-NEXT: and x9, x2, x0
70+
; CHECK-NEXT: mvn x10, x1
71+
; CHECK-NEXT: and x11, x3, x1
72+
; CHECK-NEXT: adds x0, x8, x9
73+
; CHECK-NEXT: adc x1, x10, x11
74+
; CHECK-NEXT: ret
75+
start:
76+
%xor = xor i128 %x, -1
77+
%and = and i128 %y, %x
78+
%add = add i128 %xor, %and
79+
ret i128 %add
80+
}
81+
82+
; This negative test case shouldn't combine, i32 is already properly
83+
; handled in terms of legalization, compared to the i128
84+
85+
define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) {
86+
; CHECK-LABEL: avgflooru_i128_negative2:
87+
; CHECK: // %bb.0: // %start
88+
; CHECK-NEXT: mov w8, w1
89+
; CHECK-NEXT: add x8, x8, w0, uxtw
90+
; CHECK-NEXT: lsr x0, x8, #1
91+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
92+
; CHECK-NEXT: ret
93+
start:
94+
%xor = xor i32 %y, %x
95+
%lshr = lshr i32 %xor, 1
96+
%and = and i32 %y, %x
97+
%add = add i32 %lshr, %and
98+
ret i32 %add
99+
}
100+
101+
define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
102+
; CHECK-LABEL: avgflooru_i128_vec:
103+
; CHECK: // %bb.0: // %start
104+
; CHECK-NEXT: adds x8, x0, x4
105+
; CHECK-NEXT: adcs x9, x1, x5
106+
; CHECK-NEXT: cset w10, hs
107+
; CHECK-NEXT: adds x11, x2, x6
108+
; CHECK-NEXT: extr x0, x9, x8, #1
109+
; CHECK-NEXT: adcs x12, x3, x7
110+
; CHECK-NEXT: extr x1, x10, x9, #1
111+
; CHECK-NEXT: extr x11, x12, x11, #1
112+
; CHECK-NEXT: cset w13, hs
113+
; CHECK-NEXT: extr x3, x13, x12, #1
114+
; CHECK-NEXT: fmov d0, x11
115+
; CHECK-NEXT: mov v0.d[1], x3
116+
; CHECK-NEXT: fmov x2, d0
117+
; CHECK-NEXT: ret
118+
start:
119+
%xor = xor <2 x i128> %y, %x
120+
%lshr = lshr <2 x i128> %xor, <i128 1, i128 1>
121+
%and = and <2 x i128> %y, %x
122+
%add = add <2 x i128> %lshr, %and
123+
ret <2 x i128> %add
124+
}

llvm/test/CodeGen/RISCV/avgflooru.ll

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -164,18 +164,20 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
164164
define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
165165
; RV32I-LABEL: test_fixed_i64:
166166
; RV32I: # %bb.0:
167-
; RV32I-NEXT: and a4, a1, a3
168-
; RV32I-NEXT: xor a1, a1, a3
169-
; RV32I-NEXT: srli a3, a1, 1
170-
; RV32I-NEXT: add a3, a4, a3
171-
; RV32I-NEXT: slli a1, a1, 31
172-
; RV32I-NEXT: xor a4, a0, a2
173-
; RV32I-NEXT: srli a4, a4, 1
174-
; RV32I-NEXT: or a1, a4, a1
175-
; RV32I-NEXT: and a2, a0, a2
176-
; RV32I-NEXT: add a0, a2, a1
167+
; RV32I-NEXT: add a4, a3, a1
168+
; RV32I-NEXT: add a0, a2, a0
177169
; RV32I-NEXT: sltu a1, a0, a2
178-
; RV32I-NEXT: add a1, a3, a1
170+
; RV32I-NEXT: add a2, a4, a1
171+
; RV32I-NEXT: beq a2, a3, .LBB6_2
172+
; RV32I-NEXT: # %bb.1:
173+
; RV32I-NEXT: sltu a1, a2, a3
174+
; RV32I-NEXT: .LBB6_2:
175+
; RV32I-NEXT: slli a1, a1, 31
176+
; RV32I-NEXT: srli a3, a2, 1
177+
; RV32I-NEXT: or a1, a3, a1
178+
; RV32I-NEXT: slli a2, a2, 31
179+
; RV32I-NEXT: srli a0, a0, 1
180+
; RV32I-NEXT: or a0, a0, a2
179181
; RV32I-NEXT: ret
180182
;
181183
; RV64I-LABEL: test_fixed_i64:
@@ -195,18 +197,20 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
195197
define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
196198
; RV32I-LABEL: test_ext_i64:
197199
; RV32I: # %bb.0:
198-
; RV32I-NEXT: and a4, a1, a3
199-
; RV32I-NEXT: xor a1, a1, a3
200-
; RV32I-NEXT: srli a3, a1, 1
201-
; RV32I-NEXT: add a3, a4, a3
202-
; RV32I-NEXT: slli a1, a1, 31
203-
; RV32I-NEXT: xor a4, a0, a2
204-
; RV32I-NEXT: srli a4, a4, 1
205-
; RV32I-NEXT: or a1, a4, a1
206-
; RV32I-NEXT: and a2, a0, a2
207-
; RV32I-NEXT: add a0, a2, a1
200+
; RV32I-NEXT: add a4, a3, a1
201+
; RV32I-NEXT: add a0, a2, a0
208202
; RV32I-NEXT: sltu a1, a0, a2
209-
; RV32I-NEXT: add a1, a3, a1
203+
; RV32I-NEXT: add a2, a4, a1
204+
; RV32I-NEXT: beq a2, a3, .LBB7_2
205+
; RV32I-NEXT: # %bb.1:
206+
; RV32I-NEXT: sltu a1, a2, a3
207+
; RV32I-NEXT: .LBB7_2:
208+
; RV32I-NEXT: slli a1, a1, 31
209+
; RV32I-NEXT: srli a3, a2, 1
210+
; RV32I-NEXT: or a1, a3, a1
211+
; RV32I-NEXT: slli a2, a2, 31
212+
; RV32I-NEXT: srli a0, a0, 1
213+
; RV32I-NEXT: or a0, a0, a2
210214
; RV32I-NEXT: ret
211215
;
212216
; RV64I-LABEL: test_ext_i64:
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
3+
4+
define i128 @avgflooru_i128(i128 %x, i128 %y) {
5+
; CHECK-LABEL: avgflooru_i128:
6+
; CHECK: # %bb.0: # %start
7+
; CHECK-NEXT: movq %rdi, %rax
8+
; CHECK-NEXT: addq %rdx, %rax
9+
; CHECK-NEXT: adcq %rcx, %rsi
10+
; CHECK-NEXT: setb %cl
11+
; CHECK-NEXT: shrdq $1, %rsi, %rax
12+
; CHECK-NEXT: movzbl %cl, %edx
13+
; CHECK-NEXT: shldq $63, %rsi, %rdx
14+
; CHECK-NEXT: retq
15+
start:
16+
%xor = xor i128 %y, %x
17+
%lshr = lshr i128 %xor, 1
18+
%and = and i128 %y, %x
19+
%add = add i128 %lshr, %and
20+
ret i128 %add
21+
}
22+
23+
declare void @use(i8)
24+
25+
define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind {
26+
; CHECK-LABEL: avgflooru_i128_multi_use:
27+
; CHECK: # %bb.0: # %start
28+
; CHECK-NEXT: pushq %rbp
29+
; CHECK-NEXT: pushq %r15
30+
; CHECK-NEXT: pushq %r14
31+
; CHECK-NEXT: pushq %r13
32+
; CHECK-NEXT: pushq %r12
33+
; CHECK-NEXT: pushq %rbx
34+
; CHECK-NEXT: pushq %rax
35+
; CHECK-NEXT: movq %rcx, %rbx
36+
; CHECK-NEXT: movq %rdx, %r14
37+
; CHECK-NEXT: movq %rsi, %r15
38+
; CHECK-NEXT: movq %rdi, %r12
39+
; CHECK-NEXT: movq %rdx, %r13
40+
; CHECK-NEXT: xorq %rdi, %r13
41+
; CHECK-NEXT: movq %rcx, %rbp
42+
; CHECK-NEXT: xorq %rsi, %rbp
43+
; CHECK-NEXT: movq %r13, %rdi
44+
; CHECK-NEXT: movq %rbp, %rsi
45+
; CHECK-NEXT: callq use@PLT
46+
; CHECK-NEXT: shrdq $1, %rbp, %r13
47+
; CHECK-NEXT: shrq %rbp
48+
; CHECK-NEXT: movq %r13, %rdi
49+
; CHECK-NEXT: movq %rbp, %rsi
50+
; CHECK-NEXT: callq use@PLT
51+
; CHECK-NEXT: addq %r14, %r12
52+
; CHECK-NEXT: adcq %rbx, %r15
53+
; CHECK-NEXT: setb %al
54+
; CHECK-NEXT: shrdq $1, %r15, %r12
55+
; CHECK-NEXT: movzbl %al, %edx
56+
; CHECK-NEXT: shldq $63, %r15, %rdx
57+
; CHECK-NEXT: movq %r12, %rax
58+
; CHECK-NEXT: addq $8, %rsp
59+
; CHECK-NEXT: popq %rbx
60+
; CHECK-NEXT: popq %r12
61+
; CHECK-NEXT: popq %r13
62+
; CHECK-NEXT: popq %r14
63+
; CHECK-NEXT: popq %r15
64+
; CHECK-NEXT: popq %rbp
65+
; CHECK-NEXT: retq
66+
start:
67+
%xor = xor i128 %y, %x
68+
call void @use(i128 %xor)
69+
%lshr = lshr i128 %xor, 1
70+
call void @use(i128 %lshr)
71+
%and = and i128 %y, %x
72+
%add = add i128 %lshr, %and
73+
ret i128 %add
74+
}
75+
76+
; This test case shouldn't combine because it's not
77+
; an avgflooru operation
78+
79+
define i128 @avgflooru_i128_negative(i128 %x, i128 %y) {
80+
; CHECK-LABEL: avgflooru_i128_negative:
81+
; CHECK: # %bb.0: # %start
82+
; CHECK-NEXT: movq %rdi, %rax
83+
; CHECK-NEXT: andq %rsi, %rcx
84+
; CHECK-NEXT: notq %rsi
85+
; CHECK-NEXT: andq %rdi, %rdx
86+
; CHECK-NEXT: notq %rax
87+
; CHECK-NEXT: addq %rdx, %rax
88+
; CHECK-NEXT: adcq %rcx, %rsi
89+
; CHECK-NEXT: movq %rsi, %rdx
90+
; CHECK-NEXT: retq
91+
start:
92+
%xor = xor i128 %x, -1
93+
%and = and i128 %y, %x
94+
%add = add i128 %xor, %and
95+
ret i128 %add
96+
}
97+
98+
; This negative test case shouldn't combine, i32 is already properly
99+
; handled in terms of legalization, compared to the i128
100+
101+
define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) {
102+
; CHECK-LABEL: avgflooru_i128_negative2:
103+
; CHECK: # %bb.0: # %start
104+
; CHECK-NEXT: movl %edi, %ecx
105+
; CHECK-NEXT: movl %esi, %eax
106+
; CHECK-NEXT: addq %rcx, %rax
107+
; CHECK-NEXT: shrq %rax
108+
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
109+
; CHECK-NEXT: retq
110+
start:
111+
%xor = xor i32 %y, %x
112+
%lshr = lshr i32 %xor, 1
113+
%and = and i32 %y, %x
114+
%add = add i32 %lshr, %and
115+
ret i32 %add
116+
}
117+
118+
define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
119+
; CHECK-LABEL: avgflooru_i128_vec:
120+
; CHECK: # %bb.0: # %start
121+
; CHECK-NEXT: movq %rdi, %rax
122+
; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi
123+
; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
124+
; CHECK-NEXT: setb %dil
125+
; CHECK-NEXT: movzbl %dil, %edi
126+
; CHECK-NEXT: shldq $63, %rdx, %rdi
127+
; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx
128+
; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8
129+
; CHECK-NEXT: setb %r9b
130+
; CHECK-NEXT: movzbl %r9b, %r9d
131+
; CHECK-NEXT: shldq $63, %r8, %r9
132+
; CHECK-NEXT: shldq $63, %rsi, %rdx
133+
; CHECK-NEXT: shldq $63, %rcx, %r8
134+
; CHECK-NEXT: movq %r8, 16(%rax)
135+
; CHECK-NEXT: movq %rdx, (%rax)
136+
; CHECK-NEXT: movq %r9, 24(%rax)
137+
; CHECK-NEXT: movq %rdi, 8(%rax)
138+
; CHECK-NEXT: retq
139+
start:
140+
%xor = xor <2 x i128> %y, %x
141+
%lshr = lshr <2 x i128> %xor, <i128 1, i128 1>
142+
%and = and <2 x i128> %y, %x
143+
%add = add <2 x i128> %lshr, %and
144+
ret <2 x i128> %add
145+
}

0 commit comments

Comments
 (0)