Skip to content

Commit 370ebc9

Browse files
committed
[DAG] Attempt to fold bswap(shl(x,c)) -> zext(bswap(trunc(shl(x,c-bw/2))))
If the shl amount is at least half the bitwidth (i.e. the lower half of the bswap source is zero), then we can reduce the shift amount by half the bitwidth, perform the bswap at half the bitwidth, and just zero extend the result. Based off PR51391 + PR53867. Differential Revision: https://reviews.llvm.org/D120192
1 parent b3e9fdd commit 370ebc9

File tree

5 files changed

+59
-28
lines changed

5 files changed

+59
-28
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9610,6 +9610,26 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) {
96109610
return DAG.getNode(ISD::BITREVERSE, DL, VT, BSwap);
96119611
}
96129612

9613+
// fold (bswap shl(x,c)) -> (zext(bswap(trunc(shl(x,sub(c,bw/2))))))
9614+
// iff c >= bw/2 (i.e. lower half is known zero)
9615+
unsigned BW = VT.getScalarSizeInBits();
9616+
if (BW >= 32 && N0.getOpcode() == ISD::SHL && N0.hasOneUse()) {
9617+
auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
9618+
EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), BW / 2);
9619+
if (ShAmt && ShAmt->getAPIntValue().ult(BW) &&
9620+
ShAmt->getZExtValue() >= (BW / 2) &&
9621+
(ShAmt->getZExtValue() % 16) == 0 && TLI.isTruncateFree(VT, HalfVT) &&
9622+
(!LegalOperations || hasOperation(ISD::BSWAP, HalfVT))) {
9623+
SDValue Res = N0.getOperand(0);
9624+
if (uint64_t NewShAmt = (ShAmt->getZExtValue() - (BW / 2)))
9625+
Res = DAG.getNode(ISD::SHL, DL, VT, Res,
9626+
DAG.getConstant(NewShAmt, DL, getShiftAmountTy(VT)));
9627+
Res = DAG.getZExtOrTrunc(Res, DL, HalfVT);
9628+
Res = DAG.getNode(ISD::BSWAP, DL, HalfVT, Res);
9629+
return DAG.getZExtOrTrunc(Res, DL, VT);
9630+
}
9631+
}
9632+
96139633
return SDValue();
96149634
}
96159635

llvm/test/CodeGen/AArch64/load-combine-big-endian.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -442,8 +442,8 @@ define i32 @zext_load_i32_by_i8(i32* %arg) {
442442
; CHECK-LABEL: zext_load_i32_by_i8:
443443
; CHECK: // %bb.0:
444444
; CHECK-NEXT: ldrh w8, [x0]
445-
; CHECK-NEXT: lsl w8, w8, #16
446-
; CHECK-NEXT: rev w0, w8
445+
; CHECK-NEXT: rev w8, w8
446+
; CHECK-NEXT: lsr w0, w8, #16
447447
; CHECK-NEXT: ret
448448
%tmp = bitcast i32* %arg to i8*
449449
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0

llvm/test/CodeGen/AArch64/load-combine.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -499,8 +499,8 @@ define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
499499
; CHECK-LABEL: zext_load_i32_by_i8_bswap:
500500
; CHECK: // %bb.0:
501501
; CHECK-NEXT: ldrh w8, [x0]
502-
; CHECK-NEXT: lsl w8, w8, #16
503-
; CHECK-NEXT: rev w0, w8
502+
; CHECK-NEXT: rev w8, w8
503+
; CHECK-NEXT: lsr w0, w8, #16
504504
; CHECK-NEXT: ret
505505

506506
%tmp = bitcast i32* %arg to i8*

llvm/test/CodeGen/X86/combine-bswap.ll

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -87,17 +87,16 @@ define void @demand_one_loaded_byte(i64* %xp, i32* %yp) {
8787
define i64 @test_bswap64_shift48_zext(i16 %a0) {
8888
; X86-LABEL: test_bswap64_shift48_zext:
8989
; X86: # %bb.0:
90-
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
91-
; X86-NEXT: shll $16, %eax
92-
; X86-NEXT: bswapl %eax
90+
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
91+
; X86-NEXT: rolw $8, %ax
92+
; X86-NEXT: movzwl %ax, %eax
9393
; X86-NEXT: xorl %edx, %edx
9494
; X86-NEXT: retl
9595
;
9696
; X64-LABEL: test_bswap64_shift48_zext:
9797
; X64: # %bb.0:
98-
; X64-NEXT: movl %edi, %eax
99-
; X64-NEXT: shlq $48, %rax
100-
; X64-NEXT: bswapq %rax
98+
; X64-NEXT: rolw $8, %di
99+
; X64-NEXT: movzwl %di, %eax
101100
; X64-NEXT: retq
102101
%z = zext i16 %a0 to i64
103102
%s = shl i64 %z, 48
@@ -109,16 +108,15 @@ define i64 @test_bswap64_shift48(i64 %a0) {
109108
; X86-LABEL: test_bswap64_shift48:
110109
; X86: # %bb.0:
111110
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
112-
; X86-NEXT: shll $16, %eax
113-
; X86-NEXT: bswapl %eax
111+
; X86-NEXT: rolw $8, %ax
112+
; X86-NEXT: movzwl %ax, %eax
114113
; X86-NEXT: xorl %edx, %edx
115114
; X86-NEXT: retl
116115
;
117116
; X64-LABEL: test_bswap64_shift48:
118117
; X64: # %bb.0:
119-
; X64-NEXT: movq %rdi, %rax
120-
; X64-NEXT: shlq $48, %rax
121-
; X64-NEXT: bswapq %rax
118+
; X64-NEXT: rolw $8, %di
119+
; X64-NEXT: movzwl %di, %eax
122120
; X64-NEXT: retq
123121
%s = shl i64 %a0, 48
124122
%b = call i64 @llvm.bswap.i64(i64 %s)

llvm/test/CodeGen/X86/load-combine.ll

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1209,20 +1209,33 @@ define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
12091209
; i8* p;
12101210
; (i32) p[1] | ((i32) p[0] << 8)
12111211
define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
1212-
; CHECK-LABEL: zext_load_i32_by_i8_bswap:
1213-
; CHECK: # %bb.0:
1214-
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
1215-
; CHECK-NEXT: movzwl (%eax), %eax
1216-
; CHECK-NEXT: shll $16, %eax
1217-
; CHECK-NEXT: bswapl %eax
1218-
; CHECK-NEXT: retl
1212+
; BSWAP-LABEL: zext_load_i32_by_i8_bswap:
1213+
; BSWAP: # %bb.0:
1214+
; BSWAP-NEXT: movl {{[0-9]+}}(%esp), %eax
1215+
; BSWAP-NEXT: movzwl (%eax), %eax
1216+
; BSWAP-NEXT: rolw $8, %ax
1217+
; BSWAP-NEXT: movzwl %ax, %eax
1218+
; BSWAP-NEXT: retl
12191219
;
1220-
; CHECK64-LABEL: zext_load_i32_by_i8_bswap:
1221-
; CHECK64: # %bb.0:
1222-
; CHECK64-NEXT: movzwl (%rdi), %eax
1223-
; CHECK64-NEXT: shll $16, %eax
1224-
; CHECK64-NEXT: bswapl %eax
1225-
; CHECK64-NEXT: retq
1220+
; MOVBE-LABEL: zext_load_i32_by_i8_bswap:
1221+
; MOVBE: # %bb.0:
1222+
; MOVBE-NEXT: movl {{[0-9]+}}(%esp), %eax
1223+
; MOVBE-NEXT: movbew (%eax), %ax
1224+
; MOVBE-NEXT: movzwl %ax, %eax
1225+
; MOVBE-NEXT: retl
1226+
;
1227+
; BSWAP64-LABEL: zext_load_i32_by_i8_bswap:
1228+
; BSWAP64: # %bb.0:
1229+
; BSWAP64-NEXT: movzwl (%rdi), %eax
1230+
; BSWAP64-NEXT: rolw $8, %ax
1231+
; BSWAP64-NEXT: movzwl %ax, %eax
1232+
; BSWAP64-NEXT: retq
1233+
;
1234+
; MOVBE64-LABEL: zext_load_i32_by_i8_bswap:
1235+
; MOVBE64: # %bb.0:
1236+
; MOVBE64-NEXT: movbew (%rdi), %ax
1237+
; MOVBE64-NEXT: movzwl %ax, %eax
1238+
; MOVBE64-NEXT: retq
12261239
%tmp = bitcast i32* %arg to i8*
12271240
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
12281241
%tmp2 = load i8, i8* %tmp1, align 1

0 commit comments

Comments
 (0)