[RISCV] Optimize (slli (srli (slli X, C1), C1), C2) -> (srli (slli X, C1), C1-C2) #119567
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Piotr Fusik (pfusik)

Changes

Patch is 72.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119567.diff

6 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index c5432619a36462..4490e1b4c035cd 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1026,13 +1026,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
unsigned ShAmt = N1C->getZExtValue();
uint64_t Mask = N0.getConstantOperandVal(1);
- // Optimize (shl (and X, C2), C) -> (slli (srliw X, C3), C3+C) where C2 has
- // 32 leading zeros and C3 trailing zeros.
if (ShAmt <= 32 && isShiftedMask_64(Mask)) {
unsigned XLen = Subtarget->getXLen();
unsigned LeadingZeros = XLen - llvm::bit_width(Mask);
unsigned TrailingZeros = llvm::countr_zero(Mask);
if (TrailingZeros > 0 && LeadingZeros == 32) {
+ // Optimize (shl (and X, C2), C) -> (slli (srliw X, C3), C3+C)
+ // where C2 has 32 leading zeros and C3 trailing zeros.
SDNode *SRLIW = CurDAG->getMachineNode(
RISCV::SRLIW, DL, VT, N0->getOperand(0),
CurDAG->getTargetConstant(TrailingZeros, DL, VT));
@@ -1042,6 +1042,22 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, SLLI);
return;
}
+ else if (TrailingZeros == 0 && LeadingZeros > ShAmt &&
+ XLen - LeadingZeros > 11 && LeadingZeros != 32) {
+ // Optimize (shl (and X, C2), C) -> (srli (slli X, C4), C4-C)
+ // where C2 has C4 leading zeros and no trailing zeros.
+ // This is profitable if the "and" was to be lowered to
+ // (srli (slli X, C4), C4) and not (andi X, C2).
+ // For "LeadingZeros == 32" we prefer Zba (slli.uw X, C).
+ SDNode *SLLI = CurDAG->getMachineNode(
+ RISCV::SLLI, DL, VT, N0->getOperand(0),
+ CurDAG->getTargetConstant(LeadingZeros, DL, VT));
+ SDNode *SRLI = CurDAG->getMachineNode(
+ RISCV::SRLI, DL, VT, SDValue(SLLI, 0),
+ CurDAG->getTargetConstant(LeadingZeros - ShAmt, DL, VT));
+ ReplaceNode(Node, SRLI);
+ return;
+ }
}
break;
}
diff --git a/llvm/test/CodeGen/RISCV/and-shl.ll b/llvm/test/CodeGen/RISCV/and-shl.ll
new file mode 100644
index 00000000000000..754df62fb4307e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/and-shl.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s -check-prefix=RV64I
+
+define i32 @and_0xfff_shl_2(i32 %x) {
+; RV32I-LABEL: and_0xfff_shl_2:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a0, a0, 20
+; RV32I-NEXT: srli a0, a0, 18
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: and_0xfff_shl_2:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a0, a0, 52
+; RV64I-NEXT: srli a0, a0, 50
+; RV64I-NEXT: ret
+ %a = and i32 %x, 4095
+ %s = shl i32 %a, 2
+ ret i32 %s
+}
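Aside, not part of the patch: the and-shl.ll checks above pin the emitted shift amounts; the arithmetic identity they rely on can be cross-checked in plain C, assuming a 64-bit XLEN (constants below are taken from the RV64I test, everything else is illustrative).

/* Standalone sanity check, not part of the patch: for C2 = 0xfff
 * (52 leading zeros on a 64-bit XLEN, no trailing zeros) and a left
 * shift by C = 2, (X & C2) << C equals (X << 52) >> (52 - 2), which
 * matches the slli 52 / srli 50 pair checked above for RV64I. */
#include <assert.h>
#include <stdint.h>

int main(void) {
  const uint64_t C2 = 0xfff; /* mask with C4 = 52 leading zeros */
  const unsigned C = 2;      /* shl amount */
  const unsigned C4 = 52;    /* leading zeros of C2 */
  const uint64_t tests[] = {0, 0x1234, 0x8000000000000abcULL,
                            0xffffffffffffffffULL};
  for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; ++i) {
    uint64_t x = tests[i];
    assert(((x & C2) << C) == ((x << C4) >> (C4 - C)));
  }
  return 0;
}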
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
index 7f4483a8f77d9c..ddcb3c3121bc3d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
@@ -124,42 +124,40 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) {
; ZVFH32: # %bb.0:
; ZVFH32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFH32-NEXT: vfncvt.rtz.x.f.w v9, v8
-; ZVFH32-NEXT: lui a1, 8
; ZVFH32-NEXT: vslidedown.vi v8, v9, 2
-; ZVFH32-NEXT: vmv.x.s a2, v9
-; ZVFH32-NEXT: addi a1, a1, -1
+; ZVFH32-NEXT: vmv.x.s a1, v9
; ZVFH32-NEXT: vslidedown.vi v9, v9, 1
-; ZVFH32-NEXT: vmv.x.s a3, v8
-; ZVFH32-NEXT: and a2, a2, a1
-; ZVFH32-NEXT: vmv.x.s a4, v9
-; ZVFH32-NEXT: and a1, a4, a1
-; ZVFH32-NEXT: slli a4, a3, 17
-; ZVFH32-NEXT: slli a3, a3, 30
-; ZVFH32-NEXT: srli a4, a4, 19
-; ZVFH32-NEXT: slli a1, a1, 15
-; ZVFH32-NEXT: or a2, a2, a3
-; ZVFH32-NEXT: or a1, a2, a1
+; ZVFH32-NEXT: vmv.x.s a2, v8
+; ZVFH32-NEXT: slli a1, a1, 17
+; ZVFH32-NEXT: srli a1, a1, 17
+; ZVFH32-NEXT: slli a3, a2, 30
+; ZVFH32-NEXT: or a1, a1, a3
+; ZVFH32-NEXT: vmv.x.s a3, v9
+; ZVFH32-NEXT: slli a2, a2, 17
+; ZVFH32-NEXT: slli a3, a3, 17
+; ZVFH32-NEXT: srli a2, a2, 19
+; ZVFH32-NEXT: srli a3, a3, 2
+; ZVFH32-NEXT: or a1, a1, a3
; ZVFH32-NEXT: sw a1, 0(a0)
-; ZVFH32-NEXT: sh a4, 4(a0)
+; ZVFH32-NEXT: sh a2, 4(a0)
; ZVFH32-NEXT: ret
;
; ZVFH64-LABEL: fp2si_v3f32_v3i15:
; ZVFH64: # %bb.0:
; ZVFH64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFH64-NEXT: vfncvt.rtz.x.f.w v9, v8
-; ZVFH64-NEXT: lui a1, 8
-; ZVFH64-NEXT: vmv.x.s a2, v9
-; ZVFH64-NEXT: addiw a1, a1, -1
+; ZVFH64-NEXT: vmv.x.s a1, v9
; ZVFH64-NEXT: vslidedown.vi v8, v9, 1
; ZVFH64-NEXT: vslidedown.vi v9, v9, 2
-; ZVFH64-NEXT: and a2, a2, a1
-; ZVFH64-NEXT: vmv.x.s a3, v8
-; ZVFH64-NEXT: and a1, a3, a1
+; ZVFH64-NEXT: slli a1, a1, 49
+; ZVFH64-NEXT: vmv.x.s a2, v8
; ZVFH64-NEXT: vmv.x.s a3, v9
+; ZVFH64-NEXT: srli a1, a1, 49
+; ZVFH64-NEXT: slli a2, a2, 49
; ZVFH64-NEXT: slli a3, a3, 30
-; ZVFH64-NEXT: slli a1, a1, 15
-; ZVFH64-NEXT: or a2, a2, a3
-; ZVFH64-NEXT: or a1, a2, a1
+; ZVFH64-NEXT: srli a2, a2, 34
+; ZVFH64-NEXT: or a1, a1, a3
+; ZVFH64-NEXT: or a1, a1, a2
; ZVFH64-NEXT: slli a2, a1, 19
; ZVFH64-NEXT: srli a2, a2, 51
; ZVFH64-NEXT: sw a1, 0(a0)
@@ -170,42 +168,40 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) {
; ZVFHMIN32: # %bb.0:
; ZVFHMIN32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFHMIN32-NEXT: vfncvt.rtz.x.f.w v9, v8
-; ZVFHMIN32-NEXT: lui a1, 8
; ZVFHMIN32-NEXT: vslidedown.vi v8, v9, 2
-; ZVFHMIN32-NEXT: vmv.x.s a2, v9
-; ZVFHMIN32-NEXT: addi a1, a1, -1
+; ZVFHMIN32-NEXT: vmv.x.s a1, v9
; ZVFHMIN32-NEXT: vslidedown.vi v9, v9, 1
-; ZVFHMIN32-NEXT: vmv.x.s a3, v8
-; ZVFHMIN32-NEXT: and a2, a2, a1
-; ZVFHMIN32-NEXT: vmv.x.s a4, v9
-; ZVFHMIN32-NEXT: and a1, a4, a1
-; ZVFHMIN32-NEXT: slli a4, a3, 17
-; ZVFHMIN32-NEXT: slli a3, a3, 30
-; ZVFHMIN32-NEXT: srli a4, a4, 19
-; ZVFHMIN32-NEXT: slli a1, a1, 15
-; ZVFHMIN32-NEXT: or a2, a2, a3
-; ZVFHMIN32-NEXT: or a1, a2, a1
+; ZVFHMIN32-NEXT: vmv.x.s a2, v8
+; ZVFHMIN32-NEXT: slli a1, a1, 17
+; ZVFHMIN32-NEXT: srli a1, a1, 17
+; ZVFHMIN32-NEXT: slli a3, a2, 30
+; ZVFHMIN32-NEXT: or a1, a1, a3
+; ZVFHMIN32-NEXT: vmv.x.s a3, v9
+; ZVFHMIN32-NEXT: slli a2, a2, 17
+; ZVFHMIN32-NEXT: slli a3, a3, 17
+; ZVFHMIN32-NEXT: srli a2, a2, 19
+; ZVFHMIN32-NEXT: srli a3, a3, 2
+; ZVFHMIN32-NEXT: or a1, a1, a3
; ZVFHMIN32-NEXT: sw a1, 0(a0)
-; ZVFHMIN32-NEXT: sh a4, 4(a0)
+; ZVFHMIN32-NEXT: sh a2, 4(a0)
; ZVFHMIN32-NEXT: ret
;
; ZVFHMIN64-LABEL: fp2si_v3f32_v3i15:
; ZVFHMIN64: # %bb.0:
; ZVFHMIN64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFHMIN64-NEXT: vfncvt.rtz.x.f.w v9, v8
-; ZVFHMIN64-NEXT: lui a1, 8
-; ZVFHMIN64-NEXT: vmv.x.s a2, v9
-; ZVFHMIN64-NEXT: addiw a1, a1, -1
+; ZVFHMIN64-NEXT: vmv.x.s a1, v9
; ZVFHMIN64-NEXT: vslidedown.vi v8, v9, 1
; ZVFHMIN64-NEXT: vslidedown.vi v9, v9, 2
-; ZVFHMIN64-NEXT: and a2, a2, a1
-; ZVFHMIN64-NEXT: vmv.x.s a3, v8
-; ZVFHMIN64-NEXT: and a1, a3, a1
+; ZVFHMIN64-NEXT: slli a1, a1, 49
+; ZVFHMIN64-NEXT: vmv.x.s a2, v8
; ZVFHMIN64-NEXT: vmv.x.s a3, v9
+; ZVFHMIN64-NEXT: srli a1, a1, 49
+; ZVFHMIN64-NEXT: slli a2, a2, 49
; ZVFHMIN64-NEXT: slli a3, a3, 30
-; ZVFHMIN64-NEXT: slli a1, a1, 15
-; ZVFHMIN64-NEXT: or a2, a2, a3
-; ZVFHMIN64-NEXT: or a1, a2, a1
+; ZVFHMIN64-NEXT: srli a2, a2, 34
+; ZVFHMIN64-NEXT: or a1, a1, a3
+; ZVFHMIN64-NEXT: or a1, a1, a2
; ZVFHMIN64-NEXT: slli a2, a1, 19
; ZVFHMIN64-NEXT: srli a2, a2, 51
; ZVFHMIN64-NEXT: sw a1, 0(a0)
@@ -221,42 +217,40 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) {
; ZVFH32: # %bb.0:
; ZVFH32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFH32-NEXT: vfncvt.rtz.x.f.w v9, v8
-; ZVFH32-NEXT: lui a1, 16
; ZVFH32-NEXT: vslidedown.vi v8, v9, 2
-; ZVFH32-NEXT: vmv.x.s a2, v9
-; ZVFH32-NEXT: addi a1, a1, -1
+; ZVFH32-NEXT: vmv.x.s a1, v9
; ZVFH32-NEXT: vslidedown.vi v9, v9, 1
-; ZVFH32-NEXT: vmv.x.s a3, v8
-; ZVFH32-NEXT: and a2, a2, a1
-; ZVFH32-NEXT: vmv.x.s a4, v9
-; ZVFH32-NEXT: and a1, a4, a1
-; ZVFH32-NEXT: slli a4, a3, 17
-; ZVFH32-NEXT: slli a3, a3, 30
-; ZVFH32-NEXT: srli a4, a4, 19
-; ZVFH32-NEXT: slli a1, a1, 15
-; ZVFH32-NEXT: or a2, a2, a3
-; ZVFH32-NEXT: or a1, a2, a1
+; ZVFH32-NEXT: vmv.x.s a2, v8
+; ZVFH32-NEXT: slli a1, a1, 16
+; ZVFH32-NEXT: srli a1, a1, 16
+; ZVFH32-NEXT: slli a3, a2, 30
+; ZVFH32-NEXT: or a1, a1, a3
+; ZVFH32-NEXT: vmv.x.s a3, v9
+; ZVFH32-NEXT: slli a2, a2, 17
+; ZVFH32-NEXT: slli a3, a3, 16
+; ZVFH32-NEXT: srli a2, a2, 19
+; ZVFH32-NEXT: srli a3, a3, 1
+; ZVFH32-NEXT: or a1, a1, a3
; ZVFH32-NEXT: sw a1, 0(a0)
-; ZVFH32-NEXT: sh a4, 4(a0)
+; ZVFH32-NEXT: sh a2, 4(a0)
; ZVFH32-NEXT: ret
;
; ZVFH64-LABEL: fp2ui_v3f32_v3i15:
; ZVFH64: # %bb.0:
; ZVFH64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFH64-NEXT: vfncvt.rtz.x.f.w v9, v8
-; ZVFH64-NEXT: lui a1, 16
-; ZVFH64-NEXT: vmv.x.s a2, v9
-; ZVFH64-NEXT: addiw a1, a1, -1
+; ZVFH64-NEXT: vmv.x.s a1, v9
; ZVFH64-NEXT: vslidedown.vi v8, v9, 1
; ZVFH64-NEXT: vslidedown.vi v9, v9, 2
-; ZVFH64-NEXT: and a2, a2, a1
-; ZVFH64-NEXT: vmv.x.s a3, v8
-; ZVFH64-NEXT: and a1, a3, a1
+; ZVFH64-NEXT: slli a1, a1, 48
+; ZVFH64-NEXT: vmv.x.s a2, v8
; ZVFH64-NEXT: vmv.x.s a3, v9
+; ZVFH64-NEXT: srli a1, a1, 48
+; ZVFH64-NEXT: slli a2, a2, 48
; ZVFH64-NEXT: slli a3, a3, 30
-; ZVFH64-NEXT: slli a1, a1, 15
-; ZVFH64-NEXT: or a2, a2, a3
-; ZVFH64-NEXT: or a1, a2, a1
+; ZVFH64-NEXT: srli a2, a2, 33
+; ZVFH64-NEXT: or a1, a1, a3
+; ZVFH64-NEXT: or a1, a1, a2
; ZVFH64-NEXT: slli a2, a1, 19
; ZVFH64-NEXT: srli a2, a2, 51
; ZVFH64-NEXT: sw a1, 0(a0)
@@ -267,42 +261,40 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) {
; ZVFHMIN32: # %bb.0:
; ZVFHMIN32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFHMIN32-NEXT: vfncvt.rtz.x.f.w v9, v8
-; ZVFHMIN32-NEXT: lui a1, 16
; ZVFHMIN32-NEXT: vslidedown.vi v8, v9, 2
-; ZVFHMIN32-NEXT: vmv.x.s a2, v9
-; ZVFHMIN32-NEXT: addi a1, a1, -1
+; ZVFHMIN32-NEXT: vmv.x.s a1, v9
; ZVFHMIN32-NEXT: vslidedown.vi v9, v9, 1
-; ZVFHMIN32-NEXT: vmv.x.s a3, v8
-; ZVFHMIN32-NEXT: and a2, a2, a1
-; ZVFHMIN32-NEXT: vmv.x.s a4, v9
-; ZVFHMIN32-NEXT: and a1, a4, a1
-; ZVFHMIN32-NEXT: slli a4, a3, 17
-; ZVFHMIN32-NEXT: slli a3, a3, 30
-; ZVFHMIN32-NEXT: srli a4, a4, 19
-; ZVFHMIN32-NEXT: slli a1, a1, 15
-; ZVFHMIN32-NEXT: or a2, a2, a3
-; ZVFHMIN32-NEXT: or a1, a2, a1
+; ZVFHMIN32-NEXT: vmv.x.s a2, v8
+; ZVFHMIN32-NEXT: slli a1, a1, 16
+; ZVFHMIN32-NEXT: srli a1, a1, 16
+; ZVFHMIN32-NEXT: slli a3, a2, 30
+; ZVFHMIN32-NEXT: or a1, a1, a3
+; ZVFHMIN32-NEXT: vmv.x.s a3, v9
+; ZVFHMIN32-NEXT: slli a2, a2, 17
+; ZVFHMIN32-NEXT: slli a3, a3, 16
+; ZVFHMIN32-NEXT: srli a2, a2, 19
+; ZVFHMIN32-NEXT: srli a3, a3, 1
+; ZVFHMIN32-NEXT: or a1, a1, a3
; ZVFHMIN32-NEXT: sw a1, 0(a0)
-; ZVFHMIN32-NEXT: sh a4, 4(a0)
+; ZVFHMIN32-NEXT: sh a2, 4(a0)
; ZVFHMIN32-NEXT: ret
;
; ZVFHMIN64-LABEL: fp2ui_v3f32_v3i15:
; ZVFHMIN64: # %bb.0:
; ZVFHMIN64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFHMIN64-NEXT: vfncvt.rtz.x.f.w v9, v8
-; ZVFHMIN64-NEXT: lui a1, 16
-; ZVFHMIN64-NEXT: vmv.x.s a2, v9
-; ZVFHMIN64-NEXT: addiw a1, a1, -1
+; ZVFHMIN64-NEXT: vmv.x.s a1, v9
; ZVFHMIN64-NEXT: vslidedown.vi v8, v9, 1
; ZVFHMIN64-NEXT: vslidedown.vi v9, v9, 2
-; ZVFHMIN64-NEXT: and a2, a2, a1
-; ZVFHMIN64-NEXT: vmv.x.s a3, v8
-; ZVFHMIN64-NEXT: and a1, a3, a1
+; ZVFHMIN64-NEXT: slli a1, a1, 48
+; ZVFHMIN64-NEXT: vmv.x.s a2, v8
; ZVFHMIN64-NEXT: vmv.x.s a3, v9
+; ZVFHMIN64-NEXT: srli a1, a1, 48
+; ZVFHMIN64-NEXT: slli a2, a2, 48
; ZVFHMIN64-NEXT: slli a3, a3, 30
-; ZVFHMIN64-NEXT: slli a1, a1, 15
-; ZVFHMIN64-NEXT: or a2, a2, a3
-; ZVFHMIN64-NEXT: or a1, a2, a1
+; ZVFHMIN64-NEXT: srli a2, a2, 33
+; ZVFHMIN64-NEXT: or a1, a1, a3
+; ZVFHMIN64-NEXT: or a1, a1, a2
; ZVFHMIN64-NEXT: slli a2, a1, 19
; ZVFHMIN64-NEXT: srli a2, a2, 51
; ZVFHMIN64-NEXT: sw a1, 0(a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index e9fd0a19e3eb66..139f7b4e6a0c80 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -3296,11 +3296,11 @@ define <4 x i16> @buildvec_v4i16_pack(i16 %e1, i16 %e2, i16 %e3, i16 %e4) {
; RVA22U64-LABEL: buildvec_v4i16_pack:
; RVA22U64: # %bb.0:
; RVA22U64-NEXT: slli a3, a3, 48
-; RVA22U64-NEXT: zext.h a2, a2
+; RVA22U64-NEXT: slli a2, a2, 48
; RVA22U64-NEXT: zext.h a0, a0
-; RVA22U64-NEXT: zext.h a1, a1
-; RVA22U64-NEXT: slli a2, a2, 32
-; RVA22U64-NEXT: slli a1, a1, 16
+; RVA22U64-NEXT: slli a1, a1, 48
+; RVA22U64-NEXT: srli a2, a2, 16
+; RVA22U64-NEXT: srli a1, a1, 32
; RVA22U64-NEXT: or a2, a2, a3
; RVA22U64-NEXT: or a0, a0, a1
; RVA22U64-NEXT: or a0, a0, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 141d54cf585f28..c6e12c52122d27 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -3205,88 +3205,86 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
;
; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8i32:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: lui a1, 16
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v0
-; RV64ZVE32F-NEXT: andi a3, a2, 1
-; RV64ZVE32F-NEXT: addiw a1, a1, -1
-; RV64ZVE32F-NEXT: beqz a3, .LBB40_2
+; RV64ZVE32F-NEXT: vmv.x.s a1, v0
+; RV64ZVE32F-NEXT: andi a2, a1, 1
+; RV64ZVE32F-NEXT: beqz a2, .LBB40_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
-; RV64ZVE32F-NEXT: and a3, a3, a1
-; RV64ZVE32F-NEXT: slli a3, a3, 2
-; RV64ZVE32F-NEXT: add a3, a0, a3
-; RV64ZVE32F-NEXT: lw a3, 0(a3)
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: slli a2, a2, 48
+; RV64ZVE32F-NEXT: srli a2, a2, 46
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v10, a3
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: .LBB40_2: # %else
-; RV64ZVE32F-NEXT: andi a3, a2, 2
-; RV64ZVE32F-NEXT: beqz a3, .LBB40_4
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB40_4
; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v9
-; RV64ZVE32F-NEXT: and a3, a3, a1
-; RV64ZVE32F-NEXT: slli a3, a3, 2
-; RV64ZVE32F-NEXT: add a3, a0, a3
-; RV64ZVE32F-NEXT: lw a3, 0(a3)
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 48
+; RV64ZVE32F-NEXT: srli a2, a2, 46
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v9, a3
+; RV64ZVE32F-NEXT: vmv.s.x v9, a2
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1
; RV64ZVE32F-NEXT: .LBB40_4: # %else2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT: andi a3, a2, 4
+; RV64ZVE32F-NEXT: andi a2, a1, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a3, .LBB40_14
+; RV64ZVE32F-NEXT: bnez a2, .LBB40_14
; RV64ZVE32F-NEXT: # %bb.5: # %else5
-; RV64ZVE32F-NEXT: andi a3, a2, 8
-; RV64ZVE32F-NEXT: bnez a3, .LBB40_15
+; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: bnez a2, .LBB40_15
; RV64ZVE32F-NEXT: .LBB40_6: # %else8
-; RV64ZVE32F-NEXT: andi a3, a2, 16
-; RV64ZVE32F-NEXT: bnez a3, .LBB40_16
+; RV64ZVE32F-NEXT: andi a2, a1, 16
+; RV64ZVE32F-NEXT: bnez a2, .LBB40_16
; RV64ZVE32F-NEXT: .LBB40_7: # %else11
-; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: beqz a3, .LBB40_9
+; RV64ZVE32F-NEXT: andi a2, a1, 32
+; RV64ZVE32F-NEXT: beqz a2, .LBB40_9
; RV64ZVE32F-NEXT: .LBB40_8: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
-; RV64ZVE32F-NEXT: and a3, a3, a1
-; RV64ZVE32F-NEXT: slli a3, a3, 2
-; RV64ZVE32F-NEXT: add a3, a0, a3
-; RV64ZVE32F-NEXT: lw a3, 0(a3)
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: slli a2, a2, 48
+; RV64ZVE32F-NEXT: srli a2, a2, 46
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a3
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
; RV64ZVE32F-NEXT: .LBB40_9: # %else14
-; RV64ZVE32F-NEXT: andi a3, a2, 64
+; RV64ZVE32F-NEXT: andi a2, a1, 64
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz a3, .LBB40_11
+; RV64ZVE32F-NEXT: beqz a2, .LBB40_11
; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
-; RV64ZVE32F-NEXT: and a3, a3, a1
-; RV64ZVE32F-NEXT: slli a3, a3, 2
-; RV64ZVE32F-NEXT: add a3, a0, a3
-; RV64ZVE32F-NEXT: lw a3, 0(a3)
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: slli a2, a2, 48
+; RV64ZVE32F-NEXT: srli a2, a2, 46
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a3
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: .LBB40_11: # %else17
-; RV64ZVE32F-NEXT: andi a2, a2, -128
-; RV64ZVE32F-NEXT: beqz a2, .LBB40_13
+; RV64ZVE32F-NEXT: andi a1, a1, -128
+; RV64ZVE32F-NEXT: beqz a1, .LBB40_13
; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: and a1, a2, a1
-; RV64ZVE32F-NEXT: slli a1, a1, 2
+; RV64ZVE32F-NEXT: vmv.x.s a1, v8
+; RV64ZVE32F-NEXT: slli a1, a1, 48
+; RV64ZVE32F-NEXT: srli a1, a1, 46
; RV64ZVE32F-NEXT: add a0, a0, a1
; RV64ZVE32F-NEXT: lw a0, 0(a0)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
@@ -3298,44 +3296,44 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB40_14: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
-; RV64ZVE32F-NEXT: and a3, a3, a1
-; RV64ZVE32F-NEXT: slli a3, a3, 2
-; RV64ZVE32F-NEXT: add a3, a0, a3
-; RV64ZVE32F-NEXT: lw a3, 0(a3)
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: slli a2, a2, 48
+; RV64ZVE32F-NEXT: srli a2, a2, 46
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a3
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a3, a2, 8
-; RV64ZVE32F-NEXT: beqz a3, .LBB40_6
+; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
Could you also fill in the patch description?
// where C2 has C4 leading zeros and no trailing zeros.
// This is profitable if the "and" was to be lowered to
// (srli (slli X, C4), C4) and not (andi X, C2).
// For "LeadingZeros == 32" we prefer Zba (slli.uw X, C).
In the absence of Zba, is this pattern still profitable? If that's the case, could we predicate this pattern on the availability of Zba?
Also, could you add a test for this case?
It is profitable without Zba. I added the and_0xffffffff_shl_2 test.
It turns out that for LeadingZeros == 32 this transform is applied elsewhere. What's the easiest way to check where this happens?
Perhaps it could be implemented for all cases in one place.
It's done with this tablegen pattern in RISCVInstrInfo.td
// If we're shifting a 32-bit zero extended value left by 0-31 bits, use 2
// shifts instead of 3. This can occur when unsigned is used to index an array.
def : Pat<(i64 (shl (and GPR:$rs1, 0xffffffff), uimm5:$shamt)),
(SRLI (i64 (SLLI GPR:$rs1, 32)), (ImmSubFrom32 uimm5:$shamt))>;
}
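As a hedged illustration of the kind of source that pattern targets (the function and the register names below are illustrative, not taken from a real compile):

/* Illustration only: a 32-bit unsigned index on RV64 is zero-extended,
 * giving (shl (and X, 0xffffffff), 2) for the address computation.
 * The tablegen pattern above selects two shifts, roughly:
 *     slli a1, a1, 32
 *     srli a1, a1, 30
 *     add  a0, a0, a1
 * instead of an slli/srli pair by 32 followed by a third shift. */
const float *elem_addr(const float *p, unsigned i) { return p + i; }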
Thanks! I'll add a comment about that.
Comment added. BTW, there's an existing Zba test that covers this case.
@@ -1041,6 +1041,21 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
          CurDAG->getTargetConstant(TrailingZeros + ShAmt, DL, VT));
      ReplaceNode(Node, SLLI);
      return;
    } else if (TrailingZeros == 0 && LeadingZeros > ShAmt &&
No else after return.
Done
// where C2 has C4 leading zeros and no trailing zeros.
// This is profitable if the "and" was to be lowered to
// (srli (slli X, C4), C4) and not (andi X, C2).
// For "LeadingZeros == 32" we prefer Zba (slli.uw X, C).
Should we be checking hasStdExtZba() for the LeadingZeros != 32 case?
I added the and_0xffffffff_shl_2 test and it looks like we don't, because there's some other code that applies this pattern for LeadingZeros == 32.
How can I find out where this happens?
[RISCV] Optimize (slli (srli (slli X, C1), C1), C2) -> (srli (slli X, C1), C1-C2)
Emits better code for:
float *index(float *p, int i) { return p + (i & (1 << 30) - 1); }
Also added a negative test (keep …).
Don't force push unless you really need to.
Sorry!
Oh. That's probably a good reason to force push.
LGTM
LGTM. Thank you!
Masking out the most significant bits can be done with a shl followed by a srl with the same shift amount. If this is followed by another shl, we can instead srl by a smaller amount of bits.
This transform is already implemented in tablegen for masking out the 32 most significant bits.
Emits better code for, e.g., the index function from the commit message above.
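A hedged sketch of the before/after for that function (instruction sequences are illustrative; exact registers and scheduling may differ):

/* Mask (1 << 30) - 1 = 0x3fffffff has 34 leading zeros on RV64 and no
 * trailing zeros; the float pointer arithmetic adds a shl by 2.
 *
 * Before this patch (the and is lowered to slli+srli, then the shl):
 *     slli a1, a1, 34
 *     srli a1, a1, 34
 *     slli a1, a1, 2
 *     add  a0, a0, a1
 *
 * With this patch the srli shifts right by 2 less, absorbing the shl:
 *     slli a1, a1, 34
 *     srli a1, a1, 32
 *     add  a0, a0, a1
 */
float *index(float *p, int i) { return p + (i & ((1 << 30) - 1)); }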