Skip to content

[PowerPC] Improve pwr7 codegen for v4i8 load #104507

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11490,13 +11490,33 @@ SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op,
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);

MachineFunction &MF = DAG.getMachineFunction();
SDValue Op0 = Op.getOperand(0);
ReuseLoadInfo RLI;
if (Subtarget.hasLFIWAX() && Subtarget.hasVSX() &&
Op.getValueType() == MVT::v4i32 && Op0.getOpcode() == ISD::LOAD &&
Op0.getValueType() == MVT::i32 && Op0.hasOneUse() &&
canReuseLoadAddress(Op0, MVT::i32, RLI, DAG, ISD::NON_EXTLOAD)) {

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like it should be lowered to PPCISD::LD_SPLAT instead of being expanded here. BUILD_VECTOR is handled the same way.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea.

MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = {RLI.Chain, RLI.Ptr, DAG.getValueType(Op.getValueType())};
SDValue Bits = DAG.getMemIntrinsicNode(
PPCISD::LD_SPLAT, dl, DAG.getVTList(MVT::v4i32, MVT::Other), Ops,
MVT::i32, MMO);
spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
return Bits.getValue(0);
}

// Create a stack slot that is 16-byte aligned.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

SDValue Val = Op.getOperand(0);
SDValue Val = Op0;
EVT ValVT = Val.getValueType();
// P10 hardware store forwarding requires that a single store contains all
// the data for the load. P10 is able to merge a pair of adjacent stores. Try
Expand Down
119 changes: 44 additions & 75 deletions llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,17 @@ define <2 x i64> @build_v2i64_extload_0(ptr nocapture noundef readonly %p) {
; PWR7-LE-LABEL: build_v2i64_extload_0:
; PWR7-LE: # %bb.0: # %entry
; PWR7-LE-NEXT: li 4, 0
; PWR7-LE-NEXT: lwz 3, 0(3)
; PWR7-LE-NEXT: stw 4, -16(1)
; PWR7-LE-NEXT: addis 4, 2, .LCPI0_0@toc@ha
; PWR7-LE-NEXT: lfiwzx 0, 0, 3
; PWR7-LE-NEXT: addi 3, 1, -16
; PWR7-LE-NEXT: addi 4, 4, .LCPI0_0@toc@l
; PWR7-LE-NEXT: stw 3, -32(1)
; PWR7-LE-NEXT: addi 3, 1, -32
; PWR7-LE-NEXT: lxvd2x 0, 0, 4
; PWR7-LE-NEXT: addi 4, 1, -16
; PWR7-LE-NEXT: lxvd2x 1, 0, 4
; PWR7-LE-NEXT: xxswapd 34, 0
; PWR7-LE-NEXT: xxspltw 35, 0, 1
; PWR7-LE-NEXT: lxvd2x 0, 0, 3
; PWR7-LE-NEXT: xxswapd 35, 1
; PWR7-LE-NEXT: xxswapd 34, 1
; PWR7-LE-NEXT: xxswapd 36, 0
; PWR7-LE-NEXT: vperm 2, 3, 4, 2
; PWR7-LE-NEXT: vperm 2, 4, 3, 2
; PWR7-LE-NEXT: blr
;
; PWR8-LE-LABEL: build_v2i64_extload_0:
Expand Down Expand Up @@ -337,17 +334,13 @@ entry:
define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_0:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
; PWR7-BE-NEXT: xxlxor 36, 36, 36
; PWR7-BE-NEXT: sldi 3, 3, 32
; PWR7-BE-NEXT: std 3, -32(1)
; PWR7-BE-NEXT: std 3, -24(1)
; PWR7-BE-NEXT: lfiwzx 0, 0, 3
; PWR7-BE-NEXT: addis 3, 2, .LCPI8_0@toc@ha
; PWR7-BE-NEXT: xxlxor 36, 36, 36
; PWR7-BE-NEXT: addi 3, 3, .LCPI8_0@toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -32
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: vperm 2, 3, 4, 2
; PWR7-BE-NEXT: xxspltw 34, 0, 1
; PWR7-BE-NEXT: vperm 2, 2, 4, 3
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_0:
Expand All @@ -365,20 +358,17 @@ define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) {
; PWR7-LE-LABEL: build_v4i32_load_0:
; PWR7-LE: # %bb.0: # %entry
; PWR7-LE-NEXT: li 4, 0
; PWR7-LE-NEXT: lwz 3, 0(3)
; PWR7-LE-NEXT: stw 4, -16(1)
; PWR7-LE-NEXT: addis 4, 2, .LCPI8_0@toc@ha
; PWR7-LE-NEXT: lfiwzx 0, 0, 3
; PWR7-LE-NEXT: addi 3, 1, -16
; PWR7-LE-NEXT: addi 4, 4, .LCPI8_0@toc@l
; PWR7-LE-NEXT: stw 3, -32(1)
; PWR7-LE-NEXT: addi 3, 1, -32
; PWR7-LE-NEXT: lxvd2x 0, 0, 4
; PWR7-LE-NEXT: addi 4, 1, -16
; PWR7-LE-NEXT: lxvd2x 1, 0, 4
; PWR7-LE-NEXT: xxswapd 34, 0
; PWR7-LE-NEXT: xxspltw 35, 0, 1
; PWR7-LE-NEXT: lxvd2x 0, 0, 3
; PWR7-LE-NEXT: xxswapd 35, 1
; PWR7-LE-NEXT: xxswapd 34, 1
; PWR7-LE-NEXT: xxswapd 36, 0
; PWR7-LE-NEXT: vperm 2, 3, 4, 2
; PWR7-LE-NEXT: vperm 2, 4, 3, 2
; PWR7-LE-NEXT: blr
;
; PWR8-LE-LABEL: build_v4i32_load_0:
Expand All @@ -400,17 +390,13 @@ entry:
define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_1:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
; PWR7-BE-NEXT: xxlxor 36, 36, 36
; PWR7-BE-NEXT: sldi 3, 3, 32
; PWR7-BE-NEXT: std 3, -16(1)
; PWR7-BE-NEXT: std 3, -8(1)
; PWR7-BE-NEXT: lfiwzx 0, 0, 3
; PWR7-BE-NEXT: addis 3, 2, .LCPI9_0@toc@ha
; PWR7-BE-NEXT: xxlxor 36, 36, 36
; PWR7-BE-NEXT: addi 3, 3, .LCPI9_0@toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -16
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: vperm 2, 4, 3, 2
; PWR7-BE-NEXT: xxspltw 34, 0, 1
; PWR7-BE-NEXT: vperm 2, 4, 2, 3
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_1:
Expand All @@ -427,20 +413,17 @@ define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) {
; PWR7-LE-LABEL: build_v4i32_load_1:
; PWR7-LE: # %bb.0: # %entry
; PWR7-LE-NEXT: li 4, 0
; PWR7-LE-NEXT: lwz 3, 0(3)
; PWR7-LE-NEXT: stw 4, -32(1)
; PWR7-LE-NEXT: stw 4, -16(1)
; PWR7-LE-NEXT: addis 4, 2, .LCPI9_0@toc@ha
; PWR7-LE-NEXT: addi 4, 4, .LCPI9_0@toc@l
; PWR7-LE-NEXT: stw 3, -16(1)
; PWR7-LE-NEXT: lfiwzx 0, 0, 3
; PWR7-LE-NEXT: addi 3, 1, -16
; PWR7-LE-NEXT: lxvd2x 0, 0, 4
; PWR7-LE-NEXT: addi 4, 1, -32
; PWR7-LE-NEXT: addi 4, 4, .LCPI9_0@toc@l
; PWR7-LE-NEXT: lxvd2x 1, 0, 4
; PWR7-LE-NEXT: xxswapd 34, 0
; PWR7-LE-NEXT: xxspltw 35, 0, 1
; PWR7-LE-NEXT: lxvd2x 0, 0, 3
; PWR7-LE-NEXT: xxswapd 35, 1
; PWR7-LE-NEXT: xxswapd 34, 1
; PWR7-LE-NEXT: xxswapd 36, 0
; PWR7-LE-NEXT: vperm 2, 4, 3, 2
; PWR7-LE-NEXT: vperm 2, 3, 4, 2
; PWR7-LE-NEXT: blr
;
; PWR8-LE-LABEL: build_v4i32_load_1:
Expand All @@ -463,17 +446,13 @@ entry:
define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_2:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
; PWR7-BE-NEXT: xxlxor 36, 36, 36
; PWR7-BE-NEXT: sldi 3, 3, 32
; PWR7-BE-NEXT: std 3, -16(1)
; PWR7-BE-NEXT: std 3, -8(1)
; PWR7-BE-NEXT: lfiwzx 0, 0, 3
; PWR7-BE-NEXT: addis 3, 2, .LCPI10_0@toc@ha
; PWR7-BE-NEXT: xxlxor 36, 36, 36
; PWR7-BE-NEXT: addi 3, 3, .LCPI10_0@toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -16
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: vperm 2, 4, 3, 2
; PWR7-BE-NEXT: xxspltw 34, 0, 1
; PWR7-BE-NEXT: vperm 2, 4, 2, 3
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_2:
Expand All @@ -491,20 +470,17 @@ define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) {
; PWR7-LE-LABEL: build_v4i32_load_2:
; PWR7-LE: # %bb.0: # %entry
; PWR7-LE-NEXT: li 4, 0
; PWR7-LE-NEXT: lwz 3, 0(3)
; PWR7-LE-NEXT: stw 4, -32(1)
; PWR7-LE-NEXT: stw 4, -16(1)
; PWR7-LE-NEXT: addis 4, 2, .LCPI10_0@toc@ha
; PWR7-LE-NEXT: addi 4, 4, .LCPI10_0@toc@l
; PWR7-LE-NEXT: stw 3, -16(1)
; PWR7-LE-NEXT: lfiwzx 0, 0, 3
; PWR7-LE-NEXT: addi 3, 1, -16
; PWR7-LE-NEXT: lxvd2x 0, 0, 4
; PWR7-LE-NEXT: addi 4, 1, -32
; PWR7-LE-NEXT: addi 4, 4, .LCPI10_0@toc@l
; PWR7-LE-NEXT: lxvd2x 1, 0, 4
; PWR7-LE-NEXT: xxswapd 34, 0
; PWR7-LE-NEXT: xxspltw 35, 0, 1
; PWR7-LE-NEXT: lxvd2x 0, 0, 3
; PWR7-LE-NEXT: xxswapd 35, 1
; PWR7-LE-NEXT: xxswapd 34, 1
; PWR7-LE-NEXT: xxswapd 36, 0
; PWR7-LE-NEXT: vperm 2, 4, 3, 2
; PWR7-LE-NEXT: vperm 2, 3, 4, 2
; PWR7-LE-NEXT: blr
;
; PWR8-LE-LABEL: build_v4i32_load_2:
Expand All @@ -526,17 +502,13 @@ entry:
define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_3:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
; PWR7-BE-NEXT: xxlxor 36, 36, 36
; PWR7-BE-NEXT: sldi 3, 3, 32
; PWR7-BE-NEXT: std 3, -16(1)
; PWR7-BE-NEXT: std 3, -8(1)
; PWR7-BE-NEXT: lfiwzx 0, 0, 3
; PWR7-BE-NEXT: addis 3, 2, .LCPI11_0@toc@ha
; PWR7-BE-NEXT: xxlxor 36, 36, 36
; PWR7-BE-NEXT: addi 3, 3, .LCPI11_0@toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -16
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: vperm 2, 4, 3, 2
; PWR7-BE-NEXT: xxspltw 34, 0, 1
; PWR7-BE-NEXT: vperm 2, 4, 2, 3
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_3:
Expand All @@ -553,20 +525,17 @@ define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) {
; PWR7-LE-LABEL: build_v4i32_load_3:
; PWR7-LE: # %bb.0: # %entry
; PWR7-LE-NEXT: li 4, 0
; PWR7-LE-NEXT: lwz 3, 0(3)
; PWR7-LE-NEXT: stw 4, -32(1)
; PWR7-LE-NEXT: stw 4, -16(1)
; PWR7-LE-NEXT: addis 4, 2, .LCPI11_0@toc@ha
; PWR7-LE-NEXT: addi 4, 4, .LCPI11_0@toc@l
; PWR7-LE-NEXT: stw 3, -16(1)
; PWR7-LE-NEXT: lfiwzx 0, 0, 3
; PWR7-LE-NEXT: addi 3, 1, -16
; PWR7-LE-NEXT: lxvd2x 0, 0, 4
; PWR7-LE-NEXT: addi 4, 1, -32
; PWR7-LE-NEXT: addi 4, 4, .LCPI11_0@toc@l
; PWR7-LE-NEXT: lxvd2x 1, 0, 4
; PWR7-LE-NEXT: xxswapd 34, 0
; PWR7-LE-NEXT: xxspltw 35, 0, 1
; PWR7-LE-NEXT: lxvd2x 0, 0, 3
; PWR7-LE-NEXT: xxswapd 35, 1
; PWR7-LE-NEXT: xxswapd 34, 1
; PWR7-LE-NEXT: xxswapd 36, 0
; PWR7-LE-NEXT: vperm 2, 4, 3, 2
; PWR7-LE-NEXT: vperm 2, 3, 4, 2
; PWR7-LE-NEXT: blr
;
; PWR8-LE-LABEL: build_v4i32_load_3:
Expand Down
53 changes: 21 additions & 32 deletions llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
Original file line number Diff line number Diff line change
Expand Up @@ -536,15 +536,12 @@ define dso_local <8 x i16> @testmrglb3(ptr nocapture readonly %a) local_unnamed_
;
; P8-AIX-32-LABEL: testmrglb3:
; P8-AIX-32: # %bb.0: # %entry
; P8-AIX-32-NEXT: lwz r4, 4(r3)
; P8-AIX-32-NEXT: li r4, 4
; P8-AIX-32-NEXT: lfiwzx f1, 0, r3
; P8-AIX-32-NEXT: xxlxor v3, v3, v3
; P8-AIX-32-NEXT: stw r4, -16(r1)
; P8-AIX-32-NEXT: lwz r3, 0(r3)
; P8-AIX-32-NEXT: stw r3, -32(r1)
; P8-AIX-32-NEXT: addi r3, r1, -16
; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3
; P8-AIX-32-NEXT: addi r3, r1, -32
; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3
; P8-AIX-32-NEXT: lfiwzx f0, r3, r4
; P8-AIX-32-NEXT: xxspltw vs1, vs1, 1
; P8-AIX-32-NEXT: xxspltw vs0, vs0, 1
; P8-AIX-32-NEXT: xxmrghw v2, vs1, vs0
; P8-AIX-32-NEXT: vmrghb v2, v3, v2
; P8-AIX-32-NEXT: blr
Expand Down Expand Up @@ -852,17 +849,15 @@ define dso_local <16 x i8> @no_RAUW_in_combine_during_legalize(ptr nocapture rea
;
; P8-AIX-32-LABEL: no_RAUW_in_combine_during_legalize:
; P8-AIX-32: # %bb.0: # %entry
; P8-AIX-32-NEXT: li r5, 0
; P8-AIX-32-NEXT: slwi r4, r4, 2
; P8-AIX-32-NEXT: xxlxor v3, v3, v3
; P8-AIX-32-NEXT: lwzx r3, r3, r4
; P8-AIX-32-NEXT: li r4, 0
; P8-AIX-32-NEXT: stw r4, -32(r1)
; P8-AIX-32-NEXT: stw r3, -16(r1)
; P8-AIX-32-NEXT: addi r3, r1, -32
; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3
; P8-AIX-32-NEXT: stw r5, -16(r1)
; P8-AIX-32-NEXT: lfiwzx f0, r3, r4
; P8-AIX-32-NEXT: addi r3, r1, -16
; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3
; P8-AIX-32-NEXT: xxmrghw v2, vs0, vs1
; P8-AIX-32-NEXT: xxspltw vs0, vs0, 1
; P8-AIX-32-NEXT: xxmrghw v2, vs1, vs0
; P8-AIX-32-NEXT: vmrghb v2, v2, v3
; P8-AIX-32-NEXT: blr
entry:
Expand Down Expand Up @@ -1026,14 +1021,11 @@ define dso_local <2 x i64> @testSplat8(ptr nocapture readonly %ptr) local_unname
;
; P8-AIX-32-LABEL: testSplat8:
; P8-AIX-32: # %bb.0: # %entry
; P8-AIX-32-NEXT: lwz r4, 4(r3)
; P8-AIX-32-NEXT: stw r4, -16(r1)
; P8-AIX-32-NEXT: lwz r3, 0(r3)
; P8-AIX-32-NEXT: stw r3, -32(r1)
; P8-AIX-32-NEXT: addi r3, r1, -16
; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3
; P8-AIX-32-NEXT: addi r3, r1, -32
; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3
; P8-AIX-32-NEXT: li r4, 4
; P8-AIX-32-NEXT: lfiwzx f1, 0, r3
; P8-AIX-32-NEXT: lfiwzx f0, r3, r4
; P8-AIX-32-NEXT: xxspltw vs1, vs1, 1
; P8-AIX-32-NEXT: xxspltw vs0, vs0, 1
; P8-AIX-32-NEXT: xxmrghw vs0, vs1, vs0
; P8-AIX-32-NEXT: xxmrghd v2, vs0, vs0
; P8-AIX-32-NEXT: blr
Expand Down Expand Up @@ -1081,17 +1073,14 @@ define <2 x i64> @testSplati64_0(ptr nocapture readonly %ptr) #0 {
;
; P8-AIX-32-LABEL: testSplati64_0:
; P8-AIX-32: # %bb.0: # %entry
; P8-AIX-32-NEXT: lwz r4, 0(r3)
; P8-AIX-32-NEXT: lwz r3, 4(r3)
; P8-AIX-32-NEXT: stw r3, -16(r1)
; P8-AIX-32-NEXT: li r4, 4
; P8-AIX-32-NEXT: lfiwzx f0, r3, r4
; P8-AIX-32-NEXT: xxspltw v2, vs0, 1
; P8-AIX-32-NEXT: lfiwzx f0, 0, r3
; P8-AIX-32-NEXT: lwz r3, L..C3(r2) # %const.0
; P8-AIX-32-NEXT: stw r4, -32(r1)
; P8-AIX-32-NEXT: lxvw4x v2, 0, r3
; P8-AIX-32-NEXT: addi r3, r1, -16
; P8-AIX-32-NEXT: lxvw4x v3, 0, r3
; P8-AIX-32-NEXT: addi r3, r1, -32
; P8-AIX-32-NEXT: lxvw4x v4, 0, r3
; P8-AIX-32-NEXT: vperm v2, v4, v3, v2
; P8-AIX-32-NEXT: xxspltw v3, vs0, 1
; P8-AIX-32-NEXT: vperm v2, v3, v2, v4
; P8-AIX-32-NEXT: blr
entry:
%0 = load <1 x i64>, ptr %ptr, align 8
Expand Down
Loading
Loading