Skip to content

[LoongArch] Lower [x]vshuf.d to [x]vshuf4i.d if possible. #137918

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 30 additions & 20 deletions llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1026,46 +1026,54 @@ static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
SelectionDAG &DAG) {

// When the size is less than 4, lower cost instructions may be used.
if (Mask.size() < 4)
return SDValue();
unsigned SubVecSize = 4;
if (VT == MVT::v2f64 || VT == MVT::v2i64 || VT == MVT::v4f64 ||
VT == MVT::v4i64) {
SubVecSize = 2;
}

int SubMask[4] = {-1, -1, -1, -1};
for (unsigned i = 0; i < 4; ++i) {
for (unsigned j = i; j < Mask.size(); j += 4) {
int Idx = Mask[j];
for (unsigned i = 0; i < SubVecSize; ++i) {
for (unsigned j = i; j < Mask.size(); j += SubVecSize) {
int M = Mask[j];

// Convert from vector index to subvector index (the subvector holds
// SubVecSize elements). If an index refers to an element outside of the
// subvector then give up
if (Idx != -1) {
Idx -= 4 * (j / 4);
if (Idx < 0 || Idx >= 4)
if (M != -1) {
M -= 4 * (j / SubVecSize);
if (M < 0 || M >= 4)
return SDValue();
}

// If the mask has an undef, replace it with the current index.
// Note that it might still be undef if the current index is also undef
if (SubMask[i] == -1)
SubMask[i] = Idx;
SubMask[i] = M;
// Check that non-undef values are the same as in the mask. If they
// aren't then give up
else if (Idx != -1 && Idx != SubMask[i])
else if (M != -1 && M != SubMask[i])
return SDValue();
}
}

// Calculate the immediate. Replace any remaining undefs with zero
APInt Imm(64, 0);
for (int i = 3; i >= 0; --i) {
int Idx = SubMask[i];
for (int i = SubVecSize - 1; i >= 0; --i) {
int M = SubMask[i];

if (Idx == -1)
Idx = 0;
if (M == -1)
M = 0;

Imm <<= 2;
Imm |= Idx & 0x3;
Imm |= M & 0x3;
}

// Return vshuf4i.d / xvshuf4i.d for the 64-bit element types; these
// variants take two source vectors, so both V1 and V2 are passed.
if (VT == MVT::v2f64 || VT == MVT::v2i64 || VT == MVT::v4f64 ||
VT == MVT::v4i64)
return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1, V2,
DAG.getConstant(Imm, DL, MVT::i64));

return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1,
DAG.getConstant(Imm, DL, MVT::i64));
}
Expand Down Expand Up @@ -1389,6 +1397,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return Result;
if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
return Result;
if ((VT.SimpleTy == MVT::v2i64 || VT.SimpleTy == MVT::v2f64) &&
(Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG)))
return Result;
if ((Result = lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG,
Zeroable)))
return Result;
Expand Down Expand Up @@ -1447,10 +1458,6 @@ static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL,
static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
SelectionDAG &DAG) {
// When the size is less than or equal to 4, lower cost instructions may be
// used.
if (Mask.size() <= 4)
return SDValue();
return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG);
}

Expand Down Expand Up @@ -1832,6 +1839,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return Result;
if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG)))
return Result;
if ((VT.SimpleTy == MVT::v4i64 || VT.SimpleTy == MVT::v4f64) &&
(Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG)))
return Result;
if ((Result =
lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG, Zeroable)))
return Result;
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1729,6 +1729,10 @@ def : Pat<(loongarch_vshuf4i v8i32:$xj, immZExt8:$ui8),
(XVSHUF4I_W v8i32:$xj, immZExt8:$ui8)>;
def : Pat<(loongarch_vshuf4i v8f32:$xj, immZExt8:$ui8),
(XVSHUF4I_W v8f32:$xj, immZExt8:$ui8)>;
def : Pat<(loongarch_vshuf4i_d v4i64:$xj, v4i64:$xk, immZExt8:$ui8),
(XVSHUF4I_D v4i64:$xj, v4i64:$xk, immZExt8:$ui8)>;
def : Pat<(loongarch_vshuf4i_d v4f64:$xj, v4f64:$xk, immZExt8:$ui8),
(XVSHUF4I_D v4f64:$xj, v4f64:$xk, immZExt8:$ui8)>;

// XVREPL128VEI_{B/H/W/D}
def : Pat<(loongarch_vreplvei v32i8:$xj, immZExt4:$ui4),
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
def SDT_LoongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0,1>, SDTCisVT<2, i64>]>;
def SDT_LoongArchVShuf4i_D
: SDTypeProfile<1, 3,
[SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
SDTCisVT<3, i64>]>;
def SDT_LoongArchVreplgr2vr : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<0>, SDTCisInt<1>]>;
def SDT_LoongArchVFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
def SDT_LoongArchVFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
Expand Down Expand Up @@ -53,6 +57,8 @@ def loongarch_vilvl: SDNode<"LoongArchISD::VILVL", SDT_LoongArchV2R>;
def loongarch_vilvh: SDNode<"LoongArchISD::VILVH", SDT_LoongArchV2R>;

def loongarch_vshuf4i: SDNode<"LoongArchISD::VSHUF4I", SDT_LoongArchV1RUimm>;
def loongarch_vshuf4i_d
: SDNode<"LoongArchISD::VSHUF4I", SDT_LoongArchVShuf4i_D>;
def loongarch_vreplvei: SDNode<"LoongArchISD::VREPLVEI", SDT_LoongArchV1RUimm>;
def loongarch_vreplgr2vr: SDNode<"LoongArchISD::VREPLGR2VR", SDT_LoongArchVreplgr2vr>;

Expand Down Expand Up @@ -1914,6 +1920,10 @@ def : Pat<(loongarch_vshuf4i v4i32:$vj, immZExt8:$ui8),
(VSHUF4I_W v4i32:$vj, immZExt8:$ui8)>;
def : Pat<(loongarch_vshuf4i v4f32:$vj, immZExt8:$ui8),
(VSHUF4I_W v4f32:$vj, immZExt8:$ui8)>;
def : Pat<(loongarch_vshuf4i_d v2i64:$vj, v2i64:$vk, immZExt8:$ui8),
(VSHUF4I_D v2i64:$vj, v2i64:$vk, immZExt8:$ui8)>;
def : Pat<(loongarch_vshuf4i_d v2f64:$vj, v2f64:$vk, immZExt8:$ui8),
(VSHUF4I_D v2f64:$vj, v2f64:$vk, immZExt8:$ui8)>;

// VREPLVEI_{B/H/W/D}
def : Pat<(loongarch_vreplvei v16i8:$vj, immZExt4:$ui4),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,23 @@ define <8 x float> @shufflevector_xvshuf4i_v8f32(<8 x float> %a, <8 x float> %b)
%c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %c
}

;; xvshuf4i.d
define <4 x i64> @shufflevector_xvshuf4i_v4d64(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: shufflevector_xvshuf4i_v4d64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvshuf4i.d $xr0, $xr1, 9
; CHECK-NEXT: ret
%c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
ret <4 x i64> %c
}

;; xvshuf4i.d
define <4 x double> @shufflevector_xvshuf4i_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: shufflevector_xvshuf4i_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvshuf4i.d $xr0, $xr1, 9
; CHECK-NEXT: ret
%c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
ret <4 x double> %c
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,7 @@ define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: shufflevector_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: vshuf.d $vr2, $vr1, $vr0
; CHECK-NEXT: vori.b $vr0, $vr2, 0
; CHECK-NEXT: vshuf4i.d $vr0, $vr1, 12
; CHECK-NEXT: ret
%c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %c
Expand All @@ -68,10 +65,7 @@ define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) {
define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: shufflevector_v2f64:
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0)
; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI5_0)
; CHECK-NEXT: vshuf.d $vr2, $vr1, $vr0
; CHECK-NEXT: vori.b $vr0, $vr2, 0
; CHECK-NEXT: vshuf4i.d $vr0, $vr1, 12
; CHECK-NEXT: ret
%c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
ret <2 x double> %c
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s

;; vilvh.b
;; vshuf4i.b
define <16 x i8> @shufflevector_vshuf4i_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: shufflevector_vshuf4i_v16i8:
; CHECK: # %bb.0:
Expand All @@ -11,7 +11,7 @@ define <16 x i8> @shufflevector_vshuf4i_v16i8(<16 x i8> %a, <16 x i8> %b) {
ret <16 x i8> %c
}

;; vilvh.h
;; vshuf4i.h
define <8 x i16> @shufflevector_vshuf4i_v8i4(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: shufflevector_vshuf4i_v8i4:
; CHECK: # %bb.0:
Expand All @@ -21,7 +21,7 @@ define <8 x i16> @shufflevector_vshuf4i_v8i4(<8 x i16> %a, <8 x i16> %b) {
ret <8 x i16> %c
}

;; vilvh.w
;; vshuf4i.w
define <4 x i32> @shufflevector_vshuf4i_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: shufflevector_vshuf4i_v4i32:
; CHECK: # %bb.0:
Expand All @@ -31,7 +31,7 @@ define <4 x i32> @shufflevector_vshuf4i_v4i32(<4 x i32> %a, <4 x i32> %b) {
ret <4 x i32> %c
}

;; vilvh.w
;; vshuf4i.w
define <4 x float> @shufflevector_vshuf4i_v4f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: shufflevector_vshuf4i_v4f32:
; CHECK: # %bb.0:
Expand All @@ -40,3 +40,23 @@ define <4 x float> @shufflevector_vshuf4i_v4f32(<4 x float> %a, <4 x float> %b)
%c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x float> %c
}

;; vshuf4i.d
define <2 x i64> @shufflevector_vshuf4i_v2d64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: shufflevector_vshuf4i_v2d64:
; CHECK: # %bb.0:
; CHECK-NEXT: vshuf4i.d $vr0, $vr1, 9
; CHECK-NEXT: ret
%c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
ret <2 x i64> %c
}

;; vshuf4i.d
define <2 x double> @shufflevector_vshuf4i_v2f64(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: shufflevector_vshuf4i_v2f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vshuf4i.d $vr0, $vr1, 9
; CHECK-NEXT: ret
%c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2>
ret <2 x double> %c
}
12 changes: 3 additions & 9 deletions llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-rotate.ll
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,7 @@ define <4 x i32> @byte_rotate_v4i32_3(<4 x i32> %a) nounwind {
define <2 x i64> @byte_rotate_v2i64_1(<2 x i64> %a, <2 x i64> %b) nounwind {
; CHECK-LABEL: byte_rotate_v2i64_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vbsrl.v $vr1, $vr1, 8
; CHECK-NEXT: vbsll.v $vr0, $vr0, 8
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.d $vr0, $vr1, 3
; CHECK-NEXT: ret
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
ret <2 x i64> %shuffle
Expand All @@ -114,9 +112,7 @@ define <2 x i64> @byte_rotate_v2i64_1(<2 x i64> %a, <2 x i64> %b) nounwind {
define <2 x i64> @byte_rotate_v2i64_2(<2 x i64> %a, <2 x i64> %b) nounwind {
; CHECK-LABEL: byte_rotate_v2i64_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vbsrl.v $vr0, $vr0, 8
; CHECK-NEXT: vbsll.v $vr1, $vr1, 8
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
; CHECK-NEXT: vshuf4i.d $vr0, $vr1, 9
; CHECK-NEXT: ret
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
ret <2 x i64> %shuffle
Expand All @@ -125,9 +121,7 @@ define <2 x i64> @byte_rotate_v2i64_2(<2 x i64> %a, <2 x i64> %b) nounwind {
define <2 x i64> @byte_rotate_v2i64_3(<2 x i64> %a) nounwind {
; CHECK-LABEL: byte_rotate_v2i64_3:
; CHECK: # %bb.0:
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vbsll.v $vr0, $vr0, 8
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.d $vr0, $vr0, 1
; CHECK-NEXT: ret
%shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
ret <2 x i64> %shuffle
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,8 @@ define <4 x i32> @shuffle_4i32_vbsrl_v_12(<4 x i32> %a) nounwind {
define <2 x i64> @shuffle_2i64_vbsrl_v_8(<2 x i64> %a) nounwind {
; CHECK-LABEL: shuffle_2i64_vbsrl_v_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vbsrl.v $vr0, $vr0, 8
; CHECK-NEXT: vrepli.b $vr1, 0
; CHECK-NEXT: vshuf4i.d $vr0, $vr1, 9
; CHECK-NEXT: ret
%shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 1, i32 2>
ret <2 x i64> %shuffle
Expand Down
15 changes: 3 additions & 12 deletions llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,7 @@ define <16 x i8> @widen_shuffle_mask_v16i8_to_v4i32(<16 x i8> %a, <16 x i8> %b)
define <16 x i8> @widen_shuffle_mask_v16i8_to_v2i64(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: widen_shuffle_mask_v16i8_to_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0)
; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI2_0)
; CHECK-NEXT: vshuf.d $vr2, $vr1, $vr0
; CHECK-NEXT: vori.b $vr0, $vr2, 0
; CHECK-NEXT: vshuf4i.d $vr0, $vr1, 12
; CHECK-NEXT: ret
%r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <16 x i8> %r
Expand All @@ -52,10 +49,7 @@ define <8 x i16> @widen_shuffle_mask_v8i16_to_v4i32(<8 x i16> %a, <8 x i16> %b)
define <8 x i16> @widen_shuffle_mask_v8i16_to_v2i64(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: widen_shuffle_mask_v8i16_to_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0)
; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI4_0)
; CHECK-NEXT: vshuf.d $vr2, $vr1, $vr0
; CHECK-NEXT: vori.b $vr0, $vr2, 0
; CHECK-NEXT: vshuf4i.d $vr0, $vr1, 12
; CHECK-NEXT: ret
%r = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
ret <8 x i16> %r
Expand All @@ -64,10 +58,7 @@ define <8 x i16> @widen_shuffle_mask_v8i16_to_v2i64(<8 x i16> %a, <8 x i16> %b)
define <4 x i32> @widen_shuffle_mask_v4i32_to_v2i64(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: widen_shuffle_mask_v4i32_to_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0)
; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI5_0)
; CHECK-NEXT: vshuf.d $vr2, $vr1, $vr0
; CHECK-NEXT: vori.b $vr0, $vr2, 0
; CHECK-NEXT: vshuf4i.d $vr0, $vr1, 12
; CHECK-NEXT: ret
%r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
ret <4 x i32> %r
Expand Down
Loading