-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[PowerPC] Optimize BUILD_VECTOR from load and zeros #126599
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
We encountered patterns like `BUILD_VECTOR 0, 0, (load), 0` that resulted in suboptimal codegen. This PR improves it. Original Patch by: Kai Luo in llvm#73609
@llvm/pr-subscribers-backend-powerpc Author: Kamau Bridgeman (kamaub) Changes: We encountered patterns like `BUILD_VECTOR 0, 0, (load), 0` that resulted in suboptimal codegen. Original Patch by: Kai Luo in #73609 Patch is 25.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/126599.diff 3 Files Affected:
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 8e400bc63b7851a..783044ef02b56fc 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -2431,6 +2431,47 @@ def DblwdCmp {
(v2i64 (XXSPLTW EQWSHAND, 2)), 0));
}
+class SplatAndAssignIndexed<
+ SDPatternOperator op,
+ int Total, dag splat,
+ int Index, dag assign> {
+ defvar head = !listsplat(splat, Index);
+ defvar x = [assign];
+ defvar tail = !listsplat(splat, !sub(!sub(Total, Index), 1));
+ list<dag> Ops = !listconcat(head, x, tail);
+ dag DAG = !foldl((op), Ops, a, b, !con(a, (op b)));
+}
+
+class BVExtLoadAndZerosFP<int Index> : SplatAndAssignIndexed<
+ build_vector,
+ 2, (f64 fpimm0),
+ Index, (f64 (extloadf32 ForceXForm:$src))>;
+
+class BVZExtLoadAndZerosInt<int Index> : SplatAndAssignIndexed<
+ build_vector,
+ 2, (i64 0),
+ Index, (i64 (zextloadi32 ForceXForm:$src))>;
+
+class BVLoadAndZerosInt<int Index> : SplatAndAssignIndexed<
+ build_vector,
+ 4, (i32 0),
+ Index, (i32 (load ForceXForm:$src))>;
+
+class BVLoadAndZerosFP<int Index> : SplatAndAssignIndexed<
+ build_vector,
+ 4, (f32 fpimm0),
+ Index, (f32 (load ForceXForm:$src))>;
+
+class BVLoadAndZerosDbl<int Index> : SplatAndAssignIndexed<
+ build_vector,
+ 2, (f64 fpimm0),
+ Index, (f64 (load ForceXForm:$src))>;
+
+class BVLoadAndZerosLong<int Index> : SplatAndAssignIndexed<
+ build_vector,
+ 2, (i64 0),
+ Index, (i64 (load ForceXForm:$src))>;
+
//---------------------------- Anonymous Patterns ----------------------------//
// Predicate combinations are kept in roughly chronological order in terms of
// instruction availability in the architecture. For example, VSX came in with
@@ -3449,6 +3490,53 @@ foreach Idx = [ [0,3], [2,1], [3,2] ] in {
(STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
sub_64), ForceXForm:$src)>;
}
+
+// BUILD_VECTOR via single load and zeros.
+// Extension load.
+def : Pat<(v2f64 BVExtLoadAndZerosFP<0>.DAG),
+ (v2f64 (COPY_TO_REGCLASS (LXSSPX ForceXForm:$src), VSRC))>;
+def : Pat<(v2f64 BVExtLoadAndZerosFP<1>.DAG),
+ (v2f64 (XXPERMDIs
+ (COPY_TO_REGCLASS (LXSSPX ForceXForm:$src), VSRC), 2))>;
+
+def : Pat<(v2i64 BVZExtLoadAndZerosInt<0>.DAG),
+ (v2i64 (COPY_TO_REGCLASS (LXSIWZX ForceXForm:$src), VSRC))>;
+def : Pat<(v2i64 BVZExtLoadAndZerosInt<1>.DAG),
+ (v2i64 (XXPERMDIs
+ (COPY_TO_REGCLASS (LXSIWZX ForceXForm:$src), VSRC), 2))>;
+
+// Normal load.
+foreach Index = !range(4) in {
+ defvar Temp = !sub(5, Index);
+ defvar Offset = !if(!gt(Temp, 3), !sub(Temp, 4), Temp);
+ if !ne(Offset, 0) then {
+ def : Pat<(v4i32 BVLoadAndZerosInt<Index>.DAG),
+ (v4i32 (XXSLDWIs
+ (COPY_TO_REGCLASS (LXSIWZX ForceXForm:$src), VSRC),
+ Offset))>;
+ def : Pat<(v4f32 BVLoadAndZerosFP<Index>.DAG),
+ (v4f32 (XXSLDWIs
+ (COPY_TO_REGCLASS (LXSIWZX ForceXForm:$src), VSRC),
+ Offset))>;
+ } else {
+ def : Pat<(v4i32 BVLoadAndZerosInt<Index>.DAG),
+ (v4i32 (COPY_TO_REGCLASS (LXSIWZX ForceXForm:$src), VSRC))>;
+ def : Pat<(v4f32 BVLoadAndZerosFP<Index>.DAG),
+ (v4f32 (COPY_TO_REGCLASS (LXSIWZX ForceXForm:$src), VSRC))>;
+ }
+}
+
+def : Pat<(v2f64 BVLoadAndZerosDbl<0>.DAG),
+ (v2f64 (COPY_TO_REGCLASS (LXSDX ForceXForm:$src), VSRC))>;
+def : Pat<(v2f64 BVLoadAndZerosDbl<1>.DAG),
+ (v2f64 (XXPERMDIs
+ (COPY_TO_REGCLASS (LXSDX ForceXForm:$src), VSRC), 2))>;
+
+def : Pat<(v2i64 BVLoadAndZerosLong<0>.DAG),
+ (v2i64 (COPY_TO_REGCLASS (LXSDX ForceXForm:$src), VSRC))>;
+def : Pat<(v2i64 BVLoadAndZerosLong<1>.DAG),
+ (v2i64 (XXPERMDIs
+ (COPY_TO_REGCLASS (LXSDX ForceXForm:$src), VSRC), 2))>;
} // HasVSX, HasP8Vector, IsBigEndian, IsPPC64
// Little endian Power8 VSX subtarget.
@@ -3542,6 +3630,54 @@ foreach Idx = [ [0,2], [1,1], [3,3] ] in {
(STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
sub_64), ForceXForm:$src)>;
}
+
+// BUILD_VECTOR via single load and zeros.
+// Extension load.
+def : Pat<(v2f64 BVExtLoadAndZerosFP<1>.DAG),
+ (v2f64 (COPY_TO_REGCLASS (LXSSPX ForceXForm:$src), VSRC))>;
+def : Pat<(v2f64 BVExtLoadAndZerosFP<0>.DAG),
+ (v2f64 (XXPERMDIs
+ (COPY_TO_REGCLASS (LXSSPX ForceXForm:$src), VSRC), 2))>;
+
+def : Pat<(v2i64 BVZExtLoadAndZerosInt<1>.DAG),
+ (v2i64 (COPY_TO_REGCLASS (LXSIWZX ForceXForm:$src), VSRC))>;
+def : Pat<(v2i64 BVZExtLoadAndZerosInt<0>.DAG),
+ (v2i64 (XXPERMDIs
+ (COPY_TO_REGCLASS (LXSIWZX ForceXForm:$src), VSRC), 2))>;
+
+// Normal load.
+foreach Index = !range(4) in {
+ defvar Temp = !sub(!add(Index, 4), 2);
+ defvar Offset = !if(!gt(Temp, 3), !sub(Temp, 4), Temp);
+ if !ne(Offset, 0) then {
+ def : Pat<(v4i32 BVLoadAndZerosInt<Index>.DAG),
+ (v4i32 (XXSLDWIs
+ (COPY_TO_REGCLASS (LXSIWZX ForceXForm:$src), VSRC),
+ Offset))>;
+ def : Pat<(v4f32 BVLoadAndZerosFP<Index>.DAG),
+ (v4f32 (XXSLDWIs
+ (COPY_TO_REGCLASS (LXSIWZX ForceXForm:$src), VSRC),
+ Offset))>;
+ } else {
+ def : Pat<(v4i32 BVLoadAndZerosInt<Index>.DAG),
+ (v4i32 (COPY_TO_REGCLASS (LXSIWZX ForceXForm:$src), VSRC))>;
+ def : Pat<(v4f32 BVLoadAndZerosFP<Index>.DAG),
+ (v4f32 (COPY_TO_REGCLASS (LXSIWZX ForceXForm:$src), VSRC))>;
+ }
+}
+
+def : Pat<(v2f64 BVLoadAndZerosDbl<1>.DAG),
+ (v2f64 (COPY_TO_REGCLASS (LXSDX ForceXForm:$src), VSRC))>;
+def : Pat<(v2f64 BVLoadAndZerosDbl<0>.DAG),
+ (v2f64 (XXPERMDIs
+ (COPY_TO_REGCLASS (LXSDX ForceXForm:$src), VSRC), 2))>;
+
+def : Pat<(v2i64 BVLoadAndZerosLong<1>.DAG),
+ (v2i64 (COPY_TO_REGCLASS (LXSDX ForceXForm:$src), VSRC))>;
+def : Pat<(v2i64 BVLoadAndZerosLong<0>.DAG),
+ (v2i64 (XXPERMDIs
+ (COPY_TO_REGCLASS (LXSDX ForceXForm:$src), VSRC), 2))>;
+
} // HasVSX, HasP8Vector, IsLittleEndian
// Big endian pre-Power9 VSX subtarget.
diff --git a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
index 2259b6e0f44df64..fb67221e7d9f306 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
@@ -17,11 +17,7 @@ define <2 x i64> @build_v2i64_extload_0(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v2i64_extload_0:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lwz 3, 0(3)
-; PWR8-BE-NEXT: li 4, 0
-; PWR8-BE-NEXT: mtfprd 0, 4
-; PWR8-BE-NEXT: mtfprd 1, 3
-; PWR8-BE-NEXT: xxmrghd 34, 1, 0
+; PWR8-BE-NEXT: lxsiwzx 34, 0, 3
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v2i64_extload_0:
@@ -38,13 +34,8 @@ define <2 x i64> @build_v2i64_extload_0(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v2i64_extload_0:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: lwz 3, 0(3)
-; PWR8-LE-NEXT: li 4, 0
-; PWR8-LE-NEXT: rldimi 3, 4, 32, 0
-; PWR8-LE-NEXT: rldimi 4, 4, 32, 0
-; PWR8-LE-NEXT: mtfprd 0, 3
-; PWR8-LE-NEXT: mtfprd 1, 4
-; PWR8-LE-NEXT: xxmrghd 34, 1, 0
+; PWR8-LE-NEXT: lxsiwzx 0, 0, 3
+; PWR8-LE-NEXT: xxsldwi 34, 0, 0, 2
; PWR8-LE-NEXT: blr
entry:
%0 = load i32, ptr %p, align 4
@@ -66,11 +57,8 @@ define <2 x i64> @build_v2i64_extload_1(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v2i64_extload_1:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lwz 3, 0(3)
-; PWR8-BE-NEXT: li 4, 0
-; PWR8-BE-NEXT: mtfprd 0, 4
-; PWR8-BE-NEXT: mtfprd 1, 3
-; PWR8-BE-NEXT: xxmrghd 34, 0, 1
+; PWR8-BE-NEXT: lxsiwzx 0, 0, 3
+; PWR8-BE-NEXT: xxswapd 34, 0
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v2i64_extload_1:
@@ -86,11 +74,7 @@ define <2 x i64> @build_v2i64_extload_1(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v2i64_extload_1:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: lwz 3, 0(3)
-; PWR8-LE-NEXT: li 4, 0
-; PWR8-LE-NEXT: mtfprd 0, 4
-; PWR8-LE-NEXT: mtfprd 1, 3
-; PWR8-LE-NEXT: xxmrghd 34, 1, 0
+; PWR8-LE-NEXT: lxsiwzx 34, 0, 3
; PWR8-LE-NEXT: blr
entry:
%0 = load i32, ptr %p, align 4
@@ -109,9 +93,7 @@ define <2 x double> @build_v2f64_extload_0(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v2f64_extload_0:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lfs 0, 0(3)
-; PWR8-BE-NEXT: xxlxor 1, 1, 1
-; PWR8-BE-NEXT: xxmrghd 34, 0, 1
+; PWR8-BE-NEXT: lxsspx 34, 0, 3
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v2f64_extload_0:
@@ -123,9 +105,8 @@ define <2 x double> @build_v2f64_extload_0(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v2f64_extload_0:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: lfs 0, 0(3)
-; PWR8-LE-NEXT: xxlxor 1, 1, 1
-; PWR8-LE-NEXT: xxmrghd 34, 1, 0
+; PWR8-LE-NEXT: lxsspx 0, 0, 3
+; PWR8-LE-NEXT: xxswapd 34, 0
; PWR8-LE-NEXT: blr
entry:
%0 = load float, ptr %p, align 4
@@ -144,9 +125,8 @@ define <2 x double> @build_v2f64_extload_1(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v2f64_extload_1:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lfs 0, 0(3)
-; PWR8-BE-NEXT: xxlxor 1, 1, 1
-; PWR8-BE-NEXT: xxmrghd 34, 1, 0
+; PWR8-BE-NEXT: lxsspx 0, 0, 3
+; PWR8-BE-NEXT: xxswapd 34, 0
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v2f64_extload_1:
@@ -158,9 +138,7 @@ define <2 x double> @build_v2f64_extload_1(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v2f64_extload_1:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: lfs 0, 0(3)
-; PWR8-LE-NEXT: xxlxor 1, 1, 1
-; PWR8-LE-NEXT: xxmrghd 34, 0, 1
+; PWR8-LE-NEXT: lxsspx 34, 0, 3
; PWR8-LE-NEXT: blr
entry:
%0 = load float, ptr %p, align 4
@@ -179,9 +157,7 @@ define <2 x double> @build_v2f64_load_0(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v2f64_load_0:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lfd 0, 0(3)
-; PWR8-BE-NEXT: xxlxor 1, 1, 1
-; PWR8-BE-NEXT: xxmrghd 34, 0, 1
+; PWR8-BE-NEXT: lxsdx 34, 0, 3
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v2f64_load_0:
@@ -193,9 +169,8 @@ define <2 x double> @build_v2f64_load_0(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v2f64_load_0:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: lfd 0, 0(3)
-; PWR8-LE-NEXT: xxlxor 1, 1, 1
-; PWR8-LE-NEXT: xxmrghd 34, 1, 0
+; PWR8-LE-NEXT: lxsdx 0, 0, 3
+; PWR8-LE-NEXT: xxswapd 34, 0
; PWR8-LE-NEXT: blr
entry:
%0 = load double, ptr %p, align 8
@@ -213,9 +188,8 @@ define <2 x double> @build_v2f64_load_1(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v2f64_load_1:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lfd 0, 0(3)
-; PWR8-BE-NEXT: xxlxor 1, 1, 1
-; PWR8-BE-NEXT: xxmrghd 34, 1, 0
+; PWR8-BE-NEXT: lxsdx 0, 0, 3
+; PWR8-BE-NEXT: xxswapd 34, 0
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v2f64_load_1:
@@ -227,9 +201,7 @@ define <2 x double> @build_v2f64_load_1(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v2f64_load_1:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: lfd 0, 0(3)
-; PWR8-LE-NEXT: xxlxor 1, 1, 1
-; PWR8-LE-NEXT: xxmrghd 34, 0, 1
+; PWR8-LE-NEXT: lxsdx 34, 0, 3
; PWR8-LE-NEXT: blr
entry:
%0 = load double, ptr %p, align 8
@@ -250,11 +222,7 @@ define <2 x i64> @build_v2i64_load_0(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v2i64_load_0:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: ld 3, 0(3)
-; PWR8-BE-NEXT: li 4, 0
-; PWR8-BE-NEXT: mtfprd 0, 4
-; PWR8-BE-NEXT: mtfprd 1, 3
-; PWR8-BE-NEXT: xxmrghd 34, 1, 0
+; PWR8-BE-NEXT: lxsdx 34, 0, 3
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v2i64_load_0:
@@ -270,11 +238,8 @@ define <2 x i64> @build_v2i64_load_0(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v2i64_load_0:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: ld 3, 0(3)
-; PWR8-LE-NEXT: li 4, 0
-; PWR8-LE-NEXT: mtfprd 0, 4
-; PWR8-LE-NEXT: mtfprd 1, 3
-; PWR8-LE-NEXT: xxmrghd 34, 0, 1
+; PWR8-LE-NEXT: lxsdx 0, 0, 3
+; PWR8-LE-NEXT: xxswapd 34, 0
; PWR8-LE-NEXT: blr
entry:
%0 = load i64, ptr %p, align 8
@@ -295,11 +260,8 @@ define <2 x i64> @build_v2i64_load_1(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v2i64_load_1:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: ld 3, 0(3)
-; PWR8-BE-NEXT: li 4, 0
-; PWR8-BE-NEXT: mtfprd 0, 4
-; PWR8-BE-NEXT: mtfprd 1, 3
-; PWR8-BE-NEXT: xxmrghd 34, 0, 1
+; PWR8-BE-NEXT: lxsdx 0, 0, 3
+; PWR8-BE-NEXT: xxswapd 34, 0
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v2i64_load_1:
@@ -315,11 +277,7 @@ define <2 x i64> @build_v2i64_load_1(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v2i64_load_1:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: ld 3, 0(3)
-; PWR8-LE-NEXT: li 4, 0
-; PWR8-LE-NEXT: mtfprd 0, 4
-; PWR8-LE-NEXT: mtfprd 1, 3
-; PWR8-LE-NEXT: xxmrghd 34, 1, 0
+; PWR8-LE-NEXT: lxsdx 34, 0, 3
; PWR8-LE-NEXT: blr
entry:
%0 = load i64, ptr %p, align 8
@@ -341,14 +299,8 @@ define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v4i32_load_0:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lwz 3, 0(3)
-; PWR8-BE-NEXT: li 4, 0
-; PWR8-BE-NEXT: li 5, 0
-; PWR8-BE-NEXT: rldimi 4, 4, 32, 0
-; PWR8-BE-NEXT: rldimi 5, 3, 32, 0
-; PWR8-BE-NEXT: mtfprd 1, 4
-; PWR8-BE-NEXT: mtfprd 0, 5
-; PWR8-BE-NEXT: xxmrghd 34, 0, 1
+; PWR8-BE-NEXT: lxsiwzx 0, 0, 3
+; PWR8-BE-NEXT: xxsldwi 34, 0, 0, 1
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v4i32_load_0:
@@ -365,13 +317,8 @@ define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v4i32_load_0:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: lwz 3, 0(3)
-; PWR8-LE-NEXT: li 4, 0
-; PWR8-LE-NEXT: rldimi 3, 4, 32, 0
-; PWR8-LE-NEXT: rldimi 4, 4, 32, 0
-; PWR8-LE-NEXT: mtfprd 0, 3
-; PWR8-LE-NEXT: mtfprd 1, 4
-; PWR8-LE-NEXT: xxmrghd 34, 1, 0
+; PWR8-LE-NEXT: lxsiwzx 0, 0, 3
+; PWR8-LE-NEXT: xxsldwi 34, 0, 0, 2
; PWR8-LE-NEXT: blr
entry:
%0 = load i32, ptr %p, align 4
@@ -393,13 +340,7 @@ define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v4i32_load_1:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lwz 3, 0(3)
-; PWR8-BE-NEXT: li 4, 0
-; PWR8-BE-NEXT: rldimi 3, 4, 32, 0
-; PWR8-BE-NEXT: rldimi 4, 4, 32, 0
-; PWR8-BE-NEXT: mtfprd 0, 3
-; PWR8-BE-NEXT: mtfprd 1, 4
-; PWR8-BE-NEXT: xxmrghd 34, 0, 1
+; PWR8-BE-NEXT: lxsiwzx 34, 0, 3
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v4i32_load_1:
@@ -416,14 +357,8 @@ define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v4i32_load_1:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: lwz 3, 0(3)
-; PWR8-LE-NEXT: li 4, 0
-; PWR8-LE-NEXT: li 5, 0
-; PWR8-LE-NEXT: rldimi 4, 4, 32, 0
-; PWR8-LE-NEXT: rldimi 5, 3, 32, 0
-; PWR8-LE-NEXT: mtfprd 1, 4
-; PWR8-LE-NEXT: mtfprd 0, 5
-; PWR8-LE-NEXT: xxmrghd 34, 1, 0
+; PWR8-LE-NEXT: lxsiwzx 0, 0, 3
+; PWR8-LE-NEXT: xxsldwi 34, 0, 0, 3
; PWR8-LE-NEXT: blr
entry:
%0 = load i32, ptr %p, align 4
@@ -445,14 +380,8 @@ define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v4i32_load_2:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lwz 3, 0(3)
-; PWR8-BE-NEXT: li 4, 0
-; PWR8-BE-NEXT: li 5, 0
-; PWR8-BE-NEXT: rldimi 4, 4, 32, 0
-; PWR8-BE-NEXT: rldimi 5, 3, 32, 0
-; PWR8-BE-NEXT: mtfprd 1, 4
-; PWR8-BE-NEXT: mtfprd 0, 5
-; PWR8-BE-NEXT: xxmrghd 34, 1, 0
+; PWR8-BE-NEXT: lxsiwzx 0, 0, 3
+; PWR8-BE-NEXT: xxsldwi 34, 0, 0, 3
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v4i32_load_2:
@@ -469,13 +398,7 @@ define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v4i32_load_2:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: lwz 3, 0(3)
-; PWR8-LE-NEXT: li 4, 0
-; PWR8-LE-NEXT: rldimi 3, 4, 32, 0
-; PWR8-LE-NEXT: rldimi 4, 4, 32, 0
-; PWR8-LE-NEXT: mtfprd 0, 3
-; PWR8-LE-NEXT: mtfprd 1, 4
-; PWR8-LE-NEXT: xxmrghd 34, 0, 1
+; PWR8-LE-NEXT: lxsiwzx 34, 0, 3
; PWR8-LE-NEXT: blr
entry:
%0 = load i32, ptr %p, align 4
@@ -497,13 +420,8 @@ define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v4i32_load_3:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lwz 3, 0(3)
-; PWR8-BE-NEXT: li 4, 0
-; PWR8-BE-NEXT: rldimi 3, 4, 32, 0
-; PWR8-BE-NEXT: rldimi 4, 4, 32, 0
-; PWR8-BE-NEXT: mtfprd 0, 3
-; PWR8-BE-NEXT: mtfprd 1, 4
-; PWR8-BE-NEXT: xxmrghd 34, 1, 0
+; PWR8-BE-NEXT: lxsiwzx 0, 0, 3
+; PWR8-BE-NEXT: xxsldwi 34, 0, 0, 2
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v4i32_load_3:
@@ -520,14 +438,8 @@ define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v4i32_load_3:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: lwz 3, 0(3)
-; PWR8-LE-NEXT: li 4, 0
-; PWR8-LE-NEXT: li 5, 0
-; PWR8-LE-NEXT: rldimi 4, 4, 32, 0
-; PWR8-LE-NEXT: rldimi 5, 3, 32, 0
-; PWR8-LE-NEXT: mtfprd 1, 4
-; PWR8-LE-NEXT: mtfprd 0, 5
-; PWR8-LE-NEXT: xxmrghd 34, 0, 1
+; PWR8-LE-NEXT: lxsiwzx 0, 0, 3
+; PWR8-LE-NEXT: xxsldwi 34, 0, 0, 1
; PWR8-LE-NEXT: blr
entry:
%0 = load i32, ptr %p, align 4
@@ -554,13 +466,8 @@ define <4 x float> @build_v4f32_load_0(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v4f32_load_0:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lfs 0, 0(3)
-; PWR8-BE-NEXT: xxlxor 1, 1, 1
-; PWR8-BE-NEXT: xxmrghd 0, 0, 1
-; PWR8-BE-NEXT: xxspltd 1, 1, 0
-; PWR8-BE-NEXT: xvcvdpsp 34, 0
-; PWR8-BE-NEXT: xvcvdpsp 35, 1
-; PWR8-BE-NEXT: vmrgew 2, 2, 3
+; PWR8-BE-NEXT: lxsiwzx 0, 0, 3
+; PWR8-BE-NEXT: xxsldwi 34, 0, 0, 1
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v4f32_load_0:
@@ -584,13 +491,8 @@ define <4 x float> @build_v4f32_load_0(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v4f32_load_0:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: lfs 0, 0(3)
-; PWR8-LE-NEXT: xxlxor 1, 1, 1
-; PWR8-LE-NEXT: xxmrghd 0, 1, 0
-; PWR8-LE-NEXT: xxspltd 1, 1, 0
-; PWR8-LE-NEXT: xvcvdpsp 34, 0
-; PWR8-LE-NEXT: xvcvdpsp 35, 1
-; PWR8-LE-NEXT: vmrgew 2, 3, 2
+; PWR8-LE-NEXT: lxsiwzx 0, 0, 3
+; PWR8-LE-NEXT: xxsldwi 34, 0, 0, 2
; PWR8-LE-NEXT: blr
entry:
%0 = load float, ptr %p, align 4
@@ -617,13 +519,7 @@ define <4 x float> @build_v4f32_load_1(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v4f32_load_1:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lfs 0, 0(3)
-; PWR8-BE-NEXT: xxlxor 1, 1, 1
-; PWR8-BE-NEXT: xxmrghd 0, 0, 1
-; PWR8-BE-NEXT: xxspltd 1, 1, 0
-; PWR8-BE-NEXT: xvcvdpsp 34, 0
-; PWR8-BE-NEXT: xvcvdpsp 35, 1
-; PWR8-BE-NEXT: vmrgew 2, 3, 2
+; PWR8-BE-NEXT: lxsiwzx 34, 0, 3
; PWR8-BE-NEXT: blr
;
; PWR7-LE-LABEL: build_v4f32_load_1:
@@ -647,13 +543,8 @@ define <4 x float> @build_v4f32_load_1(ptr nocapture noundef readonly %p) {
;
; PWR8-LE-LABEL: build_v4f32_load_1:
; PWR8-LE: # %bb.0: # %entry
-; PWR8-LE-NEXT: lfs 0, 0(3)
-; PWR8-LE-NEXT: xxlxor 1, 1, 1
-; PWR8-LE-NEXT: xxmrghd 0, 1, 0
-; PWR8-LE-NEXT: xxspltd 1, 1, 0
-; PWR8-LE-NEXT: xvcvdpsp 34, 0
-; PWR8-LE-NEXT: xvcvdpsp 35, 1
-; PWR8-LE-NEXT: vmrgew 2, 2, 3
+; PWR8-LE-NEXT: lxsiwzx 0, 0, 3
+; PWR8-LE-NEXT: xxsldwi 34, 0, 0, 3
; PWR8-LE-NEXT: blr
entry:
%0 = load float, ptr %p, align 4
@@ -680,13 +571,8 @@ define <4 x float> @build_v4f32_load_2(ptr nocapture noundef readonly %p) {
;
; PWR8-BE-LABEL: build_v4f32_load_2:
; PWR8-BE: # %bb.0: # %entry
-; PWR8-BE-NEXT: lfs 0, 0(3)
-; PW...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This LGTM. Just added the original author and reviewers to ensure their comments have been addressed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The pattern seems somewhat obscure, but it clearly appears in some important code if it warrants a patch like this.
Overall, I think this is a nice patch, but I wonder if it could be much simpler if we address this in a DAG combine and add a custom PPCISD node.
What I mean is that we add a node for example PPCISD::VEC_INS_VAL_IN_ZEROS val, idx
where we would detect that this pattern inserts a single value into a vector while zeroing out the remaining values. This would simplify the matching code in the td file and make it more obvious if there are any holes. For example, this can probably improve even for situations where the val
is already in a register - see https://godbolt.org/z/PP4WY6996
My code generation suggestions there certainly seem simpler, but TBH, I haven't looked at the UM to see if they improve latency.
P.S. The definition and uses of SplatAndAssignIndexed
are nice demonstrations of your tblgen-fu! :)
I agree with @nemanjai , the patterns might be difficult to maintain in the future. An intuitive solution is preferred. |
We encountered patterns like
BUILD_VECTOR 0, 0, (load), 0
that resulted in suboptimal codegen. This PR improves it.
Original Patch by: Kai Luo in #73609