-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[X86] Relax vbroadcast(vector load X) -> vbroadcast_load(X) to all types #128039
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
There's no need for a AVX1-only 32/64-bit scalar size limit - if the X86ISD::VBROADCAST node type is supported, X86ISD::VBROADCAST_LOAD will be as well.
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) ChangesThere's no need for a AVX1-only 32/64-bit scalar size limit - if the X86ISD::VBROADCAST node type is supported, X86ISD::VBROADCAST_LOAD will be as well. Patch is 33.77 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128039.diff 3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1c9d43ce4c062..22e4d3fb64fd0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42087,9 +42087,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
}
// vbroadcast(vector load X) -> vbroadcast_load
- if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
- SrcVT == MVT::v4i32) &&
- Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
+ if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src);
// Unless the load is volatile or atomic.
if (LN->isSimple()) {
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index c0afc0cfe2c0a..d617cfb6aedee 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -910,8 +910,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -920,10 +920,10 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
;
; AVX512F-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
@@ -931,10 +931,10 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
;
; AVX512DQ-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
@@ -942,10 +942,10 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
;
; AVX512BW-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
@@ -1906,12 +1906,10 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -1921,12 +1919,10 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -1936,7 +1932,6 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2133,13 +2128,11 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
-; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512F-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -2147,23 +2140,21 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
-; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
-; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2364,13 +2355,11 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
-; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -2378,23 +2367,21 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
-; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
-; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2817,7 +2804,6 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2830,7 +2816,6 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2907,8 +2892,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2],xmm1[3],mem[4,5],xmm1[6],mem[7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2920,26 +2904,24 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
-; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
-; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -3024,7 +3006,6 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
-; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -3037,7 +3018,6 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -3117,8 +3097,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
@@ -3130,9 +3109,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
-; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
@@ -3144,9 +3122,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
-; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[...
[truncated]
|
if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 || | ||
SrcVT == MVT::v4i32) && | ||
Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) { | ||
if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does vNi16/f16 also work?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, we have generic X86VBroadcastld16 patterns that will hit on any i16/f16/bf16 load (AVX2+) - and we shouldn't have introduced a X86ISD::VBROADCAST node on pre-AVX2 targets unless the equivalent X86ISD::VBROADCAST_LOAD is supported.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
There's no need for a AVX1-only 32/64-bit scalar size limit - if the X86ISD::VBROADCAST node type is supported, X86ISD::VBROADCAST_LOAD will be as well.