Skip to content

Commit f738150

Browse files
committed
[X86] Disable lowering constant build vectors to broadcasts on AVX512 targets
On AVX512 targets we're better off keeping constant vectors at full width to ensure that they can be load-folded into vector instructions, reducing register pressure. If a vector constant remains as a basic load, X86FixupVectorConstantsPass will still convert it to a broadcast instruction for us. Non-VLX targets are still seeing some regressions due to these being implicitly widened to 512-bit ops in isel patterns and not in the DAG, so on those targets I've limited this to just 512-bit vectors for now.
1 parent 1d27669 commit f738150

File tree

78 files changed

+33655
-33354
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

78 files changed

+33655
-33354
lines changed

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

Lines changed: 105 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -294,20 +294,56 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
294294
case X86::VMOVUPSZ128rm:
295295
return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm,
296296
X86::VBROADCASTSSZ128rm, 0, 0, 1);
297+
case X86::VMOVAPDZ128rmk:
298+
case X86::VMOVUPDZ128rmk:
299+
return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rmk, 0, 0, 0, 3);
300+
case X86::VMOVAPSZ128rmk:
301+
case X86::VMOVUPSZ128rmk:
302+
return ConvertToBroadcast(0, 0, 0, X86::VBROADCASTSSZ128rmk, 0, 0, 3);
303+
case X86::VMOVAPDZ128rmkz:
304+
case X86::VMOVUPDZ128rmkz:
305+
return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rmkz, 0, 0, 0, 2);
306+
case X86::VMOVAPSZ128rmkz:
307+
case X86::VMOVUPSZ128rmkz:
308+
return ConvertToBroadcast(0, 0, 0, X86::VBROADCASTSSZ128rmkz, 0, 0, 2);
297309
case X86::VMOVAPDZ256rm:
298310
case X86::VMOVAPSZ256rm:
299311
case X86::VMOVUPDZ256rm:
300312
case X86::VMOVUPSZ256rm:
301313
return ConvertToBroadcast(0, X86::VBROADCASTF32X4Z256rm,
302314
X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm,
303315
0, 0, 1);
316+
case X86::VMOVAPDZ256rmk:
317+
case X86::VMOVUPDZ256rmk:
318+
return ConvertToBroadcast(0, 0, X86::VBROADCASTSDZ256rmk, 0, 0, 0, 3);
319+
case X86::VMOVAPSZ256rmk:
320+
case X86::VMOVUPSZ256rmk:
321+
return ConvertToBroadcast(0, 0, 0, X86::VBROADCASTSSZ256rmk, 0, 0, 3);
322+
case X86::VMOVAPDZ256rmkz:
323+
case X86::VMOVUPDZ256rmkz:
324+
return ConvertToBroadcast(0, 0, X86::VBROADCASTSDZ256rmkz, 0, 0, 0, 2);
325+
case X86::VMOVAPSZ256rmkz:
326+
case X86::VMOVUPSZ256rmkz:
327+
return ConvertToBroadcast(0, 0, 0, X86::VBROADCASTSSZ256rmkz, 0, 0, 2);
304328
case X86::VMOVAPDZrm:
305329
case X86::VMOVAPSZrm:
306330
case X86::VMOVUPDZrm:
307331
case X86::VMOVUPSZrm:
308332
return ConvertToBroadcast(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm,
309333
X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0,
310334
1);
335+
case X86::VMOVAPDZrmk:
336+
case X86::VMOVUPDZrmk:
337+
return ConvertToBroadcast(0, 0, X86::VBROADCASTSDZrmk, 0, 0, 0, 3);
338+
case X86::VMOVAPSZrmk:
339+
case X86::VMOVUPSZrmk:
340+
return ConvertToBroadcast(0, 0, 0, X86::VBROADCASTSSZrmk, 0, 0, 3);
341+
case X86::VMOVAPDZrmkz:
342+
case X86::VMOVUPDZrmkz:
343+
return ConvertToBroadcast(0, 0, X86::VBROADCASTSDZrmkz, 0, 0, 0, 2);
344+
case X86::VMOVAPSZrmkz:
345+
case X86::VMOVUPSZrmkz:
346+
return ConvertToBroadcast(0, 0, 0, X86::VBROADCASTSSZrmkz, 0, 0, 2);
311347
/* Integer Loads */
312348
case X86::VMOVDQArm:
313349
case X86::VMOVDQUrm:
@@ -332,6 +368,18 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
332368
X86::VPBROADCASTDZ128rm,
333369
HasBWI ? X86::VPBROADCASTWZ128rm : 0,
334370
HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1);
371+
case X86::VMOVDQA32Z128rmk:
372+
case X86::VMOVDQU32Z128rmk:
373+
return ConvertToBroadcast(0, 0, 0, X86::VPBROADCASTDZ128rmk, 0, 0, 3);
374+
case X86::VMOVDQA32Z128rmkz:
375+
case X86::VMOVDQU32Z128rmkz:
376+
return ConvertToBroadcast(0, 0, 0, X86::VPBROADCASTDZ128rmkz, 0, 0, 2);
377+
case X86::VMOVDQA64Z128rmk:
378+
case X86::VMOVDQU64Z128rmk:
379+
return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rmk, 0, 0, 0, 3);
380+
case X86::VMOVDQA64Z128rmkz:
381+
case X86::VMOVDQU64Z128rmkz:
382+
return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rmkz, 0, 0, 0, 2);
335383
case X86::VMOVDQA32Z256rm:
336384
case X86::VMOVDQA64Z256rm:
337385
case X86::VMOVDQU32Z256rm:
@@ -340,6 +388,24 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
340388
X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm,
341389
HasBWI ? X86::VPBROADCASTWZ256rm : 0,
342390
HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1);
391+
case X86::VMOVDQA32Z256rmk:
392+
case X86::VMOVDQU32Z256rmk:
393+
return ConvertToBroadcast(0, X86::VBROADCASTI32X4Z256rmk,
394+
HasDQI ? X86::VBROADCASTI32X2Z256rmk : 0,
395+
X86::VPBROADCASTDZ256rmk, 0, 0, 3);
396+
case X86::VMOVDQA32Z256rmkz:
397+
case X86::VMOVDQU32Z256rmkz:
398+
return ConvertToBroadcast(0, X86::VBROADCASTI32X4Z256rmkz,
399+
HasDQI ? X86::VBROADCASTI32X2Z256rmkz : 0,
400+
X86::VPBROADCASTDZ256rmkz, 0, 0, 2);
401+
case X86::VMOVDQA64Z256rmk:
402+
case X86::VMOVDQU64Z256rmk:
403+
return ConvertToBroadcast(0, HasDQI ? X86::VBROADCASTI64X2Z128rmk : 0,
404+
X86::VPBROADCASTQZ256rmk, 0, 0, 0, 3);
405+
case X86::VMOVDQA64Z256rmkz:
406+
case X86::VMOVDQU64Z256rmkz:
407+
return ConvertToBroadcast(0, HasDQI ? X86::VBROADCASTI64X2Z128rmkz : 0,
408+
X86::VPBROADCASTQZ256rmkz, 0, 0, 0, 2);
343409
case X86::VMOVDQA32Zrm:
344410
case X86::VMOVDQA64Zrm:
345411
case X86::VMOVDQU32Zrm:
@@ -348,39 +414,62 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
348414
X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
349415
HasBWI ? X86::VPBROADCASTWZrm : 0,
350416
HasBWI ? X86::VPBROADCASTBZrm : 0, 1);
417+
case X86::VMOVDQA32Zrmk:
418+
case X86::VMOVDQU32Zrmk:
419+
return ConvertToBroadcast(
420+
HasDQI ? X86::VBROADCASTI32X8rmk : 0, X86::VBROADCASTI32X4rmk,
421+
HasDQI ? X86::VBROADCASTI32X2Zrmk : 0, X86::VPBROADCASTDZrmk, 0, 0, 3);
422+
case X86::VMOVDQA32Zrmkz:
423+
case X86::VMOVDQU32Zrmkz:
424+
return ConvertToBroadcast(HasDQI ? X86::VBROADCASTI32X8rmkz : 0,
425+
X86::VBROADCASTI32X4rmkz,
426+
HasDQI ? X86::VBROADCASTI32X2Zrmkz : 0,
427+
X86::VPBROADCASTDZrmkz, 0, 0, 2);
428+
case X86::VMOVDQA64Zrmk:
429+
case X86::VMOVDQU64Zrmk:
430+
return ConvertToBroadcast(X86::VBROADCASTI64X4rmk,
431+
HasDQI ? X86::VBROADCASTI64X2rmk : 0,
432+
X86::VPBROADCASTQZrmk, 0, 0, 0, 3);
433+
case X86::VMOVDQA64Zrmkz:
434+
case X86::VMOVDQU64Zrmkz:
435+
return ConvertToBroadcast(X86::VBROADCASTI64X4rmkz,
436+
HasDQI ? X86::VBROADCASTI64X2rmkz : 0,
437+
X86::VPBROADCASTQZrmkz, 0, 0, 0, 2);
351438
}
352439

353-
auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
354-
unsigned OpBcst32 = 0, OpBcst64 = 0;
355-
unsigned OpNoBcst32 = 0, OpNoBcst64 = 0;
440+
auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc16, unsigned OpSrc32,
441+
unsigned OpSrc64) {
442+
if (OpSrc16) {
443+
if (const X86FoldTableEntry *Mem2Bcst =
444+
llvm::lookupBroadcastFoldTable(OpSrc16, 16)) {
445+
if (ConvertToBroadcast(0, 0, 0, 0, Mem2Bcst->DstOp, 0,
446+
Mem2Bcst->Flags & TB_INDEX_MASK))
447+
return true;
448+
}
449+
}
356450
if (OpSrc32) {
357451
if (const X86FoldTableEntry *Mem2Bcst =
358452
llvm::lookupBroadcastFoldTable(OpSrc32, 32)) {
359-
OpBcst32 = Mem2Bcst->DstOp;
360-
OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
453+
if (ConvertToBroadcast(0, 0, 0, Mem2Bcst->DstOp, 0, 0,
454+
Mem2Bcst->Flags & TB_INDEX_MASK))
455+
return true;
361456
}
362457
}
363458
if (OpSrc64) {
364459
if (const X86FoldTableEntry *Mem2Bcst =
365460
llvm::lookupBroadcastFoldTable(OpSrc64, 64)) {
366-
OpBcst64 = Mem2Bcst->DstOp;
367-
OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
461+
if (ConvertToBroadcast(0, 0, Mem2Bcst->DstOp, 0, 0, 0,
462+
Mem2Bcst->Flags & TB_INDEX_MASK))
463+
return true;
368464
}
369465
}
370-
assert(((OpBcst32 == 0) || (OpBcst64 == 0) || (OpNoBcst32 == OpNoBcst64)) &&
371-
"OperandNo mismatch");
372-
373-
if (OpBcst32 || OpBcst64) {
374-
unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
375-
return ConvertToBroadcast(0, 0, OpBcst64, OpBcst32, 0, 0, OpNo);
376-
}
377466
return false;
378467
};
379468

380469
// Attempt to find an AVX512 mapping from a full width memory-fold instruction
381470
// to a broadcast-fold instruction variant.
382471
if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX)
383-
return ConvertToBroadcastAVX512(Opc, Opc);
472+
return ConvertToBroadcastAVX512(Opc, Opc, Opc);
384473

385474
// Reverse the X86InstrInfo::setExecutionDomainCustom EVEX->VEX logic
386475
// conversion to see if we can convert to a broadcasted (integer) logic op.
@@ -437,7 +526,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
437526
break;
438527
}
439528
if (OpSrc32 || OpSrc64)
440-
return ConvertToBroadcastAVX512(OpSrc32, OpSrc64);
529+
return ConvertToBroadcastAVX512(0, OpSrc32, OpSrc64);
441530
}
442531

443532
return false;

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7144,6 +7144,14 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
71447144
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
71457145
"Unsupported vector type for broadcast.");
71467146

7147+
// On AVX512VL targets we're better off keeping the full width constant load
7148+
// and letting X86FixupVectorConstantsPass handle conversion to
7149+
// broadcast/broadcast-fold.
7150+
// AVX512 targets without AVX512VL can do this only for 512-bit vectors.
7151+
if (Subtarget.hasAVX512() && (Subtarget.hasVLX() || VT.is512BitVector()) &&
7152+
BVOp->isConstant())
7153+
return SDValue();
7154+
71477155
// See if the build vector is a repeating sequence of scalars (inc. splat).
71487156
SDValue Ld;
71497157
BitVector UndefElements;

llvm/lib/Target/X86/X86InstrFoldTables.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,8 @@ static bool matchBroadcastSize(const X86FoldTableEntry &Entry,
297297
case TB_BCAST_SS:
298298
case TB_BCAST_D:
299299
return BroadcastBits == 32;
300+
case TB_BCAST_SH:
301+
return BroadcastBits == 16;
300302
}
301303
return false;
302304
}

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1482,10 +1482,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
14821482
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
14831483
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
14841484
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1485-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1486-
; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
1487-
; AVX512F-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm2
1488-
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1485+
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1486+
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
14891487
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
14901488
; AVX512F-NEXT: vzeroupper
14911489
; AVX512F-NEXT: retq
@@ -1497,10 +1495,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
14971495
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
14981496
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
14991497
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1500-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1501-
; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
1502-
; AVX512DQ-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm2
1503-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1498+
; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1499+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
15041500
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
15051501
; AVX512DQ-NEXT: vzeroupper
15061502
; AVX512DQ-NEXT: retq
@@ -3263,10 +3259,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
32633259
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
32643260
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
32653261
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3266-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3267-
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
3268-
; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm3
3269-
; AVX512F-NEXT: vpaddb (%rdx), %ymm3, %ymm1
3262+
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
3263+
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
32703264
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
32713265
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
32723266
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3280,10 +3274,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
32803274
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
32813275
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
32823276
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3283-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3284-
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
3285-
; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm3
3286-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm3, %ymm1
3277+
; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
3278+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
32873279
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
32883280
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
32893281
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1213,21 +1213,19 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
12131213
;
12141214
; AVX512F-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
12151215
; AVX512F: # %bb.0:
1216-
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1]
1217-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1218-
; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
1219-
; AVX512F-NEXT: vpternlogq $172, 32(%rdi), %ymm0, %ymm1
1216+
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1217+
; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = mem[0,1,0,1]
1218+
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
12201219
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
12211220
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
12221221
; AVX512F-NEXT: vzeroupper
12231222
; AVX512F-NEXT: retq
12241223
;
12251224
; AVX512DQ-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
12261225
; AVX512DQ: # %bb.0:
1227-
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1]
1228-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1229-
; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1]
1230-
; AVX512DQ-NEXT: vpternlogq $172, 32(%rdi), %ymm0, %ymm1
1226+
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1227+
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = mem[0,1,0,1]
1228+
; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
12311229
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0
12321230
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
12331231
; AVX512DQ-NEXT: vzeroupper
@@ -2629,10 +2627,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
26292627
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
26302628
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
26312629
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1]
2632-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2633-
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
2634-
; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm0, %ymm3
2635-
; AVX512F-NEXT: vpaddb (%rsi), %ymm3, %ymm0
2630+
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
2631+
; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0
26362632
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
26372633
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
26382634
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -2644,10 +2640,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
26442640
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
26452641
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
26462642
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1]
2647-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2648-
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
2649-
; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm0, %ymm3
2650-
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm3, %ymm0
2643+
; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
2644+
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0
26512645
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
26522646
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
26532647
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3671,10 +3671,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
36713671
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
36723672
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
36733673
; CHECK: # %bb.0:
3674-
; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14]
3675-
; CHECK-NEXT: # xmm2 = mem[0,0]
3676-
; CHECK-NEXT: vmovaps 32(%rdi), %ymm3
3677-
; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm3
3674+
; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3675+
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm3 = [4,14,4,14,4,14,4,14]
3676+
; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
36783677
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
36793678
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
36803679
; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
@@ -3690,12 +3689,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x
36903689
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
36913690
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
36923691
; CHECK: # %bb.0:
3693-
; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14]
3694-
; CHECK-NEXT: # xmm2 = mem[0,0]
3695-
; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
3692+
; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3693+
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4,14,4,14,4,14,4,14]
36963694
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
36973695
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3698-
; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3696+
; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
36993697
; CHECK-NEXT: vmovaps %xmm1, %xmm0
37003698
; CHECK-NEXT: vzeroupper
37013699
; CHECK-NEXT: retq

0 commit comments

Comments
 (0)