Skip to content

Commit e499ae5

Browse files
authored
[X86][BF16] Support INSERT_SUBVECTOR and CONCAT_VECTORS (#76485)
1 parent 13cdee9 commit e499ae5

File tree

4 files changed

+69
-35
lines changed

4 files changed

+69
-35
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2267,6 +2267,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
22672267
setOperationAction(ISD::FDIV, VT, Expand);
22682268
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
22692269
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2270+
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
2271+
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
22702272
}
22712273
setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
22722274
addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
@@ -2282,6 +2284,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
22822284
setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
22832285
setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
22842286
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
2287+
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32bf16, Legal);
2288+
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32bf16, Custom);
22852289
}
22862290

22872291
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7909,6 +7909,9 @@ let Predicates = [HasAVX2, NoVLX] in {
79097909
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
79107910
}
79117911

7912+
let Predicates = [HasAVXNECONVERT, NoVLX] in
7913+
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16, v16bf16, loadv8bf16, loadv16bf16>;
7914+
79127915
//===----------------------------------------------------------------------===//
79137916
// VEXTRACTI128 - Extract packed integer values
79147917
//
@@ -7931,6 +7934,9 @@ let Predicates = [HasAVX2, NoVLX] in {
79317934
defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
79327935
}
79337936

7937+
let Predicates = [HasAVXNECONVERT, NoVLX] in
7938+
defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>;
7939+
79347940
//===----------------------------------------------------------------------===//
79357941
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
79367942
//

llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll

Lines changed: 5 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -372,31 +372,11 @@ entry:
372372

373373
;; FIXME: This should generate the same output as above, but let's fix the crash first.
374374
define <16 x bfloat> @test_no_vbroadcast2() nounwind {
375-
; X86-LABEL: test_no_vbroadcast2:
376-
; X86: # %bb.0: # %entry
377-
; X86-NEXT: pushl %ebp # encoding: [0x55]
378-
; X86-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5]
379-
; X86-NEXT: andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
380-
; X86-NEXT: subl $64, %esp # encoding: [0x83,0xec,0x40]
381-
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
382-
; X86-NEXT: vmovaps %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
383-
; X86-NEXT: vpbroadcastw (%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
384-
; X86-NEXT: movl %ebp, %esp # encoding: [0x89,0xec]
385-
; X86-NEXT: popl %ebp # encoding: [0x5d]
386-
; X86-NEXT: retl # encoding: [0xc3]
387-
;
388-
; X64-LABEL: test_no_vbroadcast2:
389-
; X64: # %bb.0: # %entry
390-
; X64-NEXT: pushq %rbp # encoding: [0x55]
391-
; X64-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5]
392-
; X64-NEXT: andq $-32, %rsp # encoding: [0x48,0x83,0xe4,0xe0]
393-
; X64-NEXT: subq $64, %rsp # encoding: [0x48,0x83,0xec,0x40]
394-
; X64-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
395-
; X64-NEXT: vmovaps %xmm0, (%rsp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
396-
; X64-NEXT: vpbroadcastw (%rsp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
397-
; X64-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec]
398-
; X64-NEXT: popq %rbp # encoding: [0x5d]
399-
; X64-NEXT: retq # encoding: [0xc3]
375+
; CHECK-LABEL: test_no_vbroadcast2:
376+
; CHECK: # %bb.0: # %entry
377+
; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
378+
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xc0]
379+
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
400380
entry:
401381
%0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
402382
%1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer

llvm/test/CodeGen/X86/bfloat.ll

Lines changed: 54 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2212,17 +2212,10 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
22122212
;
22132213
; AVXNC-LABEL: fptrunc_v16f32:
22142214
; AVXNC: # %bb.0:
2215-
; AVXNC-NEXT: pushq %rbp
2216-
; AVXNC-NEXT: movq %rsp, %rbp
2217-
; AVXNC-NEXT: andq $-32, %rsp
2218-
; AVXNC-NEXT: subq $64, %rsp
2219-
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
2220-
; AVXNC-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
22212215
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
2222-
; AVXNC-NEXT: vmovaps %xmm0, (%rsp)
2223-
; AVXNC-NEXT: vmovaps (%rsp), %ymm0
2224-
; AVXNC-NEXT: movq %rbp, %rsp
2225-
; AVXNC-NEXT: popq %rbp
2216+
; AVXNC-NEXT: vinsertf128 $0, %xmm0, %ymm0, %ymm0
2217+
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
2218+
; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
22262219
; AVXNC-NEXT: retq
22272220
%b = fptrunc <16 x float> %a to <16 x bfloat>
22282221
ret <16 x bfloat> %b
@@ -2485,3 +2478,54 @@ define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
24852478
%3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
24862479
ret <32 x bfloat> %3
24872480
}
2481+
2482+
define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
2483+
; SSE2-LABEL: concat_v8bf16:
2484+
; SSE2: # %bb.0:
2485+
; SSE2-NEXT: retq
2486+
;
2487+
; AVX-LABEL: concat_v8bf16:
2488+
; AVX: # %bb.0:
2489+
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2490+
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2491+
; AVX-NEXT: retq
2492+
%a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2493+
ret <16 x bfloat> %a
2494+
}
2495+
2496+
define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) {
2497+
; SSE2-LABEL: extract_v32bf16_v8bf16:
2498+
; SSE2: # %bb.0:
2499+
; SSE2-NEXT: pextrw $0, %xmm1, %eax
2500+
; SSE2-NEXT: pextrw $1, %xmm1, %ecx
2501+
; SSE2-NEXT: shll $16, %ecx
2502+
; SSE2-NEXT: orl %eax, %ecx
2503+
; SSE2-NEXT: pextrw $2, %xmm1, %eax
2504+
; SSE2-NEXT: pextrw $3, %xmm1, %edx
2505+
; SSE2-NEXT: shll $16, %edx
2506+
; SSE2-NEXT: orl %eax, %edx
2507+
; SSE2-NEXT: shlq $32, %rdx
2508+
; SSE2-NEXT: orq %rcx, %rdx
2509+
; SSE2-NEXT: pextrw $4, %xmm1, %eax
2510+
; SSE2-NEXT: pextrw $5, %xmm1, %ecx
2511+
; SSE2-NEXT: shll $16, %ecx
2512+
; SSE2-NEXT: orl %eax, %ecx
2513+
; SSE2-NEXT: pextrw $6, %xmm1, %eax
2514+
; SSE2-NEXT: pextrw $7, %xmm1, %esi
2515+
; SSE2-NEXT: shll $16, %esi
2516+
; SSE2-NEXT: orl %eax, %esi
2517+
; SSE2-NEXT: shlq $32, %rsi
2518+
; SSE2-NEXT: orq %rcx, %rsi
2519+
; SSE2-NEXT: movq %rsi, %xmm1
2520+
; SSE2-NEXT: movq %rdx, %xmm0
2521+
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2522+
; SSE2-NEXT: retq
2523+
;
2524+
; AVX-LABEL: extract_v32bf16_v8bf16:
2525+
; AVX: # %bb.0:
2526+
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
2527+
; AVX-NEXT: vzeroupper
2528+
; AVX-NEXT: retq
2529+
%a = shufflevector <32 x bfloat> %x, <32 x bfloat> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2530+
ret <8 x bfloat> %a
2531+
}

0 commit comments

Comments
 (0)