Enable custom lowering of fabs_v16f16 with AVX and fabs_v32f16 with A… #73565
Conversation
@llvm/pr-subscribers-backend-x86

Author: David Li (david-xl)

Changes: This is the last patch for fabs lowering. With this patch, v32f16 works for AVX as well (via type legalization).

Patch is 138.90 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/73565.diff

2 Files Affected:
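For context: fabs on an IEEE-754 half value reduces to clearing the sign bit (bit 15) of each lane, which is why the custom lowering can emit a single vandps against a per-lane 0x7FFF constant instead of the scalar libcall expansion seen in the old test output. A minimal scalar sketch of that bit trick (an illustration only, not the LLVM lowering code itself):

#include <cstdint>

// IEEE-754 binary16: bit 15 is the sign bit. Clearing it yields |x|,
// so a vector fabs is one bitwise AND with 0x7FFF in every lane.
inline std::uint16_t fabs_f16_bits(std::uint16_t h) {
  return static_cast<std::uint16_t>(h & 0x7FFFu);
}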
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d0e51301945ecb5..7cfd48e283b47aa 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1596,8 +1596,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STORE, VT, Custom);
}
setF16Action(MVT::v16f16, Expand);
- if (Subtarget.hasAVX2())
- setOperationAction(ISD::FABS, MVT::v16f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v16f16, Custom);
setOperationAction(ISD::FADD, MVT::v16f16, Expand);
setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
@@ -2054,6 +2053,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
+ setOperationAction(ISD::FABS, MVT::v32f16, Custom);
}
// This block control legalization of v32i1/v64i1 which are available with
diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll
index 7e915c9ee040764..ec02dfda30c8502 100644
--- a/llvm/test/CodeGen/X86/vec_fabs.ll
+++ b/llvm/test/CodeGen/X86/vec_fabs.ll
@@ -282,235 +282,9 @@ declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
define <16 x half> @fabs_v16f16(ptr %p) {
; X86-AVX1-LABEL: fabs_v16f16:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: pushl %esi
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT: subl $308, %esp # imm = 0x134
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 316
-; X86-AVX1-NEXT: .cfi_offset %esi, -8
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-AVX1-NEXT: vmovdqa (%esi), %xmm0
-; X86-AVX1-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovaps 16(%esi), %xmm1
-; X86-AVX1-NEXT: vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vbroadcastss 4(%esi), %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vbroadcastss 8(%esi), %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vbroadcastss 12(%esi), %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vbroadcastss 20(%esi), %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vbroadcastss 24(%esi), %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vbroadcastss 28(%esi), %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm1, (%esp)
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X86-AVX1-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-AVX1-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX1-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; X86-AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
-; X86-AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X86-AVX1-NEXT: addl $308, %esp # imm = 0x134
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT: popl %esi
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: vmovaps (%eax), %ymm0
+; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: fabs_v16f16:
@@ -529,135 +303,8 @@ define <16 x half> @fabs_v16f16(ptr %p) {
;
; X64-AVX1-LABEL: fabs_v16f16:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: pushq %rbx
-; X64-AVX1-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX1-NEXT: subq $80, %rsp
-; X64-AVX1-NEXT: .cfi_def_cfa_offset 96
-; X64-AVX1-NEXT: .cfi_offset %rbx, -16
-; X64-AVX1-NEXT: movq %rdi, %rbx
-; X64-AVX1-NEXT: vbroadcastss 28(%rdi), %xmm0
-; X64-AVX1-NEXT: callq __extendhfsf2@PLT
-; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2@PLT
-; X64-AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX1-NEXT: vmovaps (%rbx), %xmm0
-; X64-AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX1-NEXT: vmovdqa 16(%rbx), %xmm0
-; X64-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX1-NEXT: callq __extendhfsf2@PLT
-; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2@PLT
-; X64-AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX1-NEXT: vbroadcastss 24(%rbx), %xmm0
-; X64-AVX1-NEXT: callq __extendhfsf2@PLT
-; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2@PLT
-; X64-AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; X64-AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX1-NEXT: callq __extendhfsf2@PLT
-; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2@PLT
-; X64-AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; X64-AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; X64-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX1-NEXT: vbroadcastss 20(%rbx), %xmm0
-; X64-AVX1-NEXT: callq __extendhfsf2@PLT
-; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2@PLT
-; X64-AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; X64-AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __extendhfsf2@PLT
-; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2@PLT
-; X64-AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; X64-AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; X64-AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX1-NEXT: callq __extendhfsf2@PLT
-; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2@PLT
-; X64-AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __extendhfsf2@PLT
-; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2@PLT
-; X64-AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; X64-AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
-; X64-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64...
[truncated]
AVX512 v32f16 still looks very poor
Do you mean the sequence below?

; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax

Compared with the AVX2 version, this one materializes a 512-bit mask and does one vpandq, instead of using a 256-bit mask with two vpandd. If there are things to improve here, we should probably do it as a follow-up.
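For illustration, the two strategies being compared look roughly like the hand-written intrinsics below (a sketch, not the compiler's actual output):

#include <immintrin.h>

// AVX2 strategy: two 256-bit ANDs against a broadcast 0x7FFF mask.
static __m256i clear_sign_16xf16(__m256i v) {
  return _mm256_and_si256(v, _mm256_set1_epi16(0x7FFF));
}

// AVX512 strategy: one 512-bit AND against a materialized 512-bit mask.
static __m512i clear_sign_32xf16(__m512i v) {
  return _mm512_and_si512(v, _mm512_set1_epi16(0x7FFF));
}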
LGTM - cheers