Skip to content

Commit 6ca9676

Browse files
committed
[X86] Disable commuting for the first source operand of zero masked scalar fma intrinsic instructions.
I believe this is the correct fix for D75506 rather than disabling all commuting. We can still commute the remaining two sources. Differential Revision: https://reviews.llvm.org/D75526
1 parent a0e8642 commit 6ca9676

File tree

2 files changed

+31
-2
lines changed

2 files changed

+31
-2
lines changed

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1883,7 +1883,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
18831883
unsigned KMaskOp = -1U;
18841884
if (X86II::isKMasked(TSFlags)) {
18851885
// For k-zero-masked operations it is Ok to commute the first vector
1886-
// operand.
1886+
// operand. Unless this is an intrinsic instruction.
18871887
// For regular k-masked operations a conservative choice is done as the
18881888
// elements of the first vector operand, for which the corresponding bit
18891889
// in the k-mask operand is set to 0, are copied to the result of the
@@ -1902,7 +1902,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
19021902

19031903
// The operand with index = 1 is used as a source for those elements for
19041904
// which the corresponding bit in the k-mask is set to 0.
1905-
if (X86II::isKMergeMasked(TSFlags))
1905+
if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
19061906
FirstCommutableVecOp = 3;
19071907

19081908
LastCommutableVecOp++;

llvm/test/CodeGen/X86/avx512-intrinsics.ll

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5818,6 +5818,35 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x flo
58185818
ret <4 x float> %res2
58195819
}
58205820

5821+
; Make sure we don't commute this to fold the load as that source isn't commutable.
5822+
define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_load0(i8 zeroext %0, <4 x float>* nocapture readonly %1, float %2, float %3) {
5823+
; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_load0:
5824+
; X64: # %bb.0:
5825+
; X64-NEXT: vmovaps (%rsi), %xmm2
5826+
; X64-NEXT: kmovw %edi, %k1
5827+
; X64-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm0 * xmm2) + xmm1
5828+
; X64-NEXT: vmovaps %xmm2, %xmm0
5829+
; X64-NEXT: retq
5830+
;
5831+
; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_load0:
5832+
; X86: # %bb.0:
5833+
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
5834+
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5835+
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
5836+
; X86-NEXT: vmovaps (%ecx), %xmm0
5837+
; X86-NEXT: kmovw %eax, %k1
5838+
; X86-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
5839+
; X86-NEXT: retl
5840+
%5 = load <4 x float>, <4 x float>* %1, align 16
5841+
%6 = extractelement <4 x float> %5, i64 0
5842+
%7 = tail call float @llvm.fma.f32(float %6, float %2, float %3) #2
5843+
%8 = bitcast i8 %0 to <8 x i1>
5844+
%9 = extractelement <8 x i1> %8, i64 0
5845+
%10 = select i1 %9, float %7, float 0.000000e+00
5846+
%11 = insertelement <4 x float> %5, float %10, i64 0
5847+
ret <4 x float> %11
5848+
}
5849+
58215850
define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
58225851
; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
58235852
; X64: # %bb.0:

0 commit comments

Comments
 (0)