
Commit 415f899

[X86] Remove single-use checks when combining xor and vfmulc/vcfmulc. (#128910)
The current implementation that combines xor patterns for conjugation with complex multiplies will not perform the transformation when either the conjugate xor result or the other multiplicand has other uses. This change eliminates both single-use checks. The transformation removes the dependence on the xor, so it should be profitable even if the conjugate is used elsewhere, and more profitable still if the xor feeds multiple fmulc/fcmulc instructions and eventually goes dead. The check on the other multiplicand is not required for correctness and has no apparent performance implications.
1 parent f38ce27 commit 415f899
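For context, a minimal LLVM IR sketch of the xor-for-conjugation pattern the combine recognizes (the function name @conj_mul_sketch is hypothetical; the intrinsic and constants are modeled on the new test below): flipping bit 31 of each 32-bit lane negates the sign of the packed fp16 imaginary half, so the xor plus VFMULC pair can be folded into a single VFCMULC.

define <16 x float> @conj_mul_sketch(<16 x i32> %a, <16 x float> %b) {
entry:
  ; Bit 31 of each 32-bit lane is the sign bit of the imaginary fp16 half,
  ; so this xor conjugates the packed complex values.
  %conj.bits = xor <16 x i32> %a, splat (i32 -2147483648)
  %conj = bitcast <16 x i32> %conj.bits to <16 x float>
  ; Complex multiply of the conjugated operand; the combine drops the xor
  ; and selects VFCMULC instead of VFMULC.
  %r = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %conj, <16 x float> %b, <16 x float> zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %r
}

declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)

With this patch the fold also fires when the conjugated value or the other multiplicand has additional uses, as exercised by the new test6 added below.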

2 files changed: +23 −2 lines changed


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
@@ -53633,9 +53633,9 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
   int CombineOpcode =
       N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
   auto combineConjugation = [&](SDValue &r) {
-    if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
+    if (LHS->getOpcode() == ISD::BITCAST) {
       SDValue XOR = LHS.getOperand(0);
-      if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
+      if (XOR->getOpcode() == ISD::XOR) {
         KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
         if (XORRHS.isConstant()) {
           APInt ConjugationInt32 = APInt(32, 0x80000000);

llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll

Lines changed: 21 additions & 0 deletions
@@ -83,6 +83,27 @@ entry:
   ret <32 x half> %3
 }
 
+define dso_local <32 x half> @test6(<16 x i32> %a, <16 x float> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vfcmulcph %zmm0, %zmm1, %zmm3
+; CHECK-NEXT:    vfcmaddcph %zmm0, %zmm2, %zmm3
+; CHECK-NEXT:    vaddph %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %0 = xor <16 x i32> %a, splat (i32 -2147483648)
+  %1 = bitcast <16 x i32> %0 to <16 x float>
+  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> splat (float 1.000000e+00), <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
+  %3 = bitcast <16 x float> %2 to <32 x half>
+  %4 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %1, <16 x float> %b, <16 x float> zeroinitializer, i16 -1, i32 4)
+  %5 = bitcast <16 x float> %4 to <32 x half>
+  %6 = fadd <32 x half> %3, %5
+  %7 = bitcast <16 x float> %b to <32 x half>
+  %8 = fadd <32 x half> %6, %7
+  ret <32 x half> %8
+}
+
 declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
 declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
