
Commit 415f899

[X86] Remove single-use checks when combining xor and vfmulc/vcfmulc. (#128910)
The current implementation that combines xor patterns for conjugation with complex multiplies will not perform the transformation when either the conjugate xor result or the other multiplicand has other uses. This change eliminates both single-use checks. The transformation removes the dependence on the xor, so it should be profitable even if the conjugate is used elsewhere, and more profitable still if the xor feeds multiple fmulc/fcmulc instructions and eventually goes dead. The check on the other multiplicand is not required for correctness and has no apparent performance implications.
1 parent f38ce27 commit 415f899
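For context, a minimal LLVM IR sketch of the xor-for-conjugation pattern the combine recognizes (the function name @conj_mul_sketch is hypothetical; the intrinsic and constants are modeled on the new test below): flipping bit 31 of each 32-bit lane negates the sign of the packed fp16 imaginary half, so the xor plus VFMULC pair can be folded into a single VFCMULC.

define <16 x float> @conj_mul_sketch(<16 x i32> %a, <16 x float> %b) {
entry:
  ; Bit 31 of each 32-bit lane is the sign bit of the imaginary fp16 half,
  ; so this xor conjugates the packed complex values.
  %conj.bits = xor <16 x i32> %a, splat (i32 -2147483648)
  %conj = bitcast <16 x i32> %conj.bits to <16 x float>
  ; Complex multiply of the conjugated operand; the combine drops the xor
  ; and selects VFCMULC instead of VFMULC.
  %r = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %conj, <16 x float> %b, <16 x float> zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %r
}

declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)

With this patch the fold also fires when the conjugated value or the other multiplicand has additional uses, as exercised by the new test6 added below.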

2 files changed: +23 −2 lines changed


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
@@ -53633,9 +53633,9 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
   int CombineOpcode =
       N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
   auto combineConjugation = [&](SDValue &r) {
-    if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
+    if (LHS->getOpcode() == ISD::BITCAST) {
       SDValue XOR = LHS.getOperand(0);
-      if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
+      if (XOR->getOpcode() == ISD::XOR) {
         KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
         if (XORRHS.isConstant()) {
           APInt ConjugationInt32 = APInt(32, 0x80000000);

llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll

Lines changed: 21 additions & 0 deletions
@@ -83,6 +83,27 @@ entry:
   ret <32 x half> %3
 }
 
+define dso_local <32 x half> @test6(<16 x i32> %a, <16 x float> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vfcmulcph %zmm0, %zmm1, %zmm3
+; CHECK-NEXT:    vfcmaddcph %zmm0, %zmm2, %zmm3
+; CHECK-NEXT:    vaddph %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %0 = xor <16 x i32> %a, splat (i32 -2147483648)
+  %1 = bitcast <16 x i32> %0 to <16 x float>
+  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> splat (float 1.000000e+00), <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
+  %3 = bitcast <16 x float> %2 to <32 x half>
+  %4 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %1, <16 x float> %b, <16 x float> zeroinitializer, i16 -1, i32 4)
+  %5 = bitcast <16 x float> %4 to <32 x half>
+  %6 = fadd <32 x half> %3, %5
+  %7 = bitcast <16 x float> %b to <32 x half>
+  %8 = fadd <32 x half> %6, %7
+  ret <32 x half> %8
+}
+
 declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
 declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
