[X86][CodeGen] Support using NF instructions for flag copy lowering #93508

Merged: 3 commits, merged on Jun 5, 2024
111 changes: 109 additions & 2 deletions llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -65,6 +65,7 @@ STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated");
STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
STATISTIC(NumTestsInserted, "Number of test instructions inserted");
STATISTIC(NumAddsInserted, "Number of adds instructions inserted");
STATISTIC(NumNFsConvertedTo, "Number of NF instructions converted to");

namespace {

@@ -235,6 +236,19 @@ static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
return NewMBB;
}

enum EFLAGSClobber { NoClobber, EvitableClobber, InevitableClobber };

static EFLAGSClobber getClobberType(const MachineInstr &MI) {
const MachineOperand *FlagDef =
MI.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
if (!FlagDef)
return NoClobber;
if (FlagDef->isDead() && X86::getNFVariant(MI.getOpcode()))
return EvitableClobber;

return InevitableClobber;
}

bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
<< " **********\n");
@@ -254,14 +268,107 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
// turn copied again we visit the first one first. This ensures we can find
// viable locations for testing the original EFLAGS that dominate all the
// uses across complex CFGs.
SmallVector<MachineInstr *, 4> Copies;
SmallSetVector<MachineInstr *, 4> Copies;
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
for (MachineBasicBlock *MBB : RPOT)
for (MachineInstr &MI : *MBB)
if (MI.getOpcode() == TargetOpcode::COPY &&
MI.getOperand(0).getReg() == X86::EFLAGS)
Copies.push_back(&MI);
Copies.insert(&MI);

// Try to eliminate the copies by transforming the instructions between the
// copy and the copy's def into the NF (no flags update) variants, e.g.
//
// %1:gr64 = COPY $eflags
// OP1 implicit-def dead $eflags
// $eflags = COPY %1
// OP2 cc, implicit $eflags
//
// ->
//
// OP1_NF
// OP2 implicit $eflags
if (Subtarget->hasNF()) {
SmallSetVector<MachineInstr *, 4> RemovedCopies;
// CopyIIt may be invalidated by removing copies.
auto CopyIIt = Copies.begin(), CopyIEnd = Copies.end();
while (CopyIIt != CopyIEnd) {
auto NCopyIIt = std::next(CopyIIt);
SmallSetVector<MachineInstr *, 4> EvitableClobbers;
MachineInstr *CopyI = *CopyIIt;
MachineOperand &VOp = CopyI->getOperand(1);
MachineInstr *CopyDefI = MRI->getVRegDef(VOp.getReg());
MachineBasicBlock *CopyIMBB = CopyI->getParent();
MachineBasicBlock *CopyDefIMBB = CopyDefI->getParent();
// Walk all basic blocks reachable in depth-first iteration on the inverse
// CFG from CopyIMBB to CopyDefIMBB. These blocks are all the blocks that
// may be executed between the execution of CopyDefIMBB and CopyIMBB. On
// all execution paths, instructions from CopyDefI to CopyI (exclusive)
// have to be NF-convertible if they clobber flags.
for (auto BI = idf_begin(CopyIMBB), BE = idf_end(CopyDefIMBB); BI != BE;
++BI) {
MachineBasicBlock *MBB = *BI;
for (auto I = (MBB != CopyDefIMBB)
? MBB->begin()
: std::next(MachineBasicBlock::iterator(CopyDefI)),
E = (MBB != CopyIMBB) ? MBB->end()
: MachineBasicBlock::iterator(CopyI);
I != E; ++I) {
MachineInstr &MI = *I;
EFLAGSClobber ClobberType = getClobberType(MI);
if (ClobberType == NoClobber)
continue;

if (ClobberType == InevitableClobber)
goto ProcessNextCopyI;

assert(ClobberType == EvitableClobber && "unexpected workflow");
Review comment (Contributor):
Can't this assert fail? I think we cannot transform another `$eflags = COPY %x` in the MBB, at least, right?

Reply (Contributor, Author):
Then it's an InevitableClobber and is handled at line 323.

EvitableClobbers.insert(&MI);
}
}
// Convert evitable clobbers into NF variants and remove the copies.
RemovedCopies.insert(CopyI);
CopyI->eraseFromParent();
if (MRI->use_nodbg_empty(CopyDefI->getOperand(0).getReg())) {
RemovedCopies.insert(CopyDefI);
CopyDefI->eraseFromParent();
}
++NumCopiesEliminated;
for (auto *Clobber : EvitableClobbers) {
unsigned NewOpc = X86::getNFVariant(Clobber->getOpcode());
assert(NewOpc && "evitable clobber must have a NF variant");
Clobber->setDesc(TII->get(NewOpc));
Clobber->removeOperand(
Clobber->findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr)
->getOperandNo());
++NumNFsConvertedTo;
}
// Update liveins for basic blocks in the path
for (auto BI = idf_begin(CopyIMBB), BE = idf_end(CopyDefIMBB); BI != BE;
++BI)
if (*BI != CopyDefIMBB)
BI->addLiveIn(X86::EFLAGS);
ProcessNextCopyI:
CopyIIt = NCopyIIt;
}
Copies.set_subtract(RemovedCopies);
}

// For the rest of copies that cannot be eliminated by NF transform, we use
// setcc to preserve the flags in GPR32 before OP1, and recheck its value
// before using the flags, e.g.
//
// %1:gr64 = COPY $eflags
// OP1 implicit-def dead $eflags
// $eflags = COPY %1
// OP2 cc, implicit $eflags
//
// ->
//
// %1:gr8 = SETCCr cc, implicit $eflags
// OP1 implicit-def dead $eflags
// TEST8rr %1, %1, implicit-def $eflags
// OP2 ne, implicit $eflags
for (MachineInstr *CopyI : Copies) {
MachineBasicBlock &MBB = *CopyI->getParent();

90 changes: 90 additions & 0 deletions llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll
@@ -0,0 +1,90 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+nf | FileCheck %s

define i32 @flag_copy_1(i32 %x, i32 %y, ptr %pz) nounwind {
; CHECK-LABEL: flag_copy_1:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdx, %rcx
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: mull %esi
; CHECK-NEXT: movl (%rcx), %ecx
; CHECK-NEXT: {nf} addl %eax, %ecx
; CHECK-NEXT: cmovol %ecx, %eax
; CHECK-NEXT: retq
%o = tail call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %x, i32 %y)
%v1 = extractvalue { i32, i1 } %o, 1
%v2 = extractvalue { i32, i1 } %o, 0
%z = load i32, ptr %pz
%a = add i32 %v2, %z
%r = select i1 %v1, i32 %a, i32 %v2
ret i32 %r
}

declare <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128>, <2 x i128>)

define <2 x i128> @flag_copy_2(<2 x i128> %x, <2 x i128> %y) nounwind {
; CHECK-LABEL: flag_copy_2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: subq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
; CHECK-NEXT: movq %r8, %rdi
; CHECK-NEXT: {nf} sarq $63, %rdi
; CHECK-NEXT: cmovoq %rdi, %rcx
; CHECK-NEXT: movabsq $-9223372036854775808, %r9 # imm = 0x8000000000000000
; CHECK-NEXT: {nf} xorq %r9, %rdi
; CHECK-NEXT: cmovnoq %r8, %rdi
; CHECK-NEXT: subq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
; CHECK-NEXT: movq %rdx, %r8
; CHECK-NEXT: {nf} sarq $63, %r8
; CHECK-NEXT: cmovoq %r8, %rsi
; CHECK-NEXT: {nf} xorq %r9, %r8
; CHECK-NEXT: cmovnoq %rdx, %r8
; CHECK-NEXT: movq %rcx, 16(%rax)
; CHECK-NEXT: movq %rsi, (%rax)
; CHECK-NEXT: movq %rdi, 24(%rax)
; CHECK-NEXT: movq %r8, 8(%rax)
; CHECK-NEXT: retq
%z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
ret <2 x i128> %z
}

; TODO: Remove the 2nd cmpl by using NF imul.
Review comment (Contributor):
I checked this: it is first done by sinkCmpExpression in CodeGenPrepare.
But even if we don't do the sink, e.g., https://godbolt.org/z/fMbsnrYdo,
we still cannot pass it through EFLAGS.

That means we can always assume the def and use of EFLAGS occur within the same BB, so you don't need to traverse BBs to search for the dead def of EFLAGS; searching inside the BB is enough. Or, if you have concerns, you can add an assert to make sure the def/use of EFLAGS are in the same BB.

Reply (Contributor, Author):
Talked offline with Phoebe. We do need to traverse BBs and, at the same time, update the live-ins of those BBs.

define void @flag_copy_3(i32 %x, i32 %y, ptr %pa, ptr %pb, ptr %pc) nounwind {
; CHECK-LABEL: flag_copy_3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
; CHECK-NEXT: cmpl $2, %edi
; CHECK-NEXT: jl .LBB2_2
; CHECK-NEXT: # %bb.1: # %bb1
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: imull %esi, %eax
; CHECK-NEXT: movl %eax, (%rdx)
; CHECK-NEXT: jmp .LBB2_3
; CHECK-NEXT: .LBB2_2: # %bb2
; CHECK-NEXT: leal -2(%rsi), %eax
; CHECK-NEXT: movl %eax, (%rcx)
; CHECK-NEXT: .LBB2_3: # %bb3
; CHECK-NEXT: cmpl $2, %edi
; CHECK-NEXT: cmovgel %edi, %esi
; CHECK-NEXT: movl %esi, (%r8)
; CHECK-NEXT: retq
entry:
%cmp = icmp sgt i32 %x, 1
br i1 %cmp, label %bb1, label %bb2
bb1:
%add = mul nuw nsw i32 %x, %y
store i32 %add, ptr %pa
br label %bb3

bb2:
%sub = sub nuw nsw i32 %y, 2
store i32 %sub, ptr %pb
br label %bb3

bb3:
%s = select i1 %cmp, i32 %x, i32 %y
store i32 %s, ptr %pc
ret void
}