Commit ed55fef
[AMDGPU] Promote uniform ops to I32 in ISel
Promote uniform binops, selects, and setcc in GlobalISel and DAGISel instead of in AMDGPUCodeGenPrepare (CGP). Solves #64591
1 parent 9d14c13 commit ed55fef
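For context, the net effect on a uniform 16-bit operation is roughly the sketch below: the operands are extended to i32, the arithmetic happens at 32 bits, and the result is truncated back to i16. This is an illustrative SelectionDAG-style sketch only; the helper name is hypothetical, and the commit performs the promotion inside the selectors (RegBank combine rules and the isNarrowingProfitable hook), not through a helper like this.

// Hypothetical illustration of the i16 -> i32 promotion shape for a uniform
// add; not code from this commit.
static SDValue promoteUniformI16Add(SelectionDAG &DAG, const SDLoc &DL,
                                    SDValue LHS, SDValue RHS) {
  // Zero-extend the 16-bit operands to 32 bits (the GlobalISel apply below
  // also uses G_ZEXT for G_ADD).
  SDValue L32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, LHS);
  SDValue R32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RHS);
  // Do the arithmetic at 32 bits, where uniform values live on the scalar ALU.
  SDValue Add32 = DAG.getNode(ISD::ADD, DL, MVT::i32, L32, R32);
  // Truncate back to the original 16-bit type for the existing users.
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Add32);
}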

86 files changed (+10,437 / -11,559 lines)


llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 1 addition & 1 deletion
@@ -3299,7 +3299,7 @@ class TargetLoweringBase {
   /// Return true if it's profitable to narrow operations of type SrcVT to
   /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
   /// i32 to i16.
-  virtual bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+  virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const {
     return false;
   }

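The new SDNode * parameter lets a target base the narrowing decision on the specific node (opcode, divergence) rather than on the types alone. As a sketch of how an override can use it (the target class and the policy shown are hypothetical, not part of this commit):

// Hypothetical override of the updated hook; MyTargetLowering is an assumed
// example class, not from this commit.
bool MyTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                             EVT DestVT) const {
  // Keep uniform adds at 32 bits instead of narrowing them back below i32.
  if (N->getOpcode() == ISD::ADD && !N->isDivergent() &&
      DestVT.getScalarSizeInBits() < 32)
    return false;
  // Otherwise fall back to a type-only rule.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}
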
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 10 additions & 9 deletions
@@ -7031,7 +7031,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
         TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
         TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
-        TLI.isNarrowingProfitable(VT, SrcVT))
+        TLI.isNarrowingProfitable(N, VT, SrcVT))
       return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
                          DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
                                      DAG.getZExtOrTrunc(N1, DL, SrcVT)));
@@ -14574,7 +14574,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
   // ShLeftAmt will indicate how much a narrowed load should be shifted left.
   unsigned ShLeftAmt = 0;
   if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
-      ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
+      ExtVT == VT && TLI.isNarrowingProfitable(N, N0.getValueType(), VT)) {
     if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
       ShLeftAmt = N01->getZExtValue();
       N0 = N0.getOperand(0);
@@ -15118,9 +15118,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   }

   // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
-  if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
-    if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
-        TLI.isTruncateFree(SrcVT, VT)) {
+  if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse() &&
+      TLI.isTruncateFree(SrcVT, VT)) {
+    if (!LegalOperations ||
+        (TLI.isOperationLegal(ISD::SELECT, SrcVT) &&
+         TLI.isNarrowingProfitable(N0.getNode(), N0.getValueType(), VT))) {
       SDLoc SL(N0);
       SDValue Cond = N0.getOperand(0);
       SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
@@ -20061,10 +20063,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     // The narrowing should be profitable, the load/store operation should be
     // legal (or custom) and the store size should be equal to the NewVT width.
-    while (NewBW < BitWidth &&
-           (NewVT.getStoreSizeInBits() != NewBW ||
-            !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
-            !TLI.isNarrowingProfitable(VT, NewVT))) {
+    while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
+                                !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+                                !TLI.isNarrowingProfitable(N, VT, NewVT))) {
       NewBW = NextPowerOf2(NewBW);
       NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     }

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 6 additions & 4 deletions
@@ -1841,7 +1841,7 @@ bool TargetLowering::SimplifyDemandedBits(
       for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
            SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
         EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
-        if (isNarrowingProfitable(VT, SmallVT) &&
+        if (isNarrowingProfitable(Op.getNode(), VT, SmallVT) &&
             isTypeDesirableForOp(ISD::SHL, SmallVT) &&
             isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
             (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
@@ -1865,7 +1865,7 @@ bool TargetLowering::SimplifyDemandedBits(
     if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth &&
         DemandedBits.countLeadingOnes() >= HalfWidth) {
       EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth);
-      if (isNarrowingProfitable(VT, HalfVT) &&
+      if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
          isTypeDesirableForOp(ISD::SHL, HalfVT) &&
          isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
          (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, HalfVT))) {
@@ -1984,7 +1984,7 @@ bool TargetLowering::SimplifyDemandedBits(
     if ((BitWidth % 2) == 0 && !VT.isVector()) {
       APInt HiBits = APInt::getHighBitsSet(BitWidth, BitWidth / 2);
       EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2);
-      if (isNarrowingProfitable(VT, HalfVT) &&
+      if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
          isTypeDesirableForOp(ISD::SRL, HalfVT) &&
          isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
          (!TLO.LegalOperations() || isOperationLegal(ISD::SRL, HalfVT)) &&
@@ -4762,9 +4762,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
     case ISD::SETULT:
     case ISD::SETULE: {
       EVT newVT = N0.getOperand(0).getValueType();
+      // FIXME: Should use isNarrowingProfitable.
       if (DCI.isBeforeLegalizeOps() ||
           (isOperationLegal(ISD::SETCC, newVT) &&
-           isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
+           isCondCodeLegal(Cond, newVT.getSimpleVT()) &&
+           isTypeDesirableForOp(ISD::SETCC, newVT))) {
         EVT NewSetCCVT = getSetCCResultType(Layout, *DAG.getContext(), newVT);
         SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 4 additions & 4 deletions
@@ -46,10 +46,10 @@ static cl::opt<bool> WidenLoads(
     cl::init(false));

 static cl::opt<bool> Widen16BitOps(
-    "amdgpu-codegenprepare-widen-16-bit-ops",
-    cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
-    cl::ReallyHidden,
-    cl::init(true));
+    "amdgpu-codegenprepare-widen-16-bit-ops",
+    cl::desc(
+        "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
+    cl::ReallyHidden, cl::init(false));

 static cl::opt<bool>
     BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

Lines changed: 27 additions & 1 deletion
@@ -145,6 +145,31 @@ def expand_promoted_fmed3 : GICombineRule<

 } // End Predicates = [NotHasMed3_16]

+def promote_i16_uniform_binops_frag : GICombinePatFrag<
+  (outs root:$dst), (ins),
+  !foreach(op, [G_ADD, G_SUB, G_SHL, G_ASHR, G_LSHR, G_AND, G_XOR, G_OR, G_MUL],
+           (pattern (op i16:$dst, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_binops : GICombineRule<
+  (defs root:$dst),
+  (match (promote_i16_uniform_binops_frag i16:$dst):$mi,
+         [{ return matchPromote16to32(*${mi}); }]),
+  (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+def promote_i16_uniform_ternary_frag : GICombinePatFrag<
+  (outs root:$dst), (ins),
+  !foreach(op, [G_ICMP, G_SELECT],
+           (pattern (op i16:$dst, $first, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_ternary : GICombineRule<
+  (defs root:$dst),
+  (match (promote_i16_uniform_ternary_frag i16:$dst):$mi,
+         [{ return matchPromote16to32(*${mi}); }]),
+  (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+
 // Combines which should only apply on SI/CI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
@@ -169,5 +194,6 @@ def AMDGPURegBankCombiner : GICombiner<
   "AMDGPURegBankCombinerImpl",
   [unmerge_merge, unmerge_cst, unmerge_undef,
    zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
-   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+   promote_i16_uniform_binops, promote_i16_uniform_ternary]> {
 }

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 33 additions & 2 deletions
@@ -1017,14 +1017,45 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
   return Src == MVT::i32 && Dest == MVT::i64;
 }

-bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
+                                                 EVT DestVT) const {
+  switch (N->getOpcode()) {
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::MUL:
+  case ISD::SETCC:
+  case ISD::SELECT:
+    if (Subtarget->has16BitInsts() &&
+        (DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) {
+      // Don't narrow back down to i16 if promoted to i32 already.
+      if (!N->isDivergent() && DestVT.isInteger() &&
+          DestVT.getScalarSizeInBits() > 1 &&
+          DestVT.getScalarSizeInBits() <= 16 &&
+          SrcVT.getScalarSizeInBits() > 16) {
+        return false;
+      }
+    }
+    return true;
+  default:
+    break;
+  }
+
   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
   // limited number of native 64-bit operations. Shrinking an operation to fit
   // in a single 32-bit register should always be helpful. As currently used,
   // this is much less general than the name suggests, and is only used in
   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
   // not profitable, and may actually be harmful.
-  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+  if (isa<LoadSDNode>(N))
+    return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+
+  return true;
 }

 bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 1 addition & 1 deletion
@@ -201,7 +201,7 @@ class AMDGPUTargetLowering : public TargetLowering {
                                        NegatibleCost &Cost,
                                        unsigned Depth) const override;

-  bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
+  bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;

   bool isDesirableToCommuteWithShift(const SDNode *N,
                                      CombineLevel Level) const override;

llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp

Lines changed: 113 additions & 0 deletions
@@ -89,6 +89,9 @@ class AMDGPURegBankCombinerImpl : public Combiner {
   void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
   void applyClamp(MachineInstr &MI, Register &Reg) const;

+  bool matchPromote16to32(MachineInstr &MI) const;
+  void applyPromote16to32(MachineInstr &MI) const;
+
 private:
   SIModeRegisterDefaults getMode() const;
   bool getIEEE() const;
@@ -348,6 +351,116 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
   return false;
 }

+bool AMDGPURegBankCombinerImpl::matchPromote16to32(MachineInstr &MI) const {
+  Register Dst = MI.getOperand(0).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  const auto *RB = MRI.getRegBankOrNull(Dst);
+
+  // Only promote uniform instructions.
+  if (RB->getID() != AMDGPU::SGPRRegBankID)
+    return false;
+
+  // Promote only if:
+  //    - We have 16 bit insts (not true 16 bit insts).
+  //    - We don't have packed instructions (for vector types only).
+  // TODO: For vector types, the set of packed operations is more limited, so
+  // may want to promote some anyway.
+  return STI.has16BitInsts() &&
+         (DstTy.isVector() ? !STI.hasVOP3PInsts() : true);
+}
+
+static unsigned getExtOpcodeForPromotedOp(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::G_ASHR:
+    return AMDGPU::G_SEXT;
+  case AMDGPU::G_ADD:
+  case AMDGPU::G_SUB:
+  case AMDGPU::G_FSHR:
+    return AMDGPU::G_ZEXT;
+  case AMDGPU::G_AND:
+  case AMDGPU::G_OR:
+  case AMDGPU::G_XOR:
+  case AMDGPU::G_SHL:
+  case AMDGPU::G_SELECT:
+  case AMDGPU::G_MUL:
+    // operation result won't be influenced by garbage high bits.
+    // TODO: are all of those cases correct, and are there more?
+    return AMDGPU::G_ANYEXT;
+  case AMDGPU::G_ICMP: {
+    return CmpInst::isSigned(cast<GICmp>(MI).getCond()) ? AMDGPU::G_SEXT
+                                                        : AMDGPU::G_ZEXT;
+  }
+  default:
+    llvm_unreachable("unexpected opcode!");
+  }
+}
+
+void AMDGPURegBankCombinerImpl::applyPromote16to32(MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  assert(Opc == AMDGPU::G_ADD || Opc == AMDGPU::G_SUB || Opc == AMDGPU::G_SHL ||
+         Opc == AMDGPU::G_LSHR || Opc == AMDGPU::G_ASHR ||
+         Opc == AMDGPU::G_AND || Opc == AMDGPU::G_OR || Opc == AMDGPU::G_XOR ||
+         Opc == AMDGPU::G_MUL || Opc == AMDGPU::G_SELECT ||
+         Opc == AMDGPU::G_ICMP);
+
+  Register Dst = MI.getOperand(0).getReg();
+
+  bool IsSelectOrCmp = (Opc == AMDGPU::G_SELECT || Opc == AMDGPU::G_ICMP);
+  Register LHS = MI.getOperand(IsSelectOrCmp + 1).getReg();
+  Register RHS = MI.getOperand(IsSelectOrCmp + 2).getReg();
+
+  assert(MRI.getType(Dst) == LLT::scalar(16));
+  assert(MRI.getType(LHS) == LLT::scalar(16));
+  assert(MRI.getType(RHS) == LLT::scalar(16));
+
+  assert(MRI.getRegBankOrNull(Dst)->getID() == AMDGPU::SGPRRegBankID);
+  assert(MRI.getRegBankOrNull(LHS)->getID() == AMDGPU::SGPRRegBankID);
+  assert(MRI.getRegBankOrNull(RHS)->getID() == AMDGPU::SGPRRegBankID);
+  const RegisterBank &RB = *MRI.getRegBankOrNull(Dst);
+
+  LLT S32 = LLT::scalar(32);
+
+  B.setInstrAndDebugLoc(MI);
+  const unsigned ExtOpc = getExtOpcodeForPromotedOp(MI);
+  LHS = B.buildInstr(ExtOpc, {S32}, {LHS}).getReg(0);
+  RHS = B.buildInstr(ExtOpc, {S32}, {RHS}).getReg(0);
+
+  MRI.setRegBank(LHS, RB);
+  MRI.setRegBank(RHS, RB);
+
+  MachineInstr *NewInst;
+  if (IsSelectOrCmp)
+    NewInst = B.buildInstr(Opc, {Dst}, {MI.getOperand(1), LHS, RHS});
+  else
+    NewInst = B.buildInstr(Opc, {S32}, {LHS, RHS});
+
+  if (Opc != AMDGPU::G_ICMP) {
+    Register Dst32 = NewInst->getOperand(0).getReg();
+    MRI.setRegBank(Dst32, RB);
+    B.buildTrunc(Dst, Dst32);
+  }
+
+  switch (Opc) {
+  case AMDGPU::G_ADD:
+  case AMDGPU::G_SHL:
+    NewInst->setFlag(MachineInstr::NoUWrap);
+    NewInst->setFlag(MachineInstr::NoSWrap);
+    break;
+  case AMDGPU::G_SUB:
+    if (MI.getFlag(MachineInstr::NoUWrap))
+      NewInst->setFlag(MachineInstr::NoUWrap);
+    NewInst->setFlag(MachineInstr::NoSWrap);
+    break;
+  case AMDGPU::G_MUL:
+    NewInst->setFlag(MachineInstr::NoUWrap);
+    if (MI.getFlag(MachineInstr::NoUWrap))
+      NewInst->setFlag(MachineInstr::NoUWrap);
+    break;
+  }
+
+  MI.eraseFromParent();
+}
+
 void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
                                            Register &Reg) const {
   B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
