
Commit 2c9aa28

[AMDGPU] Add a lit test for hasAndNot.
1 parent 1417d71 commit 2c9aa28

File tree

3 files changed: +122 -5 lines changed
  llvm/lib/Target/AMDGPU/SIISelLowering.cpp
  llvm/lib/Target/AMDGPU/SIISelLowering.h
  llvm/test/CodeGen/AMDGPU/andornot.ll

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 82 additions & 5 deletions
@@ -6822,6 +6822,81 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
   }
 }
 
+SDValue SITargetLowering::combineAnd(SDValue Op,
+                                     DAGCombinerInfo &DCI) const {
+  const unsigned Opc = Op.getOpcode();
+  assert(Opc == ISD::AND);
+
+  auto &DAG = DCI.DAG;
+  SDLoc DL(Op);
+
+  if (hasAndNot(Op)) {
+    SDValue LHS = Op->getOperand(0);
+    SDValue RHS = Op->getOperand(1);
+
+    // (and LHS, (or Y, ~Z))
+    if (RHS.getOpcode() == ISD::OR && RHS.hasOneUse()) {
+      SDValue Y = RHS->getOperand(0);
+      SDValue NotZ = RHS->getOperand(1);
+
+      if (NotZ.getOpcode() == ISD::XOR && isAllOnesConstant(NotZ->getOperand(1))) {
+        SDValue Z = NotZ->getOperand(0);
+
+        if (!isa<ConstantSDNode>(Y)) {
+          SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType());
+          SDValue AndNotYZ = DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
+          SDValue NotAndNotYZ = DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
+          SDValue NewAnd = DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
+          return NewAnd;
+        }
+      }
+    }
+  }
+
+  EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
+                                 : Op->getOperand(0).getValueType();
+  auto ExtTy = OpTy.changeElementType(MVT::i32);
+
+  if (DCI.isBeforeLegalizeOps() ||
+      isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
+    return SDValue();
+
+  SDValue LHS;
+  SDValue RHS;
+  if (Opc == ISD::SELECT) {
+    LHS = Op->getOperand(1);
+    RHS = Op->getOperand(2);
+  } else {
+    LHS = Op->getOperand(0);
+    RHS = Op->getOperand(1);
+  }
+
+  const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
+  LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
+
+  // Special case: for shifts, the RHS always needs a zext.
+  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
+    RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
+  else
+    RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
+
+  // setcc always returns i1/i1 vec so no need to truncate after.
+  if (Opc == ISD::SETCC) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
+  }
+
+  // For other ops, we extend the operation's return type as well so we need to
+  // truncate back to the original type.
+  SDValue NewVal;
+  if (Opc == ISD::SELECT)
+    NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
+  else
+    NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
+
+  return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
+}
+
 SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
                                                 DAGCombinerInfo &DCI) const {
   const unsigned Opc = Op.getOpcode();
@@ -14797,16 +14872,19 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
   return SDValue(CSrc, 0);
 }
 
-
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
   switch (N->getOpcode()) {
+  case ISD::AND:
+    if (auto Res = combineAnd(SDValue(N, 0), DCI))
+      return Res;
+    break;
   case ISD::ADD:
   case ISD::SUB:
   case ISD::SHL:
   case ISD::SRL:
   case ISD::SRA:
-  case ISD::AND:
   case ISD::OR:
   case ISD::XOR:
   case ISD::MUL:
@@ -14910,7 +14988,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   case AMDGPUISD::CLAMP:
     return performClampCombine(N, DCI);
   case ISD::SCALAR_TO_VECTOR: {
-    SelectionDAG &DAG = DCI.DAG;
     EVT VT = N->getValueType(0);
 
     // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
@@ -16892,8 +16969,8 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
 }
 
 bool SITargetLowering::hasAndNot(SDValue Op) const {
-  // Return false if the operation is divergent, as AND-NOT optimization
-  // requires uniform behavior across threads.
+  // Return false if the operation is divergent, as AND-NOT is a scalar-only
+  // instruction.
   if (Op->isDivergent())
     return false;
 
llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -147,6 +147,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue combineAnd(SDValue Op, DAGCombinerInfo &DCI) const;
   SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
   SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;

llvm/test/CodeGen/AMDGPU/andornot.ll

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i16
+; GCN: s_not_b32
+; GCN-NEXT: s_lshr_b32
+; GCN-NEXT: s_and_b32
+; GCN-NEXT: s_andn2_b32
+define amdgpu_kernel void @scalar_and_or_not_i16(ptr addrspace(1) %out, i16 %x, i16 %y, i16 %z) {
+entry:
+  %not_z = xor i16 %z, -1
+  %or_y_not_z = or i16 %y, %not_z
+  %and_result = and i16 %x, %or_y_not_z
+  store i16 %and_result, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i32
+; GCN: s_andn2_b32
+; GCN-NEXT: s_andn2_b32
+define amdgpu_kernel void @scalar_and_or_not_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
+entry:
+  %not_z = xor i32 %z, -1
+  %or_y_not_z = or i32 %y, %not_z
+  %and_result = and i32 %x, %or_y_not_z
+  store i32 %and_result, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i64
+; GCN: s_andn2_b64
+; GCN-NEXT: s_andn2_b64
+define amdgpu_kernel void @scalar_and_or_not_i64(ptr addrspace(1) %out, i64 %x, i64 %y, i64 %z) {
+entry:
+  %not_z = xor i64 %z, -1
+  %or_y_not_z = or i64 %y, %not_z
+  %and_result = and i64 %x, %or_y_not_z
+  store i64 %and_result, ptr addrspace(1) %out, align 4
+  ret void
+}
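The new tests are deliberately scalar: hasAndNot returns false for divergent operations, so the same pattern computed per-lane is not expected to take the new fold. A hypothetical divergent counterpart, shown only for contrast and not part of this commit (no CHECK lines are claimed for it):

; Divergent variant (illustrative): the AND is divergent, so hasAndNot
; returns false and the new fold above does not apply.
define amdgpu_kernel void @divergent_and_or_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %z) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
  %x = load i32, ptr addrspace(1) %in.gep, align 4
  %not_z = xor i32 %z, -1
  %or_y_not_z = or i32 %x, %not_z
  %and_result = and i32 %tid, %or_y_not_z
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
  store i32 %and_result, ptr addrspace(1) %out.gep, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()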
