Skip to content

Commit 7a8c04e

Browse files
committed
[DAG] Attempt shl narrowing in SimplifyDemandedBits
If a shl node leaves the upper half of the bits zero or undemanded, then see if we can profitably perform this with a half-width shl and a free trunc/zext. Follow-up to D146121. Differential Revision: https://reviews.llvm.org/D155472
1 parent b30765c commit 7a8c04e

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+3468
-3895
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1784,6 +1784,38 @@ bool TargetLowering::SimplifyDemandedBits(
17841784
}
17851785
}
17861786

1787+
// Narrow shift to lower half - similar to ShrinkDemandedOp.
1788+
// (shl i64:x, K) -> (i64 zero_extend (shl (i32 (trunc i64:x)), K))
1789+
unsigned HalfWidth = BitWidth / 2;
1790+
if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth) {
1791+
EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth);
1792+
if (isNarrowingProfitable(VT, HalfVT) &&
1793+
isTypeDesirableForOp(ISD::SHL, HalfVT) &&
1794+
isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
1795+
(!TLO.LegalOperations() || isOperationLegal(ISD::SHL, VT))) {
1796+
// Unless we aren't demanding the upper bits at all, we must ensure
1797+
// that the upper bits of the shift result are known to be zero,
1798+
// which is equivalent to the narrow shift being NUW.
1799+
KnownBits Known0 = TLO.DAG.computeKnownBits(Op0, Depth + 1);
1800+
bool IsNUW = Known0.countMinLeadingZeros() >= (ShAmt + HalfWidth);
1801+
if (IsNUW || DemandedBits.countLeadingZeros() >= HalfWidth) {
1802+
unsigned NumSignBits = TLO.DAG.ComputeNumSignBits(Op0, Depth + 1);
1803+
bool IsNSW = NumSignBits > (ShAmt + HalfWidth);
1804+
SDNodeFlags Flags;
1805+
Flags.setNoSignedWrap(IsNSW);
1806+
Flags.setNoUnsignedWrap(IsNUW);
1807+
SDValue NewOp = TLO.DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Op0);
1808+
SDValue NewShiftAmt = TLO.DAG.getShiftAmountConstant(
1809+
ShAmt, HalfVT, dl, TLO.LegalTypes());
1810+
SDValue NewShift = TLO.DAG.getNode(ISD::SHL, dl, HalfVT, NewOp,
1811+
NewShiftAmt, Flags);
1812+
SDValue NewExt =
1813+
TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, NewShift);
1814+
return TLO.CombineTo(Op, NewExt);
1815+
}
1816+
}
1817+
}
1818+
17871819
APInt InDemandedMask = DemandedBits.lshr(ShAmt);
17881820
if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
17891821
Depth + 1))

llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %res
3232
; the base may be the RHS operand of the load in SDAG.
3333
; GCN-LABEL: name: test_complex_reg_offset
3434
; GCN-DAG: %[[BASE:.*]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @0 + 4,
35-
; GCN-DAG: %[[OFFSET:.*]]:sreg_32 = S_LSHL_B32
35+
; SDAG-DAG: %[[OFFSET:.*]]:sreg_32 = nuw nsw S_LSHL_B32
36+
; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = S_LSHL_B32
3637
; SDAG: S_LOAD_DWORD_SGPR_IMM killed %[[BASE]], killed %[[OFFSET]], 0, 0
3738
; GISEL: S_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[OFFSET]], 0, 0
3839
define amdgpu_ps void @test_complex_reg_offset(ptr addrspace(1) %out) {

0 commit comments

Comments (0)