[DAGCombiner][ARM] Teach reduceLoadWidth to handle (and (srl (load), C, ShiftedMask)) #80342

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 4 commits, Feb 5, 2024
21 changes: 17 additions & 4 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14198,7 +14198,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
// away, but using an AND rather than a right shift. HasShiftedOffset is used
// to indicate that the narrowed load should be left-shifted ShAmt bits to get
// the result.
-bool HasShiftedOffset = false;
+unsigned ShiftedOffset = 0;
// Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
// extended to VT.
if (Opc == ISD::SIGN_EXTEND_INREG) {
@@ -14243,7 +14243,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
if (Mask.isMask()) {
ActiveBits = Mask.countr_one();
} else if (Mask.isShiftedMask(ShAmt, ActiveBits)) {
-HasShiftedOffset = true;
+ShiftedOffset = ShAmt;
} else {
return SDValue();
}
@@ -14307,6 +14307,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
SDNode *Mask = *(SRL->use_begin());
if (SRL.hasOneUse() && Mask->getOpcode() == ISD::AND &&
isa<ConstantSDNode>(Mask->getOperand(1))) {
+unsigned Offset, ActiveBits;
const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);
if (ShiftMask.isMask()) {
EVT MaskedVT =
@@ -14315,6 +14316,18 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT))
ExtVT = MaskedVT;
+} else if (ExtType == ISD::ZEXTLOAD &&
+           ShiftMask.isShiftedMask(Offset, ActiveBits) &&
+           (Offset + ShAmt) < VT.getScalarSizeInBits()) {
+  EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+  // If the mask is shifted we can use a narrower load and a shl to insert
+  // the trailing zeros.
+  if (((Offset + ActiveBits) <= ExtVT.getScalarSizeInBits()) &&
+      TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT)) {
+    ExtVT = MaskedVT;
+    ShAmt = Offset + ShAmt;
+    ShiftedOffset = Offset;
+  }
}
}

@@ -14400,12 +14413,12 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
}

-if (HasShiftedOffset) {
+if (ShiftedOffset != 0) {
// We're using a shifted mask, so the load now has an offset. This means
// that data has been loaded into the lower bytes than it would have been
// before, so we need to shl the loaded data into the correct position in the
// register.
-SDValue ShiftC = DAG.getConstant(ShAmt, DL, VT);
+SDValue ShiftC = DAG.getConstant(ShiftedOffset, DL, VT);
Result = DAG.getNode(ISD::SHL, DL, VT, Result, ShiftC);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
}
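The new path rests on a simple bit identity: if the mask is a shifted mask with Offset trailing zeros and ActiveBits contiguous ones, then (x >> ShAmt) & Mask is the ActiveBits-wide field of x starting at bit ShAmt + Offset, shifted left by Offset, i.e. a narrower zero-extending load plus a shl. A standalone C++ sketch of that identity (hypothetical helper names, not part of the patch; the real code additionally requires that Offset + ShAmt stays inside VT and that the narrow extload is legal, per the guards above):

#include <cassert>
#include <cstdint>

// Original form: (x >> ShAmt) & Mask, with Mask a shifted mask.
static uint32_t maskedShift(uint32_t x, unsigned ShAmt, uint32_t Mask) {
  return (x >> ShAmt) & Mask;
}

// Combined form: read only the ActiveBits-wide field starting at bit
// ShAmt + Offset (the narrowed zextload), then shl by Offset
// (the ShiftedOffset) to re-insert the trailing zeros.
static uint32_t narrowedForm(uint32_t x, unsigned ShAmt, unsigned Offset,
                             unsigned ActiveBits) {
  uint32_t Field = (x >> (ShAmt + Offset)) & ((1u << ActiveBits) - 1u);
  return Field << Offset;
}

int main() {
  for (uint64_t v = 0; v <= 0xFFFFFFFFull; v += 0xF000Du) {
    uint32_t x = (uint32_t)v;
    // Mirrors the ARM tests below: masks 510/1020/2040 all have 8 active
    // bits, with offsets 1/2/3.
    assert(maskedShift(x, 15, 510) == narrowedForm(x, 15, 1, 8));
    assert(maskedShift(x, 22, 1020) == narrowedForm(x, 22, 2, 8));
    assert(maskedShift(x & 0xFFFF, 5, 2040) ==
           narrowedForm(x & 0xFFFF, 5, 3, 8));
  }
  return 0;
}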
112 changes: 112 additions & 0 deletions llvm/test/CodeGen/ARM/shift-combine.ll
@@ -1278,3 +1278,115 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32
%r = or <4 x i32> %or.ab, %or.cd
ret <4 x i32> %r
}

define arm_aapcscc i32 @test_shift15_and510(ptr nocapture %p) {
; CHECK-ARM-LABEL: test_shift15_and510:
; CHECK-ARM: @ %bb.0: @ %entry
; CHECK-ARM-NEXT: ldrb r0, [r0, #2]
; CHECK-ARM-NEXT: lsl r0, r0, #1
; CHECK-ARM-NEXT: bx lr
;
; CHECK-BE-LABEL: test_shift15_and510:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: ldrb r0, [r0, #1]
; CHECK-BE-NEXT: lsl r0, r0, #1
; CHECK-BE-NEXT: bx lr
;
; CHECK-THUMB-LABEL: test_shift15_and510:
; CHECK-THUMB: @ %bb.0: @ %entry
; CHECK-THUMB-NEXT: ldrb r0, [r0, #2]
; CHECK-THUMB-NEXT: lsls r0, r0, #1
; CHECK-THUMB-NEXT: bx lr
;
; CHECK-ALIGN-LABEL: test_shift15_and510:
; CHECK-ALIGN: @ %bb.0: @ %entry
; CHECK-ALIGN-NEXT: ldrb r0, [r0, #2]
; CHECK-ALIGN-NEXT: lsls r0, r0, #1
; CHECK-ALIGN-NEXT: bx lr
;
; CHECK-V6M-LABEL: test_shift15_and510:
; CHECK-V6M: @ %bb.0: @ %entry
; CHECK-V6M-NEXT: ldrb r0, [r0, #2]
; CHECK-V6M-NEXT: lsls r0, r0, #1
; CHECK-V6M-NEXT: bx lr
entry:
%load = load i32, ptr %p, align 4
%lshr = lshr i32 %load, 15
%and = and i32 %lshr, 510
ret i32 %and
}

define arm_aapcscc i32 @test_shift22_and1020(ptr nocapture %p) {
; CHECK-ARM-LABEL: test_shift22_and1020:
; CHECK-ARM: @ %bb.0: @ %entry
; CHECK-ARM-NEXT: ldrb r0, [r0, #3]
; CHECK-ARM-NEXT: lsl r0, r0, #2
; CHECK-ARM-NEXT: bx lr
;
; CHECK-BE-LABEL: test_shift22_and1020:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: ldrb r0, [r0]
; CHECK-BE-NEXT: lsl r0, r0, #2
; CHECK-BE-NEXT: bx lr
;
; CHECK-THUMB-LABEL: test_shift22_and1020:
; CHECK-THUMB: @ %bb.0: @ %entry
; CHECK-THUMB-NEXT: ldrb r0, [r0, #3]
; CHECK-THUMB-NEXT: lsls r0, r0, #2
; CHECK-THUMB-NEXT: bx lr
;
; CHECK-ALIGN-LABEL: test_shift22_and1020:
; CHECK-ALIGN: @ %bb.0: @ %entry
; CHECK-ALIGN-NEXT: ldrb r0, [r0, #3]
; CHECK-ALIGN-NEXT: lsls r0, r0, #2
; CHECK-ALIGN-NEXT: bx lr
;
; CHECK-V6M-LABEL: test_shift22_and1020:
; CHECK-V6M: @ %bb.0: @ %entry
; CHECK-V6M-NEXT: ldrb r0, [r0, #3]
; CHECK-V6M-NEXT: lsls r0, r0, #2
; CHECK-V6M-NEXT: bx lr
entry:
%load = load i32, ptr %p, align 4
%lshr = lshr i32 %load, 22
%and = and i32 %lshr, 1020
ret i32 %and
}

define arm_aapcscc i32 @test_zext_shift5_and2040(ptr nocapture %p) {
; CHECK-ARM-LABEL: test_zext_shift5_and2040:
; CHECK-ARM: @ %bb.0: @ %entry
; CHECK-ARM-NEXT: ldrb r0, [r0, #1]
; CHECK-ARM-NEXT: lsl r0, r0, #3
; CHECK-ARM-NEXT: bx lr
;
; CHECK-BE-LABEL: test_zext_shift5_and2040:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: ldrb r0, [r0]
; CHECK-BE-NEXT: lsl r0, r0, #3
; CHECK-BE-NEXT: bx lr
;
; CHECK-THUMB-LABEL: test_zext_shift5_and2040:
; CHECK-THUMB: @ %bb.0: @ %entry
; CHECK-THUMB-NEXT: ldrb r0, [r0, #1]
; CHECK-THUMB-NEXT: lsls r0, r0, #3
; CHECK-THUMB-NEXT: bx lr
;
; CHECK-ALIGN-LABEL: test_zext_shift5_and2040:
; CHECK-ALIGN: @ %bb.0: @ %entry
; CHECK-ALIGN-NEXT: ldrb r0, [r0, #1]
; CHECK-ALIGN-NEXT: lsls r0, r0, #3
; CHECK-ALIGN-NEXT: bx lr
;
; CHECK-V6M-LABEL: test_zext_shift5_and2040:
; CHECK-V6M: @ %bb.0: @ %entry
; CHECK-V6M-NEXT: ldrb r0, [r0, #1]
; CHECK-V6M-NEXT: lsls r0, r0, #3
; CHECK-V6M-NEXT: bx lr
entry:
%load = load i16, ptr %p, align 2
%zext = zext i16 %load to i32
%lshr = lshr i32 %zext, 5
%and = and i32 %lshr, 2040
ret i32 %and
}
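Reading the CHECK lines against the combine: 510 = 0x1fe decomposes as Offset = 1 and ActiveBits = 8, so ShAmt becomes 15 + 1 = 16 and the i32 load narrows to the single byte holding bits [16,24) (offset #2 little-endian, #1 big-endian), with lsl #1 restoring the trailing zero. Likewise 1020 = 0x3fc selects bits [24,32) followed by lsl #2, and 2040 = 0x7f8 applied to the zero-extended i16 selects bits [8,16) followed by lsl #3.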
4 changes: 3 additions & 1 deletion llvm/test/CodeGen/X86/h-registers-2.ll
@@ -9,12 +9,14 @@ define i32 @foo(ptr %x, i32 %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: imull %eax, %eax
; CHECK-NEXT: movzbl %ah, %eax
; CHECK-NEXT: movb $77, (%ecx,%eax,8)
; CHECK-NEXT: shll $3, %eax
; CHECK-NEXT: retl

-%t0 = lshr i32 %y, 8 ; <i32> [#uses=1]
+%t4 = mul i32 %y, %y
+%t0 = lshr i32 %t4, 8 ; <i32> [#uses=1]
%t1 = and i32 %t0, 255 ; <i32> [#uses=2]
%t2 = shl i32 %t1, 3
%t3 = getelementptr i8, ptr %x, i32 %t2 ; <ptr> [#uses=1]
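The h-registers-2.ll change reads as compensation for the new combine: the shifted-and-masked value previously came straight from a load of the argument, which this patch would presumably now narrow to a byte load, removing the movzbl %ah access the test exists to check. Squaring %y first keeps the shift's operand from being a load while preserving the rest of the pattern.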