merge main into amd-staging

Jenkins · Jenkins · commit 10eefc43e40c · 2024-03-24T20:34:58.000Z
Change-Id: Ie5a156bd75f226b592ad6bb10ff9b853b55c4c34
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -3883,7 +3883,7 @@ mlir::Value IntrinsicLibrary::genIeeeClass(mlir::Type resultType,
   int pos = 3 + highSignificandSize;
   mlir::Value index = builder.create<mlir::arith::AndIOp>(
       loc, builder.create<mlir::arith::ShRUIOp>(loc, intVal, signShift),
-      createIntegerConstant(1 << pos));
+      createIntegerConstant(1ULL << pos));
 
   // [e] exponent != 0
   mlir::Value exponent =
@@ -3895,7 +3895,7 @@ mlir::Value IntrinsicLibrary::genIeeeClass(mlir::Type resultType,
           loc,
           builder.create<mlir::arith::CmpIOp>(
               loc, mlir::arith::CmpIPredicate::ne, exponent, zero),
-          createIntegerConstant(1 << --pos), zero));
+          createIntegerConstant(1ULL << --pos), zero));
 
   // [m] exponent == 1..1 (max exponent)
   index = builder.create<mlir::arith::OrIOp>(
@@ -3904,7 +3904,7 @@ mlir::Value IntrinsicLibrary::genIeeeClass(mlir::Type resultType,
           loc,
           builder.create<mlir::arith::CmpIOp>(
               loc, mlir::arith::CmpIPredicate::eq, exponent, exponentMask),
-          createIntegerConstant(1 << --pos), zero));
+          createIntegerConstant(1ULL << --pos), zero));
 
   // [l] low-order significand != 0
   index = builder.create<mlir::arith::OrIOp>(
@@ -3916,7 +3916,7 @@ mlir::Value IntrinsicLibrary::genIeeeClass(mlir::Type resultType,
               builder.create<mlir::arith::AndIOp>(loc, intVal,
                                                   lowSignificandMask),
               zero),
-          createIntegerConstant(1 << --pos), zero));
+          createIntegerConstant(1ULL << --pos), zero));
 
   // [h] high-order significand (1 or 2 bits)
   index = builder.create<mlir::arith::OrIOp>(
diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
@@ -328,9 +328,10 @@ bool X86_64::relaxOnce(int pass) const {
         if (rel.expr != R_RELAX_GOT_PC)
           continue;
 
-        uint64_t v = sec->getRelocTargetVA(
-            sec->file, rel.type, rel.addend,
-            sec->getOutputSection()->addr + rel.offset, *rel.sym, rel.expr);
+        uint64_t v = sec->getRelocTargetVA(sec->file, rel.type, rel.addend,
+                                           sec->getOutputSection()->addr +
+                                               sec->outSecOff + rel.offset,
+                                           *rel.sym, rel.expr);
         if (isInt<32>(v))
           continue;
         if (rel.sym->auxIdx == 0) {
diff --git a/lld/test/ELF/x86-64-gotpc-relax-too-far.s b/lld/test/ELF/x86-64-gotpc-relax-too-far.s
@@ -5,7 +5,10 @@
 # RUN: llvm-objdump --no-print-imm-hex -d %t/bin | FileCheck --check-prefix=DISASM %s
 # RUN: llvm-readelf -S %t/bin | FileCheck --check-prefixes=GOT %s
 # RUN: ld.lld -T %t/lds2 %t/a.o -o %t/bin2
-# RUN: llvm-readelf -S %t/bin2 | FileCheck --check-prefixes=UNNECESSARY-GOT %s
+# RUN: llvm-objdump --no-print-imm-hex -d %t/bin2 | FileCheck --check-prefix=DISASM %s
+# RUN: llvm-readelf -S %t/bin2 | FileCheck --check-prefixes=GOT %s
+# RUN: ld.lld -T %t/lds3 %t/a.o -o %t/bin3
+# RUN: llvm-readelf -S %t/bin3 | FileCheck --check-prefixes=UNNECESSARY-GOT %s
 
 # DISASM:      <_foo>:
 # DISASM-NEXT: movl    2097146(%rip), %eax
@@ -47,6 +50,13 @@ SECTIONS {
   data 0x80200000 : { *(data) }
 }
 #--- lds2
+SECTIONS {
+  .text.foo 0x100000 : { *(.text.foo) }
+  .text 0x1ff000 : { . = . + 0x1000 ; *(.text) }
+  .got 0x300000 : { *(.got) }
+  data 0x80200000 : { *(data) }
+}
+#--- lds3
 SECTIONS {
   .text.foo 0x100000 : { *(.text.foo) }
   .text 0x200000 : { *(.text) }
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
@@ -1058,6 +1058,17 @@ class CmpInst : public Instruction {
   static CmpInst *Create(OtherOps Op, Predicate predicate, Value *S1,
                          Value *S2, const Twine &Name, BasicBlock *InsertAtEnd);
 
+  /// Construct a compare instruction, given the opcode, the predicate,
+  /// the two operands and the instruction to copy the flags from. Optionally
+  /// (if InstBefore is specified) insert the instruction into a BasicBlock
+  /// right before the specified instruction. The specified Instruction is
+  /// allowed to be a dereferenced end iterator. Create a CmpInst
+  static CmpInst *CreateWithCopiedFlags(OtherOps Op, Predicate Pred, Value *S1,
+                                        Value *S2,
+                                        const Instruction *FlagsSource,
+                                        const Twine &Name = "",
+                                        Instruction *InsertBefore = nullptr);
+
   /// Get the opcode casted to the right type
   OtherOps getOpcode() const {
     return static_cast<OtherOps>(Instruction::getOpcode());
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2529,20 +2529,28 @@ static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
 }
 
-// Attempt to form avgceilu(A, B) from (A | B) - ((A ^ B) >> 1)
-static SDValue combineFixedwidthToAVGCEILU(SDNode *N, SelectionDAG &DAG) {
+// Attempt to form avgceil(A, B) from (A | B) - ((A ^ B) >> 1)
+static SDValue combineFixedwidthToAVGCEIL(SDNode *N, SelectionDAG &DAG) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue N0 = N->getOperand(0);
   EVT VT = N0.getValueType();
   SDLoc DL(N);
+  SDValue A, B;
+
   if (TLI.isOperationLegal(ISD::AVGCEILU, VT)) {
-    SDValue A, B;
     if (sd_match(N, m_Sub(m_Or(m_Value(A), m_Value(B)),
                           m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)),
                                 m_SpecificInt(1))))) {
       return DAG.getNode(ISD::AVGCEILU, DL, VT, A, B);
     }
   }
+  if (TLI.isOperationLegal(ISD::AVGCEILS, VT)) {
+    if (sd_match(N, m_Sub(m_Or(m_Value(A), m_Value(B)),
+                          m_Sra(m_Xor(m_Deferred(A), m_Deferred(B)),
+                                m_SpecificInt(1))))) {
+      return DAG.getNode(ISD::AVGCEILS, DL, VT, A, B);
+    }
+  }
   return SDValue();
 }
 
@@ -2837,20 +2845,29 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
   return SDValue();
 }
 
-// Attempt to form avgflooru(A, B) from (A & B) + ((A ^ B) >> 1)
-static SDValue combineFixedwidthToAVGFLOORU(SDNode *N, SelectionDAG &DAG) {
+// Attempt to form avgfloor(A, B) from (A & B) + ((A ^ B) >> 1)
+static SDValue combineFixedwidthToAVGFLOOR(SDNode *N, SelectionDAG &DAG) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue N0 = N->getOperand(0);
   EVT VT = N0.getValueType();
   SDLoc DL(N);
+  SDValue A, B;
+
   if (TLI.isOperationLegal(ISD::AVGFLOORU, VT)) {
-    SDValue A, B;
     if (sd_match(N, m_Add(m_And(m_Value(A), m_Value(B)),
                           m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)),
                                 m_SpecificInt(1))))) {
       return DAG.getNode(ISD::AVGFLOORU, DL, VT, A, B);
     }
   }
+  if (TLI.isOperationLegal(ISD::AVGFLOORS, VT)) {
+    if (sd_match(N, m_Add(m_And(m_Value(A), m_Value(B)),
+                          m_Sra(m_Xor(m_Deferred(A), m_Deferred(B)),
+                                m_SpecificInt(1))))) {
+      return DAG.getNode(ISD::AVGFLOORS, DL, VT, A, B);
+    }
+  }
+
   return SDValue();
 }
 
@@ -2869,8 +2886,8 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
   if (SDValue V = foldAddSubOfSignBit(N, DAG))
     return V;
 
-  // Try to match AVGFLOORU fixedwidth pattern
-  if (SDValue V = combineFixedwidthToAVGFLOORU(N, DAG))
+  // Try to match AVGFLOOR fixedwidth pattern
+  if (SDValue V = combineFixedwidthToAVGFLOOR(N, DAG))
     return V;
 
   // fold (a+b) -> (a|b) iff a and b share no bits.
@@ -3868,8 +3885,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   if (SDValue V = foldAddSubOfSignBit(N, DAG))
     return V;
 
-  // Try to match AVGCEILU fixedwidth pattern
-  if (SDValue V = combineFixedwidthToAVGCEILU(N, DAG))
+  // Try to match AVGCEIL fixedwidth pattern
+  if (SDValue V = combineFixedwidthToAVGCEIL(N, DAG))
     return V;
 
   if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
@@ -4623,6 +4623,16 @@ CmpInst::Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2,
                       S1, S2, Name);
 }
 
+CmpInst *CmpInst::CreateWithCopiedFlags(OtherOps Op, Predicate Pred, Value *S1,
+                                        Value *S2,
+                                        const Instruction *FlagsSource,
+                                        const Twine &Name,
+                                        Instruction *InsertBefore) {
+  CmpInst *Inst = Create(Op, Pred, S1, S2, Name, InsertBefore);
+  Inst->copyIRFlags(FlagsSource);
+  return Inst;
+}
+
 void CmpInst::swapOperands() {
   if (ICmpInst *IC = dyn_cast<ICmpInst>(this))
     IC->swapOperands();
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -487,7 +487,9 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
     // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index)
     Value *E0 = Builder.CreateExtractElement(X, Index);
     Value *E1 = Builder.CreateExtractElement(Y, Index);
-    return CmpInst::Create(cast<CmpInst>(SrcVec)->getOpcode(), Pred, E0, E1);
+    CmpInst *SrcCmpInst = cast<CmpInst>(SrcVec);
+    return CmpInst::CreateWithCopiedFlags(SrcCmpInst->getOpcode(), Pred, E0, E1,
+                                          SrcCmpInst);
   }
 
   if (auto *I = dyn_cast<Instruction>(SrcVec)) {
diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -341,6 +341,18 @@ define <8 x i16> @sub_fixedwidth_v4i32(<8 x i16> %a0, <8 x i16> %a1)  {
   ret <8 x i16> %res
 }
 
+define <8 x i16> @srhadd_fixedwidth_v8i16(<8 x i16> %a0, <8 x i16> %a1)  {
+; CHECK-LABEL: srhadd_fixedwidth_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %or = or <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a0, %a1
+  %srl = ashr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <8 x i16> %or, %srl
+  ret <8 x i16> %res
+}
+
 define <8 x i16> @rhaddu_base(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: rhaddu_base:
 ; CHECK:       // %bb.0:
@@ -879,6 +891,18 @@ define <8 x i16> @uhadd_fixedwidth_v4i32(<8 x i16> %a0, <8 x i16> %a1)  {
   ret <8 x i16> %res
 }
 
+define <8 x i16> @shadd_fixedwidth_v8i16(<8 x i16> %a0, <8 x i16> %a1)  {
+; CHECK-LABEL: shadd_fixedwidth_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %and = and <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a0, %a1
+  %srl = ashr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <8 x i16> %and, %srl
+  ret <8 x i16> %res
+}
+
 declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
 declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>)
 declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/setcc_knownbits.ll b/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=aarch64 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define i1 @load_bv_v4i8(i1 zeroext %a) {
 ; CHECK-LABEL: load_bv_v4i8:
@@ -11,18 +12,31 @@ define i1 @load_bv_v4i8(i1 zeroext %a) {
 }
 
 define noundef i1 @logger(i32 noundef %logLevel, ptr %ea, ptr %pll) {
-; CHECK-LABEL: logger:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr w8, [x2]
-; CHECK-NEXT:    cmp w8, w0
-; CHECK-NEXT:    b.ls .LBB1_2
-; CHECK-NEXT:  // %bb.1:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB1_2: // %land.rhs
-; CHECK-NEXT:    ldr x8, [x1]
-; CHECK-NEXT:    ldrb w0, [x8]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: logger:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ldr w8, [x2]
+; CHECK-SD-NEXT:    cmp w8, w0
+; CHECK-SD-NEXT:    b.ls .LBB1_2
+; CHECK-SD-NEXT:  // %bb.1:
+; CHECK-SD-NEXT:    mov w0, wzr
+; CHECK-SD-NEXT:    ret
+; CHECK-SD-NEXT:  .LBB1_2: // %land.rhs
+; CHECK-SD-NEXT:    ldr x8, [x1]
+; CHECK-SD-NEXT:    ldrb w0, [x8]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: logger:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr w8, [x2]
+; CHECK-GI-NEXT:    cmp w8, w0
+; CHECK-GI-NEXT:    mov w0, wzr
+; CHECK-GI-NEXT:    b.hi .LBB1_2
+; CHECK-GI-NEXT:  // %bb.1: // %land.rhs
+; CHECK-GI-NEXT:    ldr x8, [x1]
+; CHECK-GI-NEXT:    ldrb w8, [x8]
+; CHECK-GI-NEXT:    and w0, w8, #0x1
+; CHECK-GI-NEXT:  .LBB1_2: // %land.end
+; CHECK-GI-NEXT:    ret
 entry:
   %0 = load i32, ptr %pll, align 4
   %cmp.not = icmp ugt i32 %0, %logLevel
@@ -44,30 +58,51 @@ land.end:                                         ; preds = %land.rhs, %entry
 
 declare i64 @llvm.ctlz.i64(i64 %in, i1)
 define i1 @lshr_ctlz_undef_cmpeq_one_i64(i64 %in) {
-; CHECK-LABEL: lshr_ctlz_undef_cmpeq_one_i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    clz x8, x0
-; CHECK-NEXT:    lsr x0, x8, #6
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: lshr_ctlz_undef_cmpeq_one_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    clz x8, x0
+; CHECK-SD-NEXT:    lsr x0, x8, #6
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: lshr_ctlz_undef_cmpeq_one_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    clz x8, x0
+; CHECK-GI-NEXT:    lsr x8, x8, #6
+; CHECK-GI-NEXT:    cmp x8, #1
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 -1)
   %lshr = lshr i64 %ctlz, 6
   %icmp = icmp eq i64 %lshr, 1
   ret i1 %icmp
 }
 
 define i32 @PR17487(i1 %tobool) {
-; CHECK-LABEL: PR17487:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v0.2s, w0
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    dup v1.2d, x8
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    cmp x8, #1
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: PR17487:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v0.2s, w0
+; CHECK-SD-NEXT:    mov w8, #1 // =0x1
+; CHECK-SD-NEXT:    dup v1.2d, x8
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    cmp x8, #1
+; CHECK-SD-NEXT:    cset w0, ne
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: PR17487:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT:    mov v0.d[1], x0
+; CHECK-GI-NEXT:    adrp x8, .LCPI3_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-GI-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    cmp x8, #1
+; CHECK-GI-NEXT:    cset w0, ne
+; CHECK-GI-NEXT:    ret
   %tmp = insertelement <2 x i1> undef, i1 %tobool, i32 1
   %tmp1 = zext <2 x i1> %tmp to <2 x i64>
   %tmp2 = xor <2 x i64> %tmp1, <i64 1, i64 1>
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
diff --git a/llvm/test/Transforms/InstCombine/scalarization.ll b/llvm/test/Transforms/InstCombine/scalarization.ll