GPUOpen-Drivers
diff --git a/‎clang/lib/CodeGen/CGBuiltin.cpp
Lines changed: 10 additions & 2 deletions b/‎clang/lib/CodeGen/CGBuiltin.cpp
Lines changed: 10 additions & 2 deletions
diff --git a/‎clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
Lines changed: 2 additions & 1 deletion b/‎clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
Lines changed: 2 additions & 1 deletion
diff --git a/‎flang/lib/Lower/ConvertVariable.cpp
Lines changed: 5 additions & 0 deletions b/‎flang/lib/Lower/ConvertVariable.cpp
Lines changed: 5 additions & 0 deletions
diff --git a/‎flang/lib/Lower/IO.cpp
Lines changed: 4 additions & 4 deletions b/‎flang/lib/Lower/IO.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/‎flang/test/Lower/derived-type-finalization.f90
Lines changed: 1 addition & 14 deletions b/‎flang/test/Lower/derived-type-finalization.f90
Lines changed: 1 addition & 14 deletions
diff --git a/‎flang/test/Lower/io-derived-type-2.f90
Lines changed: 70 additions & 0 deletions b/‎flang/test/Lower/io-derived-type-2.f90
Lines changed: 70 additions & 0 deletions
diff --git a/‎flang/test/Lower/polymorphic.f90
Lines changed: 1 addition & 1 deletion b/‎flang/test/Lower/polymorphic.f90
Lines changed: 1 addition & 1 deletion
diff --git a/‎llvm/CODE_OWNERS.TXT
Lines changed: 0 additions & 4 deletions b/‎llvm/CODE_OWNERS.TXT
Lines changed: 0 additions & 4 deletions
diff --git a/‎llvm/docs/LangRef.rst
Lines changed: 10 additions & 10 deletions b/‎llvm/docs/LangRef.rst
Lines changed: 10 additions & 10 deletions
diff --git a/‎llvm/include/llvm/Config/llvm-config.h.cmake
Lines changed: 1 addition & 1 deletion b/‎llvm/include/llvm/Config/llvm-config.h.cmake
Lines changed: 1 addition & 1 deletion
diff --git a/‎llvm/include/llvm/IR/Type.h
Lines changed: 1 addition & 2 deletions b/‎llvm/include/llvm/IR/Type.h
Lines changed: 1 addition & 2 deletions
diff --git a/‎llvm/lib/Analysis/InstructionSimplify.cpp
Lines changed: 1 addition & 1 deletion b/‎llvm/lib/Analysis/InstructionSimplify.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎llvm/lib/IR/Operator.cpp
Lines changed: 2 additions & 4 deletions b/‎llvm/lib/IR/Operator.cpp
Lines changed: 2 additions & 4 deletions
diff --git a/‎llvm/lib/IR/Type.cpp
Lines changed: 3 additions & 2 deletions b/‎llvm/lib/IR/Type.cpp
Lines changed: 3 additions & 2 deletions
diff --git a/‎llvm/lib/IR/Verifier.cpp
Lines changed: 3 additions & 11 deletions b/‎llvm/lib/IR/Verifier.cpp
Lines changed: 3 additions & 11 deletions
diff --git a/‎llvm/lib/Target/AArch64/AArch64.td
Lines changed: 18 additions & 2 deletions b/‎llvm/lib/Target/AArch64/AArch64.td
Lines changed: 18 additions & 2 deletions
diff --git a/‎llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
Lines changed: 33 additions & 0 deletions b/‎llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
Lines changed: 33 additions & 0 deletions
@@ -17392,14 +17392,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_log_clampf:
     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
   case AMDGPU::BI__builtin_amdgcn_ldexp:
-  case AMDGPU::BI__builtin_amdgcn_ldexpf:
-  case AMDGPU::BI__builtin_amdgcn_ldexph: {
+  case AMDGPU::BI__builtin_amdgcn_ldexpf: {
     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
     llvm::Function *F =
         CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
     return Builder.CreateCall(F, {Src0, Src1});
   }
+  case AMDGPU::BI__builtin_amdgcn_ldexph: {
+    // The raw instruction has a different behavior for out of bounds exponent
+    // values (implicit truncation instead of saturate to short_min/short_max).
+    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
+    llvm::Function *F =
+        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
+    return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
+  }
   case AMDGPU::BI__builtin_amdgcn_frexp_mant:
   case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
   case AMDGPU::BI__builtin_amdgcn_frexp_manth:
 
@@ -52,7 +52,8 @@ void test_cos_f16(global half* out, half a)
 }
 
 // CHECK-LABEL: @test_ldexp_f16
-// CHECK: call half @llvm.ldexp.f16.i32
+// CHECK: [[TRUNC:%[0-9a-z]+]] = trunc i32
+// CHECK: call half @llvm.ldexp.f16.i16(half %a, i16 [[TRUNC]])
 void test_ldexp_f16(global half* out, half a, int b)
 {
   *out = __builtin_amdgcn_ldexph(a, b);
 
@@ -660,6 +660,11 @@ static bool needEndFinalization(const Fortran::lower::pft::Variable &var) {
   if (!var.hasSymbol())
     return false;
   const Fortran::semantics::Symbol &sym = var.getSymbol();
+  const Fortran::semantics::Scope &owner = sym.owner();
+  if (owner.kind() == Fortran::semantics::Scope::Kind::MainProgram) {
+    // The standard does not require finalizing main program variables.
+    return false;
+  }
   if (!Fortran::semantics::IsPointer(sym) &&
       !Fortran::semantics::IsAllocatable(sym) &&
       !Fortran::semantics::IsDummy(sym) &&
 
@@ -655,7 +655,7 @@ static void genNamelistIO(Fortran::lower::AbstractConverter &converter,
 static mlir::func::FuncOp getOutputFunc(mlir::Location loc,
                                         fir::FirOpBuilder &builder,
                                         mlir::Type type, bool isFormatted) {
-  if (type.isa<fir::RecordType>())
+  if (fir::unwrapPassByRefType(type).isa<fir::RecordType>())
     return getIORuntimeFunc<mkIOKey(OutputDerivedType)>(loc, builder);
   if (!isFormatted)
     return getIORuntimeFunc<mkIOKey(OutputDescriptor)>(loc, builder);
@@ -737,7 +737,7 @@ static void genOutputItemList(
     if (argType.isa<fir::BoxType>()) {
       mlir::Value box = fir::getBase(converter.genExprBox(loc, *expr, stmtCtx));
       outputFuncArgs.push_back(builder.createConvert(loc, argType, box));
-      if (itemTy.isa<fir::RecordType>())
+      if (fir::unwrapPassByRefType(itemTy).isa<fir::RecordType>())
         outputFuncArgs.push_back(getNonTbpDefinedIoTableAddr(converter));
     } else if (helper.isCharacterScalar(itemTy)) {
       fir::ExtendedValue exv = converter.genExprAddr(loc, expr, stmtCtx);
@@ -772,7 +772,7 @@ static void genOutputItemList(
 static mlir::func::FuncOp getInputFunc(mlir::Location loc,
                                        fir::FirOpBuilder &builder,
                                        mlir::Type type, bool isFormatted) {
-  if (type.isa<fir::RecordType>())
+  if (fir::unwrapPassByRefType(type).isa<fir::RecordType>())
     return getIORuntimeFunc<mkIOKey(InputDerivedType)>(loc, builder);
   if (!isFormatted)
     return getIORuntimeFunc<mkIOKey(InputDescriptor)>(loc, builder);
@@ -834,7 +834,7 @@ createIoRuntimeCallForItem(Fortran::lower::AbstractConverter &converter,
     auto boxTy = box.getType().dyn_cast<fir::BaseBoxType>();
     assert(boxTy && "must be previously emboxed");
     inputFuncArgs.push_back(builder.createConvert(loc, argType, box));
-    if (boxTy.getEleTy().isa<fir::RecordType>())
+    if (fir::unwrapPassByRefType(boxTy).isa<fir::RecordType>())
       inputFuncArgs.push_back(getNonTbpDefinedIoTableAddr(converter));
   } else {
     mlir::Value itemAddr = fir::getBase(item);
 
@@ -248,20 +248,7 @@ subroutine local_t4()
 program p
   use derived_type_finalization
   type(t1) :: t
-  if (t%a == 10) return
-  print *, 'end of program'
 end program
 
 ! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} {
-! CHECK: %[[T:.*]] = fir.alloca !fir.type<_QMderived_type_finalizationTt1{a:i32}> {bindc_name = "t", uniq_name = "_QFEt"}
-! CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2
-! CHECK: ^bb1:
-! CHECK:  %[[EMBOX:.*]] = fir.embox %[[T]] : (!fir.ref<!fir.type<_QMderived_type_finalizationTt1{a:i32}>>) -> !fir.box<!fir.type<_QMderived_type_finalizationTt1{a:i32}>>
-! CHECK:  %[[BOX_NONE:.*]] = fir.convert %[[EMBOX]] : (!fir.box<!fir.type<_QMderived_type_finalizationTt1{a:i32}>>) -> !fir.box<none>
-! CHECK:  %{{.*}} = fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box<none>) -> none
-! CHECK:  return
-! CHECK: ^bb2:
-! CHECK:  %[[EMBOX:.*]] = fir.embox %[[T]] : (!fir.ref<!fir.type<_QMderived_type_finalizationTt1{a:i32}>>) -> !fir.box<!fir.type<_QMderived_type_finalizationTt1{a:i32}>>
-! CHECK:  %[[BOX_NONE:.*]] = fir.convert %[[EMBOX]] : (!fir.box<!fir.type<_QMderived_type_finalizationTt1{a:i32}>>) -> !fir.box<none>
-! CHECK:  %{{.*}} = fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box<none>) -> none
-! CHECK:  return
+! CHECK-NOT: fir.call @_FortranADestroy
@@ -0,0 +1,70 @@
+! Check that InputDerivedType/OutputDeriverType APIs are used
+! for io of derived types.
+! RUN: bbc -polymorphic-type -emit-fir -o - %s | FileCheck %s
+
+module p
+  type :: person
+     type(person), pointer :: next => null()
+  end type person
+  type :: club
+     class(person), allocatable :: membership(:)
+  end type club
+contains
+  subroutine pwf (dtv,unit,iotype,vlist,iostat,iomsg)
+    class(person), intent(in) :: dtv
+    integer, intent(in) :: unit
+    character (len=*), intent(in) :: iotype
+    integer, intent(in) :: vlist(:)
+    integer, intent(out) :: iostat
+    character (len=*), intent(inout) :: iomsg
+    print *, 'write'
+  end subroutine pwf
+  subroutine prf (dtv,unit,iotype,vlist,iostat,iomsg)
+    class(person), intent(inout) :: dtv
+    integer, intent(in) :: unit
+    character (len=*), intent(in) :: iotype
+    integer, intent(in) :: vlist(:)
+    integer, intent(out) :: iostat
+    character (len=*), intent(inout) :: iomsg
+  end subroutine prf
+  subroutine test1(dtv)
+    interface read(formatted)
+       module procedure prf
+    end interface read(formatted)
+    class(person), intent(inout) :: dtv
+    read(7, fmt='(DT)') dtv%next
+  end subroutine test1
+! CHECK-LABEL:   func.func @_QMpPtest1(
+! CHECK:           %{{.*}} = fir.call @_FortranAioInputDerivedType(%{{.*}}, %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<i8>, !fir.box<none>, !fir.ref<none>) -> i1
+
+  subroutine test2(social_club)
+    interface read(formatted)
+       module procedure prf
+    end interface read(formatted)
+    class(club) :: social_club
+    read(7, fmt='(DT)') social_club%membership(0)
+  end subroutine test2
+! CHECK-LABEL:   func.func @_QMpPtest2(
+! CHECK:           %{{.*}} = fir.call @_FortranAioInputDerivedType(%{{.*}}, %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<i8>, !fir.box<none>, !fir.ref<none>) -> i1
+
+  subroutine test3(dtv)
+    interface write(formatted)
+       module procedure pwf
+    end interface write(formatted)
+    class(person), intent(inout) :: dtv
+    write(7, fmt='(DT)') dtv%next
+  end subroutine test3
+! CHECK-LABEL:   func.func @_QMpPtest3(
+! CHECK:           %{{.*}} = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<i8>, !fir.box<none>, !fir.ref<none>) -> i1
+
+  subroutine test4(social_club)
+    interface write(formatted)
+       module procedure pwf
+    end interface write(formatted)
+    class(club) :: social_club
+    write(7, fmt='(DT)') social_club%membership(0)
+  end subroutine test4
+! CHECK-LABEL:   func.func @_QMpPtest4(
+! CHECK:           %{{.*}} = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<i8>, !fir.box<none>, !fir.ref<none>) -> i1
+end module p
+
@@ -766,7 +766,7 @@ subroutine test_polymorphic_io()
 ! CHECK: %[[P:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>> {bindc_name = "p", uniq_name = "_QMpolymorphic_testFtest_polymorphic_ioEp"}
 ! CHECK: %[[LOAD_P:.*]] = fir.load %[[P]] : !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>>
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[LOAD_P]] : (!fir.class<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>) -> !fir.box<none>
-! CHECK: %{{.*}} = fir.call @_FortranAioInputDescriptor(%{{.*}}, %[[BOX_NONE]]) {{.*}} : (!fir.ref<i8>, !fir.box<none>) -> i1
+! CHECK: %{{.*}} = fir.call @_FortranAioInputDerivedType(%{{.*}}, %[[BOX_NONE]], %{{.*}}) {{.*}} : (!fir.ref<i8>, !fir.box<none>, !fir.ref<none>) -> i1
 
   function unlimited_polymorphic_alloc_array_ret()
     class(*), allocatable :: unlimited_polymorphic_alloc_array_ret(:)
 
@@ -75,10 +75,6 @@ E: [email protected]
 E: [email protected]
 D: MCA, llvm-mca
 
-N: Duncan P. N. Exon Smith
-E: [email protected]
-D: Branch weights and BlockFrequencyInfo
-
 N: Hal Finkel
 E: [email protected]
 D: The loop reroller and alias analysis
 
@@ -742,16 +742,16 @@ an optional list of attached :ref:`metadata <metadata>`.
 Variables and aliases can have a
 :ref:`Thread Local Storage Model <tls_model>`.
 
-:ref:`Scalable vectors <t_vector>` cannot be global variables or members of
-arrays because their size is unknown at compile time. They are allowed in
-structs to facilitate intrinsics returning multiple values. Generally, structs
-containing scalable vectors are not considered "sized" and cannot be used in
-loads, stores, allocas, or GEPs. The only exception to this rule is for structs
-that contain scalable vectors of the same type (e.g. ``{<vscale x 2 x i32>,
-<vscale x 2 x i32>}`` contains the same type while ``{<vscale x 2 x i32>,
-<vscale x 2 x i64>}`` doesn't). These kinds of structs (we may call them
-homogeneous scalable vector structs) are considered sized and can be used in
-loads, stores, allocas, but not GEPs.
+Globals cannot be or contain :ref:`Scalable vectors <t_vector>` because their
+size is unknown at compile time. They are allowed in structs to facilitate
+intrinsics returning multiple values. Generally, structs containing scalable
+vectors are not considered "sized" and cannot be used in loads, stores, allocas,
+or GEPs. The only exception to this rule is for structs that contain scalable
+vectors of the same type (e.g. ``{<vscale x 2 x i32>, <vscale x 2 x i32>}``
+contains the same type while ``{<vscale x 2 x i32>, <vscale x 2 x i64>}``
+doesn't). These kinds of structs (we may call them homogeneous scalable vector
+structs) are considered sized and can be used in loads, stores, allocas, but
+not GEPs.
 
 Syntax::
 
 
@@ -16,7 +16,7 @@
 
 /* Indicate that this is LLVM compiled from the amd-gfx branch. */
 #define LLVM_HAVE_BRANCH_AMD_GFX
-#define LLVM_MAIN_REVISION 474717
+#define LLVM_MAIN_REVISION 474729
 
 /* Define if LLVM_ENABLE_DUMP is enabled */
 #cmakedefine LLVM_ENABLE_DUMP
 
@@ -209,8 +209,7 @@ class Type {
   /// Return true if this is a target extension type with a scalable layout.
   bool isScalableTargetExtTy() const;
 
-  /// Return true if this is a scalable vector type or a target extension type
-  /// with a scalable layout.
+  /// Return true if this is a type whose size is a known multiple of vscale.
   bool isScalableTy() const;
 
   /// Return true if this is a FP type or a vector of FP.
 
@@ -4934,7 +4934,7 @@ static Value *simplifyGEPInst(Type *SrcTy, Value *Ptr,
     return UndefValue::get(GEPTy);
 
   bool IsScalableVec =
-      isa<ScalableVectorType>(SrcTy) || any_of(Indices, [](const Value *V) {
+      SrcTy->isScalableTy() || any_of(Indices, [](const Value *V) {
         return isa<ScalableVectorType>(V->getType());
       });
 
 
@@ -127,9 +127,7 @@ bool GEPOperator::accumulateConstantOffset(
   auto end = generic_gep_type_iterator<decltype(Index.end())>::end(Index.end());
   for (auto GTI = begin, GTE = end; GTI != GTE; ++GTI) {
     // Scalable vectors are multiplied by a runtime constant.
-    bool ScalableType = false;
-    if (isa<ScalableVectorType>(GTI.getIndexedType()))
-      ScalableType = true;
+    bool ScalableType = GTI.getIndexedType()->isScalableTy();
 
     Value *V = GTI.getOperand();
     StructType *STy = GTI.getStructTypeOrNull();
@@ -189,7 +187,7 @@ bool GEPOperator::collectOffset(
   for (gep_type_iterator GTI = gep_type_begin(this), GTE = gep_type_end(this);
        GTI != GTE; ++GTI) {
     // Scalable vectors are multiplied by a runtime constant.
-    bool ScalableType = isa<ScalableVectorType>(GTI.getIndexedType());
+    bool ScalableType = GTI.getIndexedType()->isScalableTy();
 
     Value *V = GTI.getOperand();
     StructType *STy = GTI.getStructTypeOrNull();
 
@@ -58,6 +58,8 @@ bool Type::isIntegerTy(unsigned Bitwidth) const {
 }
 
 bool Type::isScalableTy() const {
+  if (const auto *ATy = dyn_cast<ArrayType>(this))
+    return ATy->getElementType()->isScalableTy();
   if (const auto *STy = dyn_cast<StructType>(this)) {
     SmallPtrSet<Type *, 4> Visited;
     return STy->containsScalableVectorType(&Visited);
@@ -658,8 +660,7 @@ ArrayType *ArrayType::get(Type *ElementType, uint64_t NumElements) {
 bool ArrayType::isValidElementType(Type *ElemTy) {
   return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
          !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() &&
-         !ElemTy->isTokenTy() && !ElemTy->isX86_AMXTy() &&
-         !isa<ScalableVectorType>(ElemTy);
+         !ElemTy->isTokenTy() && !ElemTy->isX86_AMXTy();
 }
 
 //===----------------------------------------------------------------------===//
 
@@ -850,17 +850,9 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
   }
 
   // Scalable vectors cannot be global variables, since we don't know
-  // the runtime size. If the global is an array containing scalable vectors,
-  // that will be caught by the isValidElementType methods in StructType or
-  // ArrayType instead.
-  Check(!isa<ScalableVectorType>(GV.getValueType()),
-        "Globals cannot contain scalable vectors", &GV);
-
-  if (auto *STy = dyn_cast<StructType>(GV.getValueType())) {
-    SmallPtrSet<Type *, 4> Visited;
-    Check(!STy->containsScalableVectorType(&Visited),
-          "Globals cannot contain scalable vectors", &GV);
-  }
+  // the runtime size.
+  Check(!GV.getValueType()->isScalableTy(),
+        "Globals cannot contain scalable types", &GV);
 
   // Check if it's a target extension type that disallows being used as a
   // global.
 
@@ -570,6 +570,18 @@ def FeatureD128 : SubtargetFeature<"d128", "HasD128",
     "and Instructions (FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, FEAT_SYSINSTR128)",
     [FeatureLSE128]>;
 
+def FeatureDisableLdp : SubtargetFeature<"disable-ldp", "HasDisableLdp",
+    "true", "Do not emit ldp">;
+
+def FeatureDisableStp : SubtargetFeature<"disable-stp", "HasDisableStp",
+    "true", "Do not emit stp">;
+
+def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedOnly",
+    "true", "In order to emit ldp, first check if the load will be aligned to 2 * element_size">;
+
+def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly",
+    "true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -1239,7 +1251,9 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
                                    FeatureArithmeticBccFusion,
                                    FeatureCmpBccFusion,
                                    FeatureFuseAddress,
-                                   FeatureFuseLiterals]>;
+                                   FeatureFuseLiterals,
+			           FeatureLdpAlignedOnly,
+                                   FeatureStpAlignedOnly]>;
 
 def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
                                     "Ampere Computing Ampere-1A processors", [
@@ -1252,7 +1266,9 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
                                     FeatureCmpBccFusion,
                                     FeatureFuseAddress,
                                     FeatureFuseLiterals,
-                                    FeatureFuseLiterals]>;
+                                    FeatureFuseLiterals,
+                                    FeatureLdpAlignedOnly,
+                                    FeatureStpAlignedOnly]>;
 
 def ProcessorFeatures {
   list<SubtargetFeature> A53  = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
 
@@ -2136,6 +2136,14 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
   if (!TII->isCandidateToMergeOrPair(MI))
     return false;
 
+  // If disable-ldp feature is opted, do not emit ldp.
+  if (MI.mayLoad() && Subtarget->hasDisableLdp())
+    return false;
+
+  // If disable-stp feature is opted, do not emit stp.
+  if (MI.mayStore() && Subtarget->hasDisableStp())
+    return false;
+
   // Early exit if the offset is not possible to match. (6 bits of positive
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
@@ -2159,6 +2167,31 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
     // Keeping the iterator straight is a pain, so we let the merge routine tell
     // us what the next instruction is after it's done mucking about.
     auto Prev = std::prev(MBBI);
+
+    // Fetch the memoperand of the load/store that is a candidate for
+    // combination.
+    MachineMemOperand *MemOp =
+        MI.memoperands_empty() ? nullptr : MI.memoperands().front();
+
+    // Get the needed alignments to check them if
+    // ldp-aligned-only/stp-aligned-only features are opted.
+    uint64_t MemAlignment = MemOp ? MemOp->getAlign().value() : -1;
+    uint64_t TypeAlignment = MemOp ? Align(MemOp->getSize()).value() : -1;
+
+    // If a load arrives and ldp-aligned-only feature is opted, check that the
+    // alignment of the source pointer is at least double the alignment of the
+    // type.
+    if (MI.mayLoad() && Subtarget->hasLdpAlignedOnly() && MemOp &&
+        MemAlignment < 2 * TypeAlignment)
+      return false;
+
+    // If a store arrives and stp-aligned-only feature is opted, check that the
+    // alignment of the source pointer is at least double the alignment of the
+    // type.
+    if (MI.mayStore() && Subtarget->hasStpAlignedOnly() && MemOp &&
+        MemAlignment < 2 * TypeAlignment)
+      return false;
+
     MBBI = mergePairedInsns(MBBI, Paired, Flags);
     // Collect liveness info for instructions between Prev and the new position
     // MBBI.
Original file line number	Diff line number	Diff line change
`@@ -52,7 +52,8 @@ void test_cos_f16(global half* out, half a)`
`52`	`52`	`}`
`53`	`53`
`54`	`54`	`// CHECK-LABEL: @test_ldexp_f16`
`55`		`-// CHECK: call half @llvm.ldexp.f16.i32`
	`55`	`+// CHECK: [[TRUNC:%[0-9a-z]+]] = trunc i32`
	`56`	`+// CHECK: call half @llvm.ldexp.f16.i16(half %a, i16 [[TRUNC]])`
`56`	`57`	`void test_ldexp_f16(global half* out, half a, int b)`
`57`	`58`	`{`
`58`	`59`	`*out = __builtin_amdgcn_ldexph(a, b);`