[Matrix] Emit assumption that matrix indices are valid.

fhahn · fhahn · commit ea21d688dc0a · 2021-09-22T12:27:37.000+01:00
The matrix extension requires the indices for matrix subscript expression to be valid and it is UB otherwise. extract/insertelement produce poison if the index is invalid, which limits the optimizer to not be bale to scalarize load/extract pairs for example, which causes very suboptimal code to be generated when using matrix subscript expressions with variable indices for large matrixes. This patch updates IRGen to emit assumes to for index expression to convey the information that the index must be valid. This also adjusts the order in which operations are emitted slightly, so indices & assumes are added before the load of the matrix value. Reviewed By: erichkeane Differential Revision: https://reviews.llvm.org/D102478
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
@@ -35,6 +35,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/MatrixBuilder.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Path.h"
@@ -1939,10 +1940,15 @@ RValue CodeGenFunction::EmitLoadOfLValue(LValue LV, SourceLocation Loc) {
     return EmitLoadOfGlobalRegLValue(LV);
 
   if (LV.isMatrixElt()) {
+    llvm::Value *Idx = LV.getMatrixIdx();
+    if (CGM.getCodeGenOpts().OptimizationLevel > 0) {
+      const auto *const MatTy = LV.getType()->getAs<ConstantMatrixType>();
+      llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+      MB.CreateIndexAssumption(Idx, MatTy->getNumElementsFlattened());
+    }
     llvm::LoadInst *Load =
         Builder.CreateLoad(LV.getMatrixAddress(), LV.isVolatileQualified());
-    return RValue::get(
-        Builder.CreateExtractElement(Load, LV.getMatrixIdx(), "matrixext"));
+    return RValue::get(Builder.CreateExtractElement(Load, Idx, "matrixext"));
   }
 
   assert(LV.isBitField() && "Unknown LValue type!");
@@ -2080,9 +2086,15 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, LValue Dst,
       return EmitStoreThroughGlobalRegLValue(Src, Dst);
 
     if (Dst.isMatrixElt()) {
-      llvm::Value *Vec = Builder.CreateLoad(Dst.getMatrixAddress());
-      Vec = Builder.CreateInsertElement(Vec, Src.getScalarVal(),
-                                        Dst.getMatrixIdx(), "matins");
+      llvm::Value *Idx = Dst.getMatrixIdx();
+      if (CGM.getCodeGenOpts().OptimizationLevel > 0) {
+        const auto *const MatTy = Dst.getType()->getAs<ConstantMatrixType>();
+        llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+        MB.CreateIndexAssumption(Idx, MatTy->getNumElementsFlattened());
+      }
+      llvm::Instruction *Load = Builder.CreateLoad(Dst.getMatrixAddress());
+      llvm::Value *Vec =
+          Builder.CreateInsertElement(Load, Src.getScalarVal(), Idx, "matins");
       Builder.CreateStore(Vec, Dst.getMatrixAddress(),
                           Dst.isVolatileQualified());
       return;
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -1775,13 +1775,18 @@ Value *ScalarExprEmitter::VisitMatrixSubscriptExpr(MatrixSubscriptExpr *E) {
   // integer value.
   Value *RowIdx = Visit(E->getRowIdx());
   Value *ColumnIdx = Visit(E->getColumnIdx());
+
+  const auto *MatrixTy = E->getBase()->getType()->castAs<ConstantMatrixType>();
+  unsigned NumRows = MatrixTy->getNumRows();
+  llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+  Value *Idx = MB.CreateIndex(RowIdx, ColumnIdx, NumRows);
+  if (CGF.CGM.getCodeGenOpts().OptimizationLevel > 0)
+    MB.CreateIndexAssumption(Idx, MatrixTy->getNumElementsFlattened());
+
   Value *Matrix = Visit(E->getBase());
 
   // TODO: Should we emit bounds checks with SanitizerKind::ArrayBounds?
-  llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
-  return MB.CreateExtractElement(
-      Matrix, RowIdx, ColumnIdx,
-      E->getBase()->getType()->castAs<ConstantMatrixType>()->getNumRows());
+  return Builder.CreateExtractElement(Matrix, Idx, "matrixext");
 }
 
 static int getMaskElt(llvm::ShuffleVectorInst *SVI, unsigned Idx,
diff --git a/clang/test/CodeGen/matrix-type-operators.c b/clang/test/CodeGen/matrix-type-operators.c
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -O0 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=CHECK %s
+// RUN: %clang_cc1 -O1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=CHECK,OPT %s
 
 typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
 typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
@@ -506,7 +507,7 @@ void multiply_matrix_matrix_double(dx5x5_t b, dx5x5_t c) {
   // CHECK-NEXT:    [[RES:%.*]] = call <25 x double> @llvm.matrix.multiply.v25f64.v25f64.v25f64(<25 x double> [[B]], <25 x double> [[C]], i32 5, i32 5, i32 5)
   // CHECK-NEXT:    [[A_ADDR:%.*]] = bitcast [25 x double]* %a to <25 x double>*
   // CHECK-NEXT:    store <25 x double> [[RES]], <25 x double>* [[A_ADDR]], align 8
-  // CHECK-NEXT:    ret void
+  // CHECK:         ret void
   //
 
   dx5x5_t a;
@@ -531,7 +532,7 @@ typedef int ix9x9_t __attribute__((matrix_type(9, 9)));
 // CHECK-NEXT:    [[RES:%.*]] = call <81 x i32> @llvm.matrix.multiply.v81i32.v27i32.v27i32(<27 x i32> [[B]], <27 x i32> [[C]], i32 9, i32 3, i32 9)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = bitcast [81 x i32]* %a to <81 x i32>*
 // CHECK-NEXT:    store <81 x i32> [[RES]], <81 x i32>* [[A_ADDR]], align 4
-// CHECK-NEXT:    ret void
+// CHECK:         ret void
 //
 void multiply_matrix_matrix_int(ix9x3_t b, ix3x9_t c) {
   ix9x9_t a;
@@ -874,6 +875,8 @@ void insert_float_matrix_idx_i_u_float(fx2x3_t b, float e, int j, unsigned k) {
   // CHECK-NEXT:    [[K_EXT:%.*]] = zext i32 [[K]] to i64
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[K_EXT]], 2
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_ADDR:%.*]], align 4
   // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]]
   // CHECK-NEXT:    store <6 x float> [[MATINS]], <6 x float>* [[MAT_ADDR]], align 4
@@ -890,6 +893,8 @@ void insert_float_matrix_idx_s_ull_float(fx2x3_t b, float e, short j, unsigned l
   // CHECK-NEXT:    [[K:%.*]] = load i64, i64* %k.addr, align 8
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[K]], 2
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_ADDR:%.*]], align 4
   // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]]
   // CHECK-NEXT:    store <6 x float> [[MATINS]], <6 x float>* [[MAT_ADDR]], align 4
@@ -907,6 +912,8 @@ void insert_int_idx_expr(ix9x3_t a, int i) {
   // CHECK-NEXT:    [[I2_ADD:%.*]] = add nsw i32 4, [[I2]]
   // CHECK-NEXT:    [[ADD_EXT:%.*]] = sext i32 [[I2_ADD]] to i64
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 18, [[ADD_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 27
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT:%.*]] = load <27 x i32>, <27 x i32>* [[MAT_ADDR:%.*]], align 4
   // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <27 x i32> [[MAT]], i32 [[I1]], i64 [[IDX2]]
   // CHECK-NEXT:    store <27 x i32> [[MATINS]], <27 x i32>* [[MAT_ADDR]], align 4
@@ -980,9 +987,11 @@ int extract_int(ix9x3_t c, unsigned long j) {
   // CHECK-LABEL: @extract_int(
   // CHECK:         [[J1:%.*]] = load i64, i64* %j.addr, align 8
   // CHECK-NEXT:    [[J2:%.*]] = load i64, i64* %j.addr, align 8
-  // CHECK-NEXT:    [[MAT:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J2]], 9
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[J1]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 27
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[MAT:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
   // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <27 x i32> [[MAT]], i64 [[IDX2]]
   // CHECK-NEXT:    ret i32 [[MATEXT]]
 
@@ -995,13 +1004,15 @@ double test_extract_matrix_pointer1(dx3x2_t **ptr, unsigned j) {
   // CHECK-LABEL: @test_extract_matrix_pointer1(
   // CHECK:         [[J:%.*]] = load i32, i32* %j.addr, align 4
   // CHECK-NEXT:    [[J_EXT:%.*]] = zext i32 [[J]] to i64
+  // CHECK-NEXT:    [[IDX:%.*]] = add i64 3, [[J_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[PTR:%.*]] = load [6 x double]**, [6 x double]*** %ptr.addr, align 8
   // CHECK-NEXT:    [[PTR_IDX:%.*]] = getelementptr inbounds [6 x double]*, [6 x double]** [[PTR]], i64 1
   // CHECK-NEXT:    [[PTR2:%.*]] = load [6 x double]*, [6 x double]** [[PTR_IDX]], align 8
   // CHECK-NEXT:    [[PTR2_IDX:%.*]] = getelementptr inbounds [6 x double], [6 x double]* [[PTR2]], i64 2
   // CHECK-NEXT:    [[MAT_ADDR:%.*]] = bitcast [6 x double]* [[PTR2_IDX]] to <6 x double>*
   // CHECK-NEXT:    [[MAT:%.*]] = load <6 x double>, <6 x double>* [[MAT_ADDR]], align 8
-  // CHECK-NEXT:    [[IDX:%.*]] = add i64 3, [[J_EXT]]
   // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <6 x double> [[MAT]], i64 [[IDX]]
   // CHECK-NEXT:    ret double [[MATEXT]]
 
@@ -1027,13 +1038,17 @@ void insert_extract(dx5x5_t a, fx3x3_t b, unsigned long j, short k) {
   // CHECK-LABEL: @insert_extract(
   // CHECK:         [[K:%.*]] = load i16, i16* %k.addr, align 2
   // CHECK-NEXT:    [[K_EXT:%.*]] = sext i16 [[K]] to i64
-  // CHECK-NEXT:    [[MAT:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR:%.*]], align 4
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[K_EXT]], 3
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], 0
-  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 9
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[MAT:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR:%.*]], align 4
+  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX2]]
   // CHECK-NEXT:    [[J:%.*]] = load i64, i64* %j.addr, align 8
   // CHECK-NEXT:    [[IDX3:%.*]] = mul i64 [[J]], 3
   // CHECK-NEXT:    [[IDX4:%.*]] = add i64 [[IDX3]], 2
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX4]], 9
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT2:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR]], align 4
   // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <9 x float> [[MAT2]], float [[MATEXT]], i64 [[IDX4]]
   // CHECK-NEXT:    store <9 x float> [[MATINS]], <9 x float>* [[MAT_ADDR]], align 4
@@ -1068,9 +1083,13 @@ void insert_compound_stmt_field(struct Foo *a, float f, unsigned i, unsigned j)
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_EXT]], 2
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
   // CHECK-NEXT:    [[MAT_PTR:%.*]] = bitcast [6 x float]* %mat to <6 x float>*
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_PTR]], align 4
   // CHECK-NEXT:    [[EXT:%.*]] = extractelement <6 x float> [[MAT]], i64 [[IDX2]]
   // CHECK-NEXT:    [[SUM:%.*]] = fadd float [[EXT]], {{.*}}
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT2:%.*]] = load <6 x float>, <6 x float>* [[MAT_PTR]], align 4
   // CHECK-NEXT:    [[INS:%.*]] = insertelement <6 x float> [[MAT2]], float [[SUM]], i64 [[IDX2]]
   // CHECK-NEXT:    store <6 x float> [[INS]], <6 x float>* [[MAT_PTR]], align 4
@@ -1085,23 +1104,29 @@ void matrix_as_idx(ix9x3_t a, int i, int j, dx5x5_t b) {
   // CHECK-NEXT:  [[I1_EXT:%.*]] = sext i32 [[I1]] to i64
   // CHECK-NEXT:  [[J1:%.*]] = load i32, i32* %j.addr, align 4
   // CHECK-NEXT:  [[J1_EXT:%.*]] = sext i32 [[J1]] to i64
-  // CHECK-NEXT:  [[A:%.*]] = load <27 x i32>, <27 x i32>* %0, align 4
   // CHECK-NEXT:  [[IDX1_1:%.*]] = mul i64 [[J1_EXT]], 9
   // CHECK-NEXT:  [[IDX1_2:%.*]] = add i64 [[IDX1_1]], [[I1_EXT]]
+  // OPT-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX1_2]], 27
+  // OPT-NEXT:    call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:  [[A:%.*]] = load <27 x i32>, <27 x i32>* %0, align 4
   // CHECK-NEXT:  [[MI1:%.*]] = extractelement <27 x i32> [[A]], i64 [[IDX1_2]]
   // CHECK-NEXT:  [[MI1_EXT:%.*]] = sext i32 [[MI1]] to i64
   // CHECK-NEXT:  [[J2:%.*]] = load i32, i32* %j.addr, align 4
   // CHECK-NEXT:  [[J2_EXT:%.*]] = sext i32 [[J2]] to i64
   // CHECK-NEXT:  [[I2:%.*]] = load i32, i32* %i.addr, align 4
   // CHECK-NEXT:  [[I2_EXT:%.*]] = sext i32 [[I2]] to i64
-  // CHECK-NEXT:  [[A2:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
   // CHECK-NEXT:  [[IDX2_1:%.*]] = mul i64 [[I2_EXT]], 9
   // CHECK-NEXT:  [[IDX2_2:%.*]] = add i64 [[IDX2_1]], [[J2_EXT]]
+  // OPT-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX2_2]], 27
+  // OPT-NEXT:    call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:  [[A2:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
   // CHECK-NEXT:  [[MI2:%.*]] = extractelement <27 x i32> [[A2]], i64 [[IDX2_2]]
   // CHECK-NEXT:  [[MI3:%.*]] = add nsw i32 [[MI2]], 2
   // CHECK-NEXT:  [[MI3_EXT:%.*]] = sext i32 [[MI3]] to i64
   // CHECK-NEXT:  [[IDX3_1:%.*]] = mul i64 [[MI3_EXT]], 5
   // CHECK-NEXT:  [[IDX3_2:%.*]] = add i64 [[IDX3_1]], [[MI1_EXT]]
+  // OPT-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX3_2]], 25
+  // OPT-NEXT:    call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:  [[B:%.*]] = load <25 x double>, <25 x double>* [[B_PTR:%.*]], align 8
   // CHECK-NEXT:  [[INS:%.*]] = insertelement <25 x double> [[B]], double 1.500000e+00, i64 [[IDX3_2]]
   // CHECK-NEXT:  store <25 x double> [[INS]], <25 x double>* [[B_PTR]], align 8
diff --git a/clang/test/CodeGenCXX/matrix-type-operators.cpp b/clang/test/CodeGenCXX/matrix-type-operators.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s
+// RUN: %clang_cc1 -O0 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s
+// RUN: %clang_cc1 -O1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck --check-prefixes=CHECK,OPT %s
 
 typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
 using fx2x3_t = float __attribute__((matrix_type(2, 3)));
@@ -94,7 +95,7 @@ struct DoubleWrapper2 {
 
 void test_DoubleWrapper2_Add1(MyMatrix<double, 10, 9> &m) {
   // CHECK-LABEL: define{{.*}} void @_Z24test_DoubleWrapper2_Add1R8MyMatrixIdLj10ELj9EE(
-  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* %1, align 8
+  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.+}}, align 8
   // CHECK:       [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper2cvdEv(%struct.DoubleWrapper2* {{[^,]*}} %w2)
   // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR]], i32 0
   // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> poison, <90 x i32> zeroinitializer
@@ -109,7 +110,7 @@ void test_DoubleWrapper2_Add1(MyMatrix<double, 10, 9> &m) {
 void test_DoubleWrapper2_Add2(MyMatrix<double, 10, 9> &m) {
   // CHECK-LABEL: define{{.*}} void @_Z24test_DoubleWrapper2_Add2R8MyMatrixIdLj10ELj9EE(
   // CHECK:       [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper2cvdEv(%struct.DoubleWrapper2* {{[^,]*}} %w2)
-  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* %1, align 8
+  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.*}}, align 8
   // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR]], i32 0
   // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> poison, <90 x i32> zeroinitializer
   // CHECK-NEXT:  [[RES:%.*]] = fadd <90 x double> [[SCALAR_EMBED1]], [[MATRIX]]
@@ -219,6 +220,8 @@ void test_insert_template1(MyMatrix<unsigned, 2, 2> &Mat, unsigned e, unsigned i
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_EXT]], 2
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
   // CHECK-NEXT:    [[MAT_ADDR:%.*]] = bitcast [4 x i32]* {{.*}} to <4 x i32>*
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 4
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT:%.*]] = load <4 x i32>, <4 x i32>* [[MAT_ADDR]], align 4
   // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <4 x i32> [[MAT]], i32 [[E]], i64 [[IDX2]]
   // CHECK-NEXT:    store <4 x i32> [[MATINS]], <4 x i32>* [[MAT_ADDR]], align 4
@@ -243,6 +246,8 @@ void test_insert_template2(MyMatrix<float, 3, 8> &Mat, float e) {
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_EXT]], 3
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
   // CHECK-NEXT:    [[MAT_ADDR:%.*]] = bitcast [24 x float]* {{.*}} to <24 x float>*
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 24
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT:%.*]] = load <24 x float>, <24 x float>* [[MAT_ADDR]], align 4
   // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <24 x float> [[MAT]], float [[E]], i64 [[IDX2]]
   // CHECK-NEXT:    store <24 x float> [[MATINS]], <24 x float>* [[MAT_ADDR]], align 4
@@ -292,10 +297,10 @@ const double &test_matrix_subscript_reference(const double4x4 m) {
   // CHECK-NEXT:    [[REF_TMP:%.*]] = alloca double, align 8
   // CHECK-NEXT:    [[NAMELESS0:%.*]] = bitcast [16 x double]* [[M_ADDR]] to <16 x double>*
   // CHECK-NEXT:    store <16 x double> [[M:%.*]], <16 x double>* [[NAMELESS0]], align 8
-  // CHECK-NEXT:    [[NAMELESS1:%.*]] = load <16 x double>, <16 x double>* [[NAMELESS0]], align 8
+  // CHECK:         [[NAMELESS1:%.*]] = load <16 x double>, <16 x double>* [[NAMELESS0]], align 8
   // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <16 x double> [[NAMELESS1]], i64 4
   // CHECK-NEXT:    store double [[MATEXT]], double* [[REF_TMP]], align 8
-  // CHECK-NEXT:    ret double* [[REF_TMP]]
+  // CHECK:         ret double* [[REF_TMP]]
 
   return m[0][1];
 }
@@ -315,11 +320,13 @@ double extract_IntWrapper_idx(double4x4 &m, IntWrapper i, UnsignedWrapper j) {
   // CHECK-NEXT:    [[J:%.*]] = call i32 @_ZN15UnsignedWrappercvjEv(%struct.UnsignedWrapper* {{[^,]*}} %j)
   // CHECK-NEXT:    [[J_SUB:%.*]] = sub i32 [[J]], 1
   // CHECK-NEXT:    [[J_SUB_EXT:%.*]] = zext i32 [[J_SUB]] to i64
+  // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4
+  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT_ADDR:%.*]] = load [16 x double]*, [16 x double]** %m.addr, align 8
   // CHECK-NEXT:    [[MAT_ADDR2:%.*]] = bitcast [16 x double]* [[MAT_ADDR]] to <16 x double>*
   // CHECK-NEXT:    [[MAT:%.*]] = load <16 x double>, <16 x double>* [[MAT_ADDR2]], align 8
-  // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4
-  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]]
   // CHECK-NEXT:    [[MATEXT:%.*]]  = extractelement <16 x double> [[MAT]], i64 [[IDX2]]
   // CHECK-NEXT:    ret double [[MATEXT]]
   return m[i + 1][j - 1];
@@ -358,6 +365,8 @@ void test_constexpr1(matrix_type<float, 4, 4> &m) {
   // CHECK-NEXT:   [[IDX1:%.*]] = mul i64 [[I2_EXT]], 4
   // CHECK-NEXT:   [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
   // CHECK-NEXT:   [[MAT_ADDR:%.*]] = bitcast [16 x float]* %result to <16 x float>*
+  // OPT-NEXT:     [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
+  // OPT-NEXT:     call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:   [[MAT:%.*]] = load <16 x float>, <16 x float>* [[MAT_ADDR]], align 4
   // CHECK-NEXT:   [[MATINS:%.*]] = insertelement <16 x float> [[MAT]], float 1.000000e+00, i64 [[IDX2]]
   // CHECK-NEXT:   store <16 x float> [[MATINS]], <16 x float>* [[MAT_ADDR]], align 4
@@ -386,6 +395,8 @@ void test_constexpr2(matrix_type<int, 5, 5> &m) {
   // CHECK-NEXT:   [[IDX1:%.*]] = mul i64 [[I2_EXT]], 5
   // CHECK-NEXT:   [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
   // CHECK-NEXT:   [[MAT_ADDR:%.*]] = bitcast [25 x i32]* %result to <25 x i32>*
+  // OPT-NEXT:     [[CMP:%.*]] = icmp ult i64 [[IDX2]], 25
+  // OPT-NEXT:     call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:   [[MAT:%.*]] = load <25 x i32>, <25 x i32>* [[MAT_ADDR]], align 4
   // CHECK-NEXT:   [[MATINS:%.*]] = insertelement <25 x i32> [[MAT]], i32 1, i64 [[IDX2]]
   // CHECK-NEXT:   store <25 x i32> [[MATINS]], <25 x i32>* [[MAT_ADDR]], align 4
diff --git a/clang/test/CodeGenObjC/matrix-type-operators.m b/clang/test/CodeGenObjC/matrix-type-operators.m
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h