Skip to content

Commit ea21d68

Browse files
committed
[Matrix] Emit assumption that matrix indices are valid.
The matrix extension requires the indices of a matrix subscript expression to be valid; it is UB otherwise. extract/insertelement produce poison if the index is invalid, which prevents the optimizer from, for example, scalarizing load/extract pairs. This causes very suboptimal code to be generated when using matrix subscript expressions with variable indices for large matrices. This patch updates IRGen to emit assumes for index expressions to convey the information that the index must be valid. This also slightly adjusts the order in which operations are emitted, so that indices & assumes are added before the load of the matrix value. Reviewed By: erichkeane Differential Revision: https://reviews.llvm.org/D102478
1 parent 9f34f75 commit ea21d68

File tree

6 files changed

+99
-34
lines changed

6 files changed

+99
-34
lines changed

clang/lib/CodeGen/CGExpr.cpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "llvm/IR/Intrinsics.h"
3636
#include "llvm/IR/LLVMContext.h"
3737
#include "llvm/IR/MDBuilder.h"
38+
#include "llvm/IR/MatrixBuilder.h"
3839
#include "llvm/Support/ConvertUTF.h"
3940
#include "llvm/Support/MathExtras.h"
4041
#include "llvm/Support/Path.h"
@@ -1939,10 +1940,15 @@ RValue CodeGenFunction::EmitLoadOfLValue(LValue LV, SourceLocation Loc) {
19391940
return EmitLoadOfGlobalRegLValue(LV);
19401941

19411942
if (LV.isMatrixElt()) {
1943+
llvm::Value *Idx = LV.getMatrixIdx();
1944+
if (CGM.getCodeGenOpts().OptimizationLevel > 0) {
1945+
const auto *const MatTy = LV.getType()->getAs<ConstantMatrixType>();
1946+
llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
1947+
MB.CreateIndexAssumption(Idx, MatTy->getNumElementsFlattened());
1948+
}
19421949
llvm::LoadInst *Load =
19431950
Builder.CreateLoad(LV.getMatrixAddress(), LV.isVolatileQualified());
1944-
return RValue::get(
1945-
Builder.CreateExtractElement(Load, LV.getMatrixIdx(), "matrixext"));
1951+
return RValue::get(Builder.CreateExtractElement(Load, Idx, "matrixext"));
19461952
}
19471953

19481954
assert(LV.isBitField() && "Unknown LValue type!");
@@ -2080,9 +2086,15 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, LValue Dst,
20802086
return EmitStoreThroughGlobalRegLValue(Src, Dst);
20812087

20822088
if (Dst.isMatrixElt()) {
2083-
llvm::Value *Vec = Builder.CreateLoad(Dst.getMatrixAddress());
2084-
Vec = Builder.CreateInsertElement(Vec, Src.getScalarVal(),
2085-
Dst.getMatrixIdx(), "matins");
2089+
llvm::Value *Idx = Dst.getMatrixIdx();
2090+
if (CGM.getCodeGenOpts().OptimizationLevel > 0) {
2091+
const auto *const MatTy = Dst.getType()->getAs<ConstantMatrixType>();
2092+
llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
2093+
MB.CreateIndexAssumption(Idx, MatTy->getNumElementsFlattened());
2094+
}
2095+
llvm::Instruction *Load = Builder.CreateLoad(Dst.getMatrixAddress());
2096+
llvm::Value *Vec =
2097+
Builder.CreateInsertElement(Load, Src.getScalarVal(), Idx, "matins");
20862098
Builder.CreateStore(Vec, Dst.getMatrixAddress(),
20872099
Dst.isVolatileQualified());
20882100
return;

clang/lib/CodeGen/CGExprScalar.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1775,13 +1775,18 @@ Value *ScalarExprEmitter::VisitMatrixSubscriptExpr(MatrixSubscriptExpr *E) {
17751775
// integer value.
17761776
Value *RowIdx = Visit(E->getRowIdx());
17771777
Value *ColumnIdx = Visit(E->getColumnIdx());
1778+
1779+
const auto *MatrixTy = E->getBase()->getType()->castAs<ConstantMatrixType>();
1780+
unsigned NumRows = MatrixTy->getNumRows();
1781+
llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
1782+
Value *Idx = MB.CreateIndex(RowIdx, ColumnIdx, NumRows);
1783+
if (CGF.CGM.getCodeGenOpts().OptimizationLevel > 0)
1784+
MB.CreateIndexAssumption(Idx, MatrixTy->getNumElementsFlattened());
1785+
17781786
Value *Matrix = Visit(E->getBase());
17791787

17801788
// TODO: Should we emit bounds checks with SanitizerKind::ArrayBounds?
1781-
llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
1782-
return MB.CreateExtractElement(
1783-
Matrix, RowIdx, ColumnIdx,
1784-
E->getBase()->getType()->castAs<ConstantMatrixType>()->getNumRows());
1789+
return Builder.CreateExtractElement(Matrix, Idx, "matrixext");
17851790
}
17861791

17871792
static int getMaskElt(llvm::ShuffleVectorInst *SVI, unsigned Idx,

clang/test/CodeGen/matrix-type-operators.c

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
1+
// RUN: %clang_cc1 -O0 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=CHECK %s
2+
// RUN: %clang_cc1 -O1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=CHECK,OPT %s
23

34
typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
45
typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
@@ -506,7 +507,7 @@ void multiply_matrix_matrix_double(dx5x5_t b, dx5x5_t c) {
506507
// CHECK-NEXT: [[RES:%.*]] = call <25 x double> @llvm.matrix.multiply.v25f64.v25f64.v25f64(<25 x double> [[B]], <25 x double> [[C]], i32 5, i32 5, i32 5)
507508
// CHECK-NEXT: [[A_ADDR:%.*]] = bitcast [25 x double]* %a to <25 x double>*
508509
// CHECK-NEXT: store <25 x double> [[RES]], <25 x double>* [[A_ADDR]], align 8
509-
// CHECK-NEXT: ret void
510+
// CHECK: ret void
510511
//
511512

512513
dx5x5_t a;
@@ -531,7 +532,7 @@ typedef int ix9x9_t __attribute__((matrix_type(9, 9)));
531532
// CHECK-NEXT: [[RES:%.*]] = call <81 x i32> @llvm.matrix.multiply.v81i32.v27i32.v27i32(<27 x i32> [[B]], <27 x i32> [[C]], i32 9, i32 3, i32 9)
532533
// CHECK-NEXT: [[A_ADDR:%.*]] = bitcast [81 x i32]* %a to <81 x i32>*
533534
// CHECK-NEXT: store <81 x i32> [[RES]], <81 x i32>* [[A_ADDR]], align 4
534-
// CHECK-NEXT: ret void
535+
// CHECK: ret void
535536
//
536537
void multiply_matrix_matrix_int(ix9x3_t b, ix3x9_t c) {
537538
ix9x9_t a;
@@ -874,6 +875,8 @@ void insert_float_matrix_idx_i_u_float(fx2x3_t b, float e, int j, unsigned k) {
874875
// CHECK-NEXT: [[K_EXT:%.*]] = zext i32 [[K]] to i64
875876
// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[K_EXT]], 2
876877
// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]]
878+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
879+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
877880
// CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_ADDR:%.*]], align 4
878881
// CHECK-NEXT: [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]]
879882
// CHECK-NEXT: store <6 x float> [[MATINS]], <6 x float>* [[MAT_ADDR]], align 4
@@ -890,6 +893,8 @@ void insert_float_matrix_idx_s_ull_float(fx2x3_t b, float e, short j, unsigned l
890893
// CHECK-NEXT: [[K:%.*]] = load i64, i64* %k.addr, align 8
891894
// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[K]], 2
892895
// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]]
896+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
897+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
893898
// CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_ADDR:%.*]], align 4
894899
// CHECK-NEXT: [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]]
895900
// CHECK-NEXT: store <6 x float> [[MATINS]], <6 x float>* [[MAT_ADDR]], align 4
@@ -907,6 +912,8 @@ void insert_int_idx_expr(ix9x3_t a, int i) {
907912
// CHECK-NEXT: [[I2_ADD:%.*]] = add nsw i32 4, [[I2]]
908913
// CHECK-NEXT: [[ADD_EXT:%.*]] = sext i32 [[I2_ADD]] to i64
909914
// CHECK-NEXT: [[IDX2:%.*]] = add i64 18, [[ADD_EXT]]
915+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 27
916+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
910917
// CHECK-NEXT: [[MAT:%.*]] = load <27 x i32>, <27 x i32>* [[MAT_ADDR:%.*]], align 4
911918
// CHECK-NEXT: [[MATINS:%.*]] = insertelement <27 x i32> [[MAT]], i32 [[I1]], i64 [[IDX2]]
912919
// CHECK-NEXT: store <27 x i32> [[MATINS]], <27 x i32>* [[MAT_ADDR]], align 4
@@ -980,9 +987,11 @@ int extract_int(ix9x3_t c, unsigned long j) {
980987
// CHECK-LABEL: @extract_int(
981988
// CHECK: [[J1:%.*]] = load i64, i64* %j.addr, align 8
982989
// CHECK-NEXT: [[J2:%.*]] = load i64, i64* %j.addr, align 8
983-
// CHECK-NEXT: [[MAT:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
984990
// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J2]], 9
985991
// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[J1]]
992+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 27
993+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
994+
// CHECK-NEXT: [[MAT:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
986995
// CHECK-NEXT: [[MATEXT:%.*]] = extractelement <27 x i32> [[MAT]], i64 [[IDX2]]
987996
// CHECK-NEXT: ret i32 [[MATEXT]]
988997

@@ -995,13 +1004,15 @@ double test_extract_matrix_pointer1(dx3x2_t **ptr, unsigned j) {
9951004
// CHECK-LABEL: @test_extract_matrix_pointer1(
9961005
// CHECK: [[J:%.*]] = load i32, i32* %j.addr, align 4
9971006
// CHECK-NEXT: [[J_EXT:%.*]] = zext i32 [[J]] to i64
1007+
// CHECK-NEXT: [[IDX:%.*]] = add i64 3, [[J_EXT]]
1008+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 6
1009+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
9981010
// CHECK-NEXT: [[PTR:%.*]] = load [6 x double]**, [6 x double]*** %ptr.addr, align 8
9991011
// CHECK-NEXT: [[PTR_IDX:%.*]] = getelementptr inbounds [6 x double]*, [6 x double]** [[PTR]], i64 1
10001012
// CHECK-NEXT: [[PTR2:%.*]] = load [6 x double]*, [6 x double]** [[PTR_IDX]], align 8
10011013
// CHECK-NEXT: [[PTR2_IDX:%.*]] = getelementptr inbounds [6 x double], [6 x double]* [[PTR2]], i64 2
10021014
// CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [6 x double]* [[PTR2_IDX]] to <6 x double>*
10031015
// CHECK-NEXT: [[MAT:%.*]] = load <6 x double>, <6 x double>* [[MAT_ADDR]], align 8
1004-
// CHECK-NEXT: [[IDX:%.*]] = add i64 3, [[J_EXT]]
10051016
// CHECK-NEXT: [[MATEXT:%.*]] = extractelement <6 x double> [[MAT]], i64 [[IDX]]
10061017
// CHECK-NEXT: ret double [[MATEXT]]
10071018

@@ -1027,13 +1038,17 @@ void insert_extract(dx5x5_t a, fx3x3_t b, unsigned long j, short k) {
10271038
// CHECK-LABEL: @insert_extract(
10281039
// CHECK: [[K:%.*]] = load i16, i16* %k.addr, align 2
10291040
// CHECK-NEXT: [[K_EXT:%.*]] = sext i16 [[K]] to i64
1030-
// CHECK-NEXT: [[MAT:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR:%.*]], align 4
10311041
// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[K_EXT]], 3
10321042
// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], 0
1033-
// CHECK-NEXT: [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX]]
1043+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 9
1044+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
1045+
// CHECK-NEXT: [[MAT:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR:%.*]], align 4
1046+
// CHECK-NEXT: [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX2]]
10341047
// CHECK-NEXT: [[J:%.*]] = load i64, i64* %j.addr, align 8
10351048
// CHECK-NEXT: [[IDX3:%.*]] = mul i64 [[J]], 3
10361049
// CHECK-NEXT: [[IDX4:%.*]] = add i64 [[IDX3]], 2
1050+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX4]], 9
1051+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
10371052
// CHECK-NEXT: [[MAT2:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR]], align 4
10381053
// CHECK-NEXT: [[MATINS:%.*]] = insertelement <9 x float> [[MAT2]], float [[MATEXT]], i64 [[IDX4]]
10391054
// CHECK-NEXT: store <9 x float> [[MATINS]], <9 x float>* [[MAT_ADDR]], align 4
@@ -1068,9 +1083,13 @@ void insert_compound_stmt_field(struct Foo *a, float f, unsigned i, unsigned j)
10681083
// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_EXT]], 2
10691084
// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
10701085
// CHECK-NEXT: [[MAT_PTR:%.*]] = bitcast [6 x float]* %mat to <6 x float>*
1086+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
1087+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
10711088
// CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_PTR]], align 4
10721089
// CHECK-NEXT: [[EXT:%.*]] = extractelement <6 x float> [[MAT]], i64 [[IDX2]]
10731090
// CHECK-NEXT: [[SUM:%.*]] = fadd float [[EXT]], {{.*}}
1091+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
1092+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
10741093
// CHECK-NEXT: [[MAT2:%.*]] = load <6 x float>, <6 x float>* [[MAT_PTR]], align 4
10751094
// CHECK-NEXT: [[INS:%.*]] = insertelement <6 x float> [[MAT2]], float [[SUM]], i64 [[IDX2]]
10761095
// CHECK-NEXT: store <6 x float> [[INS]], <6 x float>* [[MAT_PTR]], align 4
@@ -1085,23 +1104,29 @@ void matrix_as_idx(ix9x3_t a, int i, int j, dx5x5_t b) {
10851104
// CHECK-NEXT: [[I1_EXT:%.*]] = sext i32 [[I1]] to i64
10861105
// CHECK-NEXT: [[J1:%.*]] = load i32, i32* %j.addr, align 4
10871106
// CHECK-NEXT: [[J1_EXT:%.*]] = sext i32 [[J1]] to i64
1088-
// CHECK-NEXT: [[A:%.*]] = load <27 x i32>, <27 x i32>* %0, align 4
10891107
// CHECK-NEXT: [[IDX1_1:%.*]] = mul i64 [[J1_EXT]], 9
10901108
// CHECK-NEXT: [[IDX1_2:%.*]] = add i64 [[IDX1_1]], [[I1_EXT]]
1109+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX1_2]], 27
1110+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
1111+
// CHECK-NEXT: [[A:%.*]] = load <27 x i32>, <27 x i32>* %0, align 4
10911112
// CHECK-NEXT: [[MI1:%.*]] = extractelement <27 x i32> [[A]], i64 [[IDX1_2]]
10921113
// CHECK-NEXT: [[MI1_EXT:%.*]] = sext i32 [[MI1]] to i64
10931114
// CHECK-NEXT: [[J2:%.*]] = load i32, i32* %j.addr, align 4
10941115
// CHECK-NEXT: [[J2_EXT:%.*]] = sext i32 [[J2]] to i64
10951116
// CHECK-NEXT: [[I2:%.*]] = load i32, i32* %i.addr, align 4
10961117
// CHECK-NEXT: [[I2_EXT:%.*]] = sext i32 [[I2]] to i64
1097-
// CHECK-NEXT: [[A2:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
10981118
// CHECK-NEXT: [[IDX2_1:%.*]] = mul i64 [[I2_EXT]], 9
10991119
// CHECK-NEXT: [[IDX2_2:%.*]] = add i64 [[IDX2_1]], [[J2_EXT]]
1120+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2_2]], 27
1121+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
1122+
// CHECK-NEXT: [[A2:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
11001123
// CHECK-NEXT: [[MI2:%.*]] = extractelement <27 x i32> [[A2]], i64 [[IDX2_2]]
11011124
// CHECK-NEXT: [[MI3:%.*]] = add nsw i32 [[MI2]], 2
11021125
// CHECK-NEXT: [[MI3_EXT:%.*]] = sext i32 [[MI3]] to i64
11031126
// CHECK-NEXT: [[IDX3_1:%.*]] = mul i64 [[MI3_EXT]], 5
11041127
// CHECK-NEXT: [[IDX3_2:%.*]] = add i64 [[IDX3_1]], [[MI1_EXT]]
1128+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX3_2]], 25
1129+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
11051130
// CHECK-NEXT: [[B:%.*]] = load <25 x double>, <25 x double>* [[B_PTR:%.*]], align 8
11061131
// CHECK-NEXT: [[INS:%.*]] = insertelement <25 x double> [[B]], double 1.500000e+00, i64 [[IDX3_2]]
11071132
// CHECK-NEXT: store <25 x double> [[INS]], <25 x double>* [[B_PTR]], align 8

clang/test/CodeGenCXX/matrix-type-operators.cpp

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s
1+
// RUN: %clang_cc1 -O0 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s
2+
// RUN: %clang_cc1 -O1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck --check-prefixes=CHECK,OPT %s
23

34
typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
45
using fx2x3_t = float __attribute__((matrix_type(2, 3)));
@@ -94,7 +95,7 @@ struct DoubleWrapper2 {
9495

9596
void test_DoubleWrapper2_Add1(MyMatrix<double, 10, 9> &m) {
9697
// CHECK-LABEL: define{{.*}} void @_Z24test_DoubleWrapper2_Add1R8MyMatrixIdLj10ELj9EE(
97-
// CHECK: [[MATRIX:%.*]] = load <90 x double>, <90 x double>* %1, align 8
98+
// CHECK: [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.+}}, align 8
9899
// CHECK: [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper2cvdEv(%struct.DoubleWrapper2* {{[^,]*}} %w2)
99100
// CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR]], i32 0
100101
// CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> poison, <90 x i32> zeroinitializer
@@ -109,7 +110,7 @@ void test_DoubleWrapper2_Add1(MyMatrix<double, 10, 9> &m) {
109110
void test_DoubleWrapper2_Add2(MyMatrix<double, 10, 9> &m) {
110111
// CHECK-LABEL: define{{.*}} void @_Z24test_DoubleWrapper2_Add2R8MyMatrixIdLj10ELj9EE(
111112
// CHECK: [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper2cvdEv(%struct.DoubleWrapper2* {{[^,]*}} %w2)
112-
// CHECK: [[MATRIX:%.*]] = load <90 x double>, <90 x double>* %1, align 8
113+
// CHECK: [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.*}}, align 8
113114
// CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR]], i32 0
114115
// CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> poison, <90 x i32> zeroinitializer
115116
// CHECK-NEXT: [[RES:%.*]] = fadd <90 x double> [[SCALAR_EMBED1]], [[MATRIX]]
@@ -219,6 +220,8 @@ void test_insert_template1(MyMatrix<unsigned, 2, 2> &Mat, unsigned e, unsigned i
219220
// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_EXT]], 2
220221
// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
221222
// CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [4 x i32]* {{.*}} to <4 x i32>*
223+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 4
224+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
222225
// CHECK-NEXT: [[MAT:%.*]] = load <4 x i32>, <4 x i32>* [[MAT_ADDR]], align 4
223226
// CHECK-NEXT: [[MATINS:%.*]] = insertelement <4 x i32> [[MAT]], i32 [[E]], i64 [[IDX2]]
224227
// CHECK-NEXT: store <4 x i32> [[MATINS]], <4 x i32>* [[MAT_ADDR]], align 4
@@ -243,6 +246,8 @@ void test_insert_template2(MyMatrix<float, 3, 8> &Mat, float e) {
243246
// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_EXT]], 3
244247
// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
245248
// CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [24 x float]* {{.*}} to <24 x float>*
249+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 24
250+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
246251
// CHECK-NEXT: [[MAT:%.*]] = load <24 x float>, <24 x float>* [[MAT_ADDR]], align 4
247252
// CHECK-NEXT: [[MATINS:%.*]] = insertelement <24 x float> [[MAT]], float [[E]], i64 [[IDX2]]
248253
// CHECK-NEXT: store <24 x float> [[MATINS]], <24 x float>* [[MAT_ADDR]], align 4
@@ -292,10 +297,10 @@ const double &test_matrix_subscript_reference(const double4x4 m) {
292297
// CHECK-NEXT: [[REF_TMP:%.*]] = alloca double, align 8
293298
// CHECK-NEXT: [[NAMELESS0:%.*]] = bitcast [16 x double]* [[M_ADDR]] to <16 x double>*
294299
// CHECK-NEXT: store <16 x double> [[M:%.*]], <16 x double>* [[NAMELESS0]], align 8
295-
// CHECK-NEXT: [[NAMELESS1:%.*]] = load <16 x double>, <16 x double>* [[NAMELESS0]], align 8
300+
// CHECK: [[NAMELESS1:%.*]] = load <16 x double>, <16 x double>* [[NAMELESS0]], align 8
296301
// CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[NAMELESS1]], i64 4
297302
// CHECK-NEXT: store double [[MATEXT]], double* [[REF_TMP]], align 8
298-
// CHECK-NEXT: ret double* [[REF_TMP]]
303+
// CHECK: ret double* [[REF_TMP]]
299304

300305
return m[0][1];
301306
}
@@ -315,11 +320,13 @@ double extract_IntWrapper_idx(double4x4 &m, IntWrapper i, UnsignedWrapper j) {
315320
// CHECK-NEXT: [[J:%.*]] = call i32 @_ZN15UnsignedWrappercvjEv(%struct.UnsignedWrapper* {{[^,]*}} %j)
316321
// CHECK-NEXT: [[J_SUB:%.*]] = sub i32 [[J]], 1
317322
// CHECK-NEXT: [[J_SUB_EXT:%.*]] = zext i32 [[J_SUB]] to i64
323+
// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4
324+
// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]]
325+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
326+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
318327
// CHECK-NEXT: [[MAT_ADDR:%.*]] = load [16 x double]*, [16 x double]** %m.addr, align 8
319328
// CHECK-NEXT: [[MAT_ADDR2:%.*]] = bitcast [16 x double]* [[MAT_ADDR]] to <16 x double>*
320329
// CHECK-NEXT: [[MAT:%.*]] = load <16 x double>, <16 x double>* [[MAT_ADDR2]], align 8
321-
// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4
322-
// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]]
323330
// CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]]
324331
// CHECK-NEXT: ret double [[MATEXT]]
325332
return m[i + 1][j - 1];
@@ -358,6 +365,8 @@ void test_constexpr1(matrix_type<float, 4, 4> &m) {
358365
// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[I2_EXT]], 4
359366
// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
360367
// CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [16 x float]* %result to <16 x float>*
368+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
369+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
361370
// CHECK-NEXT: [[MAT:%.*]] = load <16 x float>, <16 x float>* [[MAT_ADDR]], align 4
362371
// CHECK-NEXT: [[MATINS:%.*]] = insertelement <16 x float> [[MAT]], float 1.000000e+00, i64 [[IDX2]]
363372
// CHECK-NEXT: store <16 x float> [[MATINS]], <16 x float>* [[MAT_ADDR]], align 4
@@ -386,6 +395,8 @@ void test_constexpr2(matrix_type<int, 5, 5> &m) {
386395
// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[I2_EXT]], 5
387396
// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
388397
// CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [25 x i32]* %result to <25 x i32>*
398+
// OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 25
399+
// OPT-NEXT: call void @llvm.assume(i1 [[CMP]])
389400
// CHECK-NEXT: [[MAT:%.*]] = load <25 x i32>, <25 x i32>* [[MAT_ADDR]], align 4
390401
// CHECK-NEXT: [[MATINS:%.*]] = insertelement <25 x i32> [[MAT]], i32 1, i64 [[IDX2]]
391402
// CHECK-NEXT: store <25 x i32> [[MATINS]], <25 x i32>* [[MAT_ADDR]], align 4

0 commit comments

Comments
 (0)