Commit a1ef81d

[Matrix] Overload stride arg in matrix.columnwise.load/store.
This patch adjusts the intrinsic definitions of llvm.matrix.column.major.load and llvm.matrix.column.major.store to allow overloading the type of the stride argument. The bitwidth of the stride is used to perform the offset computation.

This fixes a crash when using __builtin_matrix_column_major_load or __builtin_matrix_column_major_store on 32-bit platforms: the stride argument of the builtins is defined as `size_t`, which is 32 bits wide on 32-bit platforms.

Note that we still perform offset computations with 64-bit width on 32-bit platforms for accesses that do not take a user-specified stride. This can be fixed separately.

Fixes PR51304.

Reviewed By: erichkeane

Differential Revision: https://reviews.llvm.org/D107349
Parent: 9c47d6b
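For illustration only (this reproducer is not part of the commit; the function name and the 2x2 shape are made up), the crashing pattern looked like the following C++ when built for a 32-bit target. The builtin's stride parameter is typed `size_t`, so here it is a 32-bit integer, and with this patch the load simply selects the `.i32` overload of the intrinsic:

// Hypothetical reproducer, not part of the commit. Compile with e.g.:
//   clang++ -fenable-matrix -target i686-linux-gnu -c reproducer.cpp
// On this 32-bit target size_t is 32 bits, so the stride argument below is
// an i32; the load now lowers to @llvm.matrix.column.major.load.v4f64.i32.
#include <cstddef>

typedef double m2x2_t __attribute__((matrix_type(2, 2)));

m2x2_t load_submatrix(double *Ptr, std::size_t Stride) {
  // Stride is the distance, in elements, between the starts of two
  // consecutive columns of the larger matrix Ptr points into.
  return __builtin_matrix_column_major_load(Ptr, 2, 2, Stride);
}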

File tree

10 files changed: +289 -182 lines

clang/test/CodeGen/matrix-type-builtins.c

Lines changed: 183 additions & 117 deletions
Large diffs are not rendered by default.

clang/test/CodeGenCXX/matrix-type-builtins.cpp

Lines changed: 11 additions & 11 deletions
@@ -94,7 +94,7 @@ void test_column_major_load_with_stride_template_double(double *Ptr) {

 // CHECK-LABEL: define linkonce_odr <40 x double> @_Z29column_major_load_with_strideIdLj10ELj4ELj15EEu11matrix_typeIXT0_EXT1_ET_EPS0_(double* %Ptr)
 // CHECK: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8
-// CHECK-NEXT: call <40 x double> @llvm.matrix.column.major.load.v40f64(double* align 8 [[PTR]], i64 15, i1 false, i32 10, i32 4)
+// CHECK-NEXT: call <40 x double> @llvm.matrix.column.major.load.v40f64.i64(double* align 8 [[PTR]], i64 15, i1 false, i32 10, i32 4)

   matrix_t<double, 10, 4> M1 = column_major_load_with_stride<double, 10, 4, 15>(Ptr);
 }

@@ -106,7 +106,7 @@ void test_column_major_load_with_stride_template_int(int *Ptr) {

 // CHECK-LABEL: define linkonce_odr <6 x i32> @_Z29column_major_load_with_strideIiLj3ELj2ELj12EEu11matrix_typeIXT0_EXT1_ET_EPS0_(i32* %Ptr)
 // CHECK: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call <6 x i32> @llvm.matrix.column.major.load.v6i32(i32* align 4 [[PTR]], i64 12, i1 false, i32 3, i32 2)
+// CHECK-NEXT: call <6 x i32> @llvm.matrix.column.major.load.v6i32.i64(i32* align 4 [[PTR]], i64 12, i1 false, i32 3, i32 2)

   matrix_t<int, 3, 2> M1 = column_major_load_with_stride<int, 3, 2, 12>(Ptr);
 }

@@ -124,7 +124,7 @@ void test_column_major_load_stride_wrapper(int *Ptr, UnsignedWrapper &W) {
 // CHECK-NEXT: [[STRIDE:%.*]] = call i32 @_ZN15UnsignedWrappercvjEv(%struct.UnsignedWrapper* {{[^,]*}} [[W]])
 // CHECK-NEXT: [[STRIDE_EXT:%.*]] = zext i32 [[STRIDE]] to i64
 // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call <4 x i32> @llvm.matrix.column.major.load.v4i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 2, i32 2)
+// CHECK-NEXT: call <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 2, i32 2)
   matrix_t<int, 2, 2> M1 = __builtin_matrix_column_major_load(Ptr, 2, 2, W);
 }

@@ -133,7 +133,7 @@ constexpr int constexpr3() { return 3; }
 void test_column_major_load_constexpr_num_rows(int *Ptr) {
 // CHECK-LABEL: define{{.*}} void @_Z41test_column_major_load_constexpr_num_rowsPi(i32* %Ptr)
 // CHECK: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call <6 x i32> @llvm.matrix.column.major.load.v6i32(i32* align 4 [[PTR]], i64 3, i1 false, i32 3, i32 2)
+// CHECK-NEXT: call <6 x i32> @llvm.matrix.column.major.load.v6i32.i64(i32* align 4 [[PTR]], i64 3, i1 false, i32 3, i32 2)

   matrix_t<int, 3, 2> M1 = __builtin_matrix_column_major_load(Ptr, constexpr3(), 2, 3);
 }

@@ -143,7 +143,7 @@ constexpr int constexpr1() { return 1; }
 void test_column_major_load_constexpr_num_columns(int *Ptr) {
 // CHECK-LABEL: define{{.*}} void @_Z44test_column_major_load_constexpr_num_columnsPi(i32* %Ptr)
 // CHECK: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call <2 x i32> @llvm.matrix.column.major.load.v2i32(i32* align 4 [[PTR]], i64 3, i1 false, i32 2, i32 1)
+// CHECK-NEXT: call <2 x i32> @llvm.matrix.column.major.load.v2i32.i64(i32* align 4 [[PTR]], i64 3, i1 false, i32 2, i32 1)
   matrix_t<int, 2, 1> M1 = __builtin_matrix_column_major_load(Ptr, 2, constexpr1(), 3);
 }

@@ -153,7 +153,7 @@ constexpr int constexpr_plus1() { return N + 1; }
 void test_column_major_load_constexpr_num_columns_temp(int *Ptr) {
 // CHECK-LABEL: define{{.*}} void @_Z49test_column_major_load_constexpr_num_columns_tempPi(i32* %Ptr)
 // CHECK: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call <10 x i32> @llvm.matrix.column.major.load.v10i32(i32* align 4 [[PTR]], i64 3, i1 false, i32 2, i32 5)
+// CHECK-NEXT: call <10 x i32> @llvm.matrix.column.major.load.v10i32.i64(i32* align 4 [[PTR]], i64 3, i1 false, i32 2, i32 5)
   matrix_t<int, 2, 5> M1 = __builtin_matrix_column_major_load(Ptr, 2, constexpr_plus1<4>(), 3);
 }

@@ -162,7 +162,7 @@ void test_column_major_load_constexpr_stride_constexpr(int *Ptr) {
 // CHECK: [[STRIDE:%.*]] = call i32 @_Z10constexpr3v()
 // CHECK-NEXT: [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64
 // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call <4 x i32> @llvm.matrix.column.major.load.v4i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 2, i32 2)
+// CHECK-NEXT: call <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 2, i32 2)

   matrix_t<int, 2, 2> M1 = __builtin_matrix_column_major_load(Ptr, 2, 2, constexpr3());
 }

@@ -200,7 +200,7 @@ void test_column_major_store_with_stride_template_double(double *Ptr) {
 // CHECK-LABEL: define linkonce_odr void @_Z30column_major_store_with_strideIdLj10ELj4ELj15EEvRu11matrix_typeIXT0_EXT1_ET_EPS0_([40 x double]* nonnull align 8 dereferenceable(320) %m, double* %Ptr)
 // CHECK: [[M:%.*]] = load <40 x double>, <40 x double>* {{.*}}, align 8
 // CHECK-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8
-// CHECK-NEXT: call void @llvm.matrix.column.major.store.v40f64(<40 x double> [[M]], double* align 8 [[PTR]], i64 15, i1 false, i32 10, i32 4)
+// CHECK-NEXT: call void @llvm.matrix.column.major.store.v40f64.i64(<40 x double> [[M]], double* align 8 [[PTR]], i64 15, i1 false, i32 10, i32 4)

   matrix_t<double, 10, 4> M1;
   column_major_store_with_stride<double, 10, 4, 15>(M1, Ptr);

@@ -214,7 +214,7 @@ void test_column_major_store_with_stride_template_int(int *Ptr) {
 // CHECK-LABEL: define linkonce_odr void @_Z30column_major_store_with_strideIiLj3ELj2ELj3EEvRu11matrix_typeIXT0_EXT1_ET_EPS0_([6 x i32]* nonnull align 4 dereferenceable(24) %m, i32* %Ptr)
 // CHECK: [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 4
 // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call void @llvm.matrix.column.major.store.v6i32(<6 x i32> [[M]], i32* align 4 [[PTR]], i64 3, i1 false, i32 3, i32 2)
+// CHECK-NEXT: call void @llvm.matrix.column.major.store.v6i32.i64(<6 x i32> [[M]], i32* align 4 [[PTR]], i64 3, i1 false, i32 3, i32 2)

   matrix_t<int, 3, 2> M1;
   column_major_store_with_stride<int, 3, 2, 3>(M1, Ptr);

@@ -227,7 +227,7 @@ void test_column_major_store_stride_wrapper(int *Ptr, UnsignedWrapper &W) {
 // CHECK-NEXT: [[W:%.*]] = load %struct.UnsignedWrapper*, %struct.UnsignedWrapper** %W.addr, align 8
 // CHECK-NEXT: [[IDX:%.*]] = call i32 @_ZN15UnsignedWrappercvjEv(%struct.UnsignedWrapper* {{[^,]*}} [[W]])
 // CHECK-NEXT: [[IDX_EXT:%.*]] = zext i32 [[IDX]] to i64
-// CHECK-NEXT: call void @llvm.matrix.column.major.store.v4i32(<4 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 2, i32 2)
+// CHECK-NEXT: call void @llvm.matrix.column.major.store.v4i32.i64(<4 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 2, i32 2)

   matrix_t<int, 2, 2> M1;
   __builtin_matrix_column_major_store(M1, Ptr, W);

@@ -239,7 +239,7 @@ void test_column_major_store_constexpr_stride_constexpr(int *Ptr) {
 // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
 // CHECK-NEXT: [[IDX:%.*]] = call i32 @_Z10constexpr3v()
 // CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[IDX]] to i64
-// CHECK-NEXT: call void @llvm.matrix.column.major.store.v4i32(<4 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 2, i32 2)
+// CHECK-NEXT: call void @llvm.matrix.column.major.store.v4i32.i64(<4 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 2, i32 2)

   matrix_t<int, 2, 2> M;
   __builtin_matrix_column_major_store(M, Ptr, constexpr3());

clang/test/CodeGenObjC/matrix-type-builtins.m

Lines changed: 2 additions & 2 deletions
@@ -56,7 +56,7 @@ void test_column_major_load(PtrValue *Ptr, IntValue *Stride) {
 // CHECK: [[STRIDE:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)
 // CHECK-NEXT: [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64
 // CHECK: [[PTR:%.*]] = call i32* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32* (i8*, i8*)*)
-// CHECK-NEXT: call <12 x i32> @llvm.matrix.column.major.load.v12i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 3, i32 4)
+// CHECK-NEXT: call <12 x i32> @llvm.matrix.column.major.load.v12i32.i64(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 3, i32 4)

   u3x4 m = __builtin_matrix_column_major_load(Ptr.value, 3, 4, Stride.value);
 }

@@ -67,7 +67,7 @@ void test_column_major_store(UnsignedMatrixValue *M, PtrValue *Ptr, IntValue *St
 // CHECK: [[PTR:%.*]] = call i32* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32* (i8*, i8*)*)
 // CHECK: [[IDX:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)
 // CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[IDX]] to i64
-// CHECK-NEXT: call void @llvm.matrix.column.major.store.v12i32(<12 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 3, i32 4)
+// CHECK-NEXT: call void @llvm.matrix.column.major.store.v12i32.i64(<12 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 3, i32 4)

   __builtin_matrix_column_major_store(M.value, Ptr.value, Stride.value);
 }

llvm/docs/LangRef.rst

Lines changed: 8 additions & 6 deletions
@@ -17252,11 +17252,12 @@ Overview:

 The '``llvm.matrix.column.major.load.*``' intrinsics load a ``<Rows> x <Cols>``
 matrix using a stride of ``%Stride`` to compute the start address of the
-different columns. This allows for convenient loading of sub matrixes. If
-``<IsVolatile>`` is true, the intrinsic is considered a :ref:`volatile memory
-access <volatile>`. The result matrix is returned in the result vector. If the
-``%Ptr`` argument is known to be aligned to some boundary, this can be
-specified as an attribute on the argument.
+different columns. The offset is computed using ``%Stride``'s bitwidth. This
+allows for convenient loading of sub matrixes. If ``<IsVolatile>`` is true, the
+intrinsic is considered a :ref:`volatile memory access <volatile>`. The result
+matrix is returned in the result vector. If the ``%Ptr`` argument is known to
+be aligned to some boundary, this can be specified as an attribute on the
+argument.

 Arguments:
 """"""""""

@@ -17291,7 +17292,8 @@ Overview:

 The '``llvm.matrix.column.major.store.*``' intrinsics store the ``<Rows> x
 <Cols>`` matrix in ``%In`` to memory using a stride of ``%Stride`` between
-columns. If ``<IsVolatile>`` is true, the intrinsic is considered a
+columns. The offset is computed using ``%Stride``'s bitwidth. If
+``<IsVolatile>`` is true, the intrinsic is considered a
 :ref:`volatile memory access <volatile>`.

 If the ``%Ptr`` argument is known to be aligned to some boundary, this can be
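To restate the new rule concretely (my paraphrase, not LangRef wording): if ``%Stride`` has bitwidth w, the element offset of column j is computed with wrapping w-bit arithmetic,

\[
  \mathrm{start}_j = (\,j \cdot \mathrm{Stride}\,) \bmod 2^{w},
\]

and column j begins at ``%Ptr`` advanced by start_j elements. For an i32 stride this is the i32 multiply visible in the lowering tests below.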

llvm/include/llvm/IR/Intrinsics.td

Lines changed: 2 additions & 2 deletions
@@ -1668,7 +1668,7 @@ def int_matrix_multiply

 def int_matrix_column_major_load
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                            [LLVMPointerToElt<0>, llvm_i64_ty, llvm_i1_ty,
+                            [LLVMPointerToElt<0>, llvm_anyint_ty, llvm_i1_ty,
                              llvm_i32_ty, llvm_i32_ty],
                             [IntrNoSync, IntrWillReturn, IntrArgMemOnly, IntrReadMem,
                              NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,

@@ -1677,7 +1677,7 @@ def int_matrix_column_major_load
 def int_matrix_column_major_store
     : DefaultAttrsIntrinsic<[],
                             [llvm_anyvector_ty, LLVMPointerToElt<0>,
-                             llvm_i64_ty, llvm_i1_ty, llvm_i32_ty, llvm_i32_ty],
+                             llvm_anyint_ty, llvm_i1_ty, llvm_i32_ty, llvm_i32_ty],
                             [IntrNoSync, IntrWillReturn, IntrArgMemOnly, IntrWriteMem,
                              WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
                              ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
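Because llvm_anyint_ty makes the stride an overloaded operand, it now participates in intrinsic name mangling alongside the result vector type. A minimal sketch of materializing a declaration (the helper function is mine, not part of the patch):

// Sketch: obtaining the overloaded declaration. With llvm_anyint_ty in the
// signature, the stride type is mangled into the name, e.g. the pair
// {<8 x double>, i32} yields @llvm.matrix.column.major.load.v8f64.i32.
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

Function *getColumnMajorLoad(Module &M, VectorType *VecTy, Type *StrideTy) {
  // Overloaded types are passed in the order the "any" types appear:
  // the result vector first, then the stride integer.
  return Intrinsic::getDeclaration(&M, Intrinsic::matrix_column_major_load,
                                   {VecTy, StrideTy});
}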

llvm/include/llvm/IR/MatrixBuilder.h

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ template <class IRBuilderTy> class MatrixBuilder {

     Value *Ops[] = {DataPtr, Stride, B.getInt1(IsVolatile), B.getInt32(Rows),
                     B.getInt32(Columns)};
-    Type *OverloadedTypes[] = {RetType};
+    Type *OverloadedTypes[] = {RetType, Stride->getType()};

     Function *TheFn = Intrinsic::getDeclaration(
         getModule(), Intrinsic::matrix_column_major_load, OverloadedTypes);

@@ -97,7 +97,7 @@ template <class IRBuilderTy> class MatrixBuilder {
     Value *Ops[] = {Matrix, Ptr,
                     Stride, B.getInt1(IsVolatile),
                     B.getInt32(Rows), B.getInt32(Columns)};
-    Type *OverloadedTypes[] = {Matrix->getType()};
+    Type *OverloadedTypes[] = {Matrix->getType(), Stride->getType()};

     Function *TheFn = Intrinsic::getDeclaration(
         getModule(), Intrinsic::matrix_column_major_store, OverloadedTypes);
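A usage sketch, assuming CreateColumnMajorLoad keeps its (DataPtr, Alignment, Stride, IsVolatile, Rows, Columns) parameter order at this revision. Since Stride->getType() now flows into OverloadedTypes, an i32 stride selects the .i32 overload instead of producing a call whose operand type mismatches the fixed i64 parameter:

// Hedged sketch, not from the patch: emitting a strided load through
// MatrixBuilder with an i32 stride. This now declares and calls
// @llvm.matrix.column.major.load.v8f64.i32 (assuming DoublePtr is double*;
// at this revision the result type is derived from the pointee type and
// Rows * Columns).
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MatrixBuilder.h"

using namespace llvm;

Value *emitStridedLoad(IRBuilder<> &B, Value *DoublePtr, Value *StrideI32) {
  MatrixBuilder<IRBuilder<>> MB(B);
  // 4 rows x 2 columns of double -> <8 x double> result vector.
  return MB.CreateColumnMajorLoad(DoublePtr, Align(8), StrideI32,
                                  /*IsVolatile=*/false, /*Rows=*/4,
                                  /*Columns=*/2, "col.load");
}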

llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp

Lines changed: 8 additions & 5 deletions
@@ -981,8 +981,9 @@ class LowerMatrixIntrinsics {
     Value *EltPtr = createElementPtr(Ptr, EltTy, Builder);
     MatrixTy Result;
     for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
-      Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride,
-                                     Shape.getStride(), EltTy, Builder);
+      Value *GEP = computeVectorAddr(
+          EltPtr, Builder.getIntN(Stride->getType()->getScalarSizeInBits(), I),
+          Stride, Shape.getStride(), EltTy, Builder);
       Value *Vector = Builder.CreateAlignedLoad(
           VecTy, GEP, getAlignForIndex(I, Stride, EltTy, MAlign),
           IsVolatile, "col.load");

@@ -1071,9 +1072,11 @@ class LowerMatrixIntrinsics {
     auto VType = cast<VectorType>(Ty);
     Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
     for (auto Vec : enumerate(StoreVal.vectors())) {
-      Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()),
-                                     Stride, StoreVal.getStride(),
-                                     VType->getElementType(), Builder);
+      Value *GEP = computeVectorAddr(
+          EltPtr,
+          Builder.getIntN(Stride->getType()->getScalarSizeInBits(),
+                          Vec.index()),
+          Stride, StoreVal.getStride(), VType->getElementType(), Builder);
       Builder.CreateAlignedStore(Vec.value(), GEP,
                                  getAlignForIndex(Vec.index(), Stride,
                                                   VType->getElementType(),
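The key constraint is that IRBuilder's CreateMul requires both operands to have the same integer type, which is why the column index is now created via getIntN in the stride's width. A simplified, self-contained sketch of the per-column address computation (the helper name is hypothetical; the real logic lives in computeVectorAddr):

// Hypothetical, simplified version of the per-column address computation.
// The column index is materialized in the stride's bit width, so the multiply
// below is well-typed for both i32 and i64 strides; mixing an i64 index with
// an i32 stride would hit the "both operands must be the same type" assertion.
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

Value *computeColumnStart(IRBuilder<> &Builder, Value *EltPtr, Value *Stride,
                          unsigned ColumnIdx, Type *EltTy) {
  unsigned StrideBits = Stride->getType()->getScalarSizeInBits();
  Value *Idx = Builder.getIntN(StrideBits, ColumnIdx); // index in stride width
  Value *VecStart = Builder.CreateMul(Idx, Stride, "vec.start");
  return Builder.CreateGEP(EltTy, EltPtr, VecStart, "vec.gep");
}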

llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll

Lines changed: 25 additions & 8 deletions
@@ -23,11 +23,11 @@ define <9 x double> @strided_load_3x3(double* %in, i64 %stride) {
 ; CHECK-NEXT: ret <9 x double> [[TMP2]]
 ;
 entry:
-  %load = call <9 x double> @llvm.matrix.column.major.load(double* %in, i64 %stride, i1 false, i32 3, i32 3)
+  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(double* %in, i64 %stride, i1 false, i32 3, i32 3)
   ret <9 x double> %load
 }

-declare <9 x double> @llvm.matrix.column.major.load(double*, i64, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i64(double*, i64, i1, i32, i32)

 define <9 x double> @strided_load_9x1(double* %in, i64 %stride) {
 ; CHECK-LABEL: @strided_load_9x1(

@@ -39,12 +39,11 @@ define <9 x double> @strided_load_9x1(double* %in, i64 %stride) {
 ; CHECK-NEXT: ret <9 x double> [[COL_LOAD]]
 ;
 entry:
-  %load = call <9 x double> @llvm.matrix.column.major.load(double* %in, i64 %stride, i1 false, i32 9, i32 1)
+  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(double* %in, i64 %stride, i1 false, i32 9, i32 1)
   ret <9 x double> %load
 }

-declare <8 x double> @llvm.matrix.column.major.load.v8f64(double*, i64, i1, i32, i32)
-; CHECK: declare <8 x double> @llvm.matrix.column.major.load.v8f64(double* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) [[READONLY:#[0-9]]]
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i64(double*, i64, i1, i32, i32)

 define <8 x double> @strided_load_4x2(double* %in, i64 %stride) {
 ; CHECK-LABEL: @strided_load_4x2(

@@ -61,9 +60,27 @@ define <8 x double> @strided_load_4x2(double* %in, i64 %stride) {
 ; CHECK-NEXT: ret <8 x double> [[TMP0]]
 ;
 entry:
-  %load = call <8 x double> @llvm.matrix.column.major.load.v8f64(double* %in, i64 %stride, i1 false, i32 4, i32 2)
+  %load = call <8 x double> @llvm.matrix.column.major.load.v8f64.i64(double* %in, i64 %stride, i1 false, i32 4, i32 2)
   ret <8 x double> %load
 }

-; CHECK: declare <9 x double> @llvm.matrix.column.major.load.v9f64(double* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) [[READONLY]]
-; CHECK: attributes [[READONLY]] = { argmemonly nofree nosync nounwind readonly willreturn }
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(double*, i32, i1, i32, i32)
+
+define <8 x double> @strided_load_4x2_stride_i32(double* %in, i32 %stride) {
+; CHECK-LABEL: @strided_load_4x2_stride_i32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[IN:%.*]], i32 [[VEC_START]]
+; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <4 x double>*
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST]], align 8
+; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]]
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, double* [[IN]], i32 [[VEC_START1]]
+; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast double* [[VEC_GEP2]] to <4 x double>*
+; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST3]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> [[COL_LOAD4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x double> [[TMP0]]
+;
+entry:
+  %load = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(double* %in, i32 %stride, i1 false, i32 4, i32 2)
+  ret <8 x double> %load
+}
