Commit a1ef81d

[Matrix] Overload stride arg in matrix.columnwise.load/store.
This patch adjusts the intrinsic definitions of llvm.matrix.column.major.load and llvm.matrix.column.major.store to allow overloading the type of the stride argument. The bitwidth of the stride is used to perform the offset computation.

This fixes a crash when using __builtin_matrix_column_major_load or __builtin_matrix_column_major_store on 32-bit platforms: the stride argument of the builtins is defined as `size_t`, which is 32 bits wide on 32-bit platforms.

Note that we still perform offset computations with 64-bit width on 32-bit platforms for accesses that do not take a user-specified stride. This can be fixed separately.

Fixes PR51304.

Reviewed By: erichkeane

Differential Revision: https://reviews.llvm.org/D107349
Parent: 9c47d6b
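For illustration only (this reproducer is not part of the commit; the function name and the 2x2 shape are made up), the crashing pattern looked like the following C++ when built for a 32-bit target. The builtin's stride parameter is typed `size_t`, so here it is a 32-bit integer, and with this patch the load simply selects the `.i32` overload of the intrinsic:

// Hypothetical reproducer, not part of the commit. Compile with e.g.:
//   clang++ -fenable-matrix -target i686-linux-gnu -c reproducer.cpp
// On this 32-bit target size_t is 32 bits, so the stride argument below is
// an i32; the load now lowers to @llvm.matrix.column.major.load.v4f64.i32.
#include <cstddef>

typedef double m2x2_t __attribute__((matrix_type(2, 2)));

m2x2_t load_submatrix(double *Ptr, std::size_t Stride) {
  // Stride is the distance, in elements, between the starts of two
  // consecutive columns of the larger matrix Ptr points into.
  return __builtin_matrix_column_major_load(Ptr, 2, 2, Stride);
}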

File tree

10 files changed: +289 -182 lines

clang/test/CodeGen/matrix-type-builtins.c

Lines changed: 183 additions & 117 deletions
Large diffs are not rendered by default.

clang/test/CodeGenCXX/matrix-type-builtins.cpp

Lines changed: 11 additions & 11 deletions
@@ -94,7 +94,7 @@ void test_column_major_load_with_stride_template_double(double *Ptr) {

 // CHECK-LABEL: define linkonce_odr <40 x double> @_Z29column_major_load_with_strideIdLj10ELj4ELj15EEu11matrix_typeIXT0_EXT1_ET_EPS0_(double* %Ptr)
 // CHECK: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8
-// CHECK-NEXT: call <40 x double> @llvm.matrix.column.major.load.v40f64(double* align 8 [[PTR]], i64 15, i1 false, i32 10, i32 4)
+// CHECK-NEXT: call <40 x double> @llvm.matrix.column.major.load.v40f64.i64(double* align 8 [[PTR]], i64 15, i1 false, i32 10, i32 4)

   matrix_t<double, 10, 4> M1 = column_major_load_with_stride<double, 10, 4, 15>(Ptr);
 }

@@ -106,7 +106,7 @@ void test_column_major_load_with_stride_template_int(int *Ptr) {

 // CHECK-LABEL: define linkonce_odr <6 x i32> @_Z29column_major_load_with_strideIiLj3ELj2ELj12EEu11matrix_typeIXT0_EXT1_ET_EPS0_(i32* %Ptr)
 // CHECK: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call <6 x i32> @llvm.matrix.column.major.load.v6i32(i32* align 4 [[PTR]], i64 12, i1 false, i32 3, i32 2)
+// CHECK-NEXT: call <6 x i32> @llvm.matrix.column.major.load.v6i32.i64(i32* align 4 [[PTR]], i64 12, i1 false, i32 3, i32 2)

   matrix_t<int, 3, 2> M1 = column_major_load_with_stride<int, 3, 2, 12>(Ptr);
 }

@@ -124,7 +124,7 @@ void test_column_major_load_stride_wrapper(int *Ptr, UnsignedWrapper &W) {
 // CHECK-NEXT: [[STRIDE:%.*]] = call i32 @_ZN15UnsignedWrappercvjEv(%struct.UnsignedWrapper* {{[^,]*}} [[W]])
 // CHECK-NEXT: [[STRIDE_EXT:%.*]] = zext i32 [[STRIDE]] to i64
 // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call <4 x i32> @llvm.matrix.column.major.load.v4i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 2, i32 2)
+// CHECK-NEXT: call <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 2, i32 2)
   matrix_t<int, 2, 2> M1 = __builtin_matrix_column_major_load(Ptr, 2, 2, W);
 }

@@ -133,7 +133,7 @@ constexpr int constexpr3() { return 3; }
 void test_column_major_load_constexpr_num_rows(int *Ptr) {
 // CHECK-LABEL: define{{.*}} void @_Z41test_column_major_load_constexpr_num_rowsPi(i32* %Ptr)
 // CHECK: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call <6 x i32> @llvm.matrix.column.major.load.v6i32(i32* align 4 [[PTR]], i64 3, i1 false, i32 3, i32 2)
+// CHECK-NEXT: call <6 x i32> @llvm.matrix.column.major.load.v6i32.i64(i32* align 4 [[PTR]], i64 3, i1 false, i32 3, i32 2)

   matrix_t<int, 3, 2> M1 = __builtin_matrix_column_major_load(Ptr, constexpr3(), 2, 3);
 }

@@ -143,7 +143,7 @@ constexpr int constexpr1() { return 1; }
 void test_column_major_load_constexpr_num_columns(int *Ptr) {
 // CHECK-LABEL: define{{.*}} void @_Z44test_column_major_load_constexpr_num_columnsPi(i32* %Ptr)
 // CHECK: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call <2 x i32> @llvm.matrix.column.major.load.v2i32(i32* align 4 [[PTR]], i64 3, i1 false, i32 2, i32 1)
+// CHECK-NEXT: call <2 x i32> @llvm.matrix.column.major.load.v2i32.i64(i32* align 4 [[PTR]], i64 3, i1 false, i32 2, i32 1)
   matrix_t<int, 2, 1> M1 = __builtin_matrix_column_major_load(Ptr, 2, constexpr1(), 3);
 }

@@ -153,7 +153,7 @@ constexpr int constexpr_plus1() { return N + 1; }
 void test_column_major_load_constexpr_num_columns_temp(int *Ptr) {
 // CHECK-LABEL: define{{.*}} void @_Z49test_column_major_load_constexpr_num_columns_tempPi(i32* %Ptr)
 // CHECK: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call <10 x i32> @llvm.matrix.column.major.load.v10i32(i32* align 4 [[PTR]], i64 3, i1 false, i32 2, i32 5)
+// CHECK-NEXT: call <10 x i32> @llvm.matrix.column.major.load.v10i32.i64(i32* align 4 [[PTR]], i64 3, i1 false, i32 2, i32 5)
   matrix_t<int, 2, 5> M1 = __builtin_matrix_column_major_load(Ptr, 2, constexpr_plus1<4>(), 3);
 }

@@ -162,7 +162,7 @@ void test_column_major_load_constexpr_stride_constexpr(int *Ptr) {
 // CHECK: [[STRIDE:%.*]] = call i32 @_Z10constexpr3v()
 // CHECK-NEXT: [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64
 // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call <4 x i32> @llvm.matrix.column.major.load.v4i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 2, i32 2)
+// CHECK-NEXT: call <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 2, i32 2)

   matrix_t<int, 2, 2> M1 = __builtin_matrix_column_major_load(Ptr, 2, 2, constexpr3());
 }

@@ -200,7 +200,7 @@ void test_column_major_store_with_stride_template_double(double *Ptr) {
 // CHECK-LABEL: define linkonce_odr void @_Z30column_major_store_with_strideIdLj10ELj4ELj15EEvRu11matrix_typeIXT0_EXT1_ET_EPS0_([40 x double]* nonnull align 8 dereferenceable(320) %m, double* %Ptr)
 // CHECK: [[M:%.*]] = load <40 x double>, <40 x double>* {{.*}}, align 8
 // CHECK-NEXT: [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8
-// CHECK-NEXT: call void @llvm.matrix.column.major.store.v40f64(<40 x double> [[M]], double* align 8 [[PTR]], i64 15, i1 false, i32 10, i32 4)
+// CHECK-NEXT: call void @llvm.matrix.column.major.store.v40f64.i64(<40 x double> [[M]], double* align 8 [[PTR]], i64 15, i1 false, i32 10, i32 4)

   matrix_t<double, 10, 4> M1;
   column_major_store_with_stride<double, 10, 4, 15>(M1, Ptr);

@@ -214,7 +214,7 @@ void test_column_major_store_with_stride_template_int(int *Ptr) {
 // CHECK-LABEL: define linkonce_odr void @_Z30column_major_store_with_strideIiLj3ELj2ELj3EEvRu11matrix_typeIXT0_EXT1_ET_EPS0_([6 x i32]* nonnull align 4 dereferenceable(24) %m, i32* %Ptr)
 // CHECK: [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 4
 // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
-// CHECK-NEXT: call void @llvm.matrix.column.major.store.v6i32(<6 x i32> [[M]], i32* align 4 [[PTR]], i64 3, i1 false, i32 3, i32 2)
+// CHECK-NEXT: call void @llvm.matrix.column.major.store.v6i32.i64(<6 x i32> [[M]], i32* align 4 [[PTR]], i64 3, i1 false, i32 3, i32 2)

   matrix_t<int, 3, 2> M1;
   column_major_store_with_stride<int, 3, 2, 3>(M1, Ptr);

@@ -227,7 +227,7 @@ void test_column_major_store_stride_wrapper(int *Ptr, UnsignedWrapper &W) {
 // CHECK-NEXT: [[W:%.*]] = load %struct.UnsignedWrapper*, %struct.UnsignedWrapper** %W.addr, align 8
 // CHECK-NEXT: [[IDX:%.*]] = call i32 @_ZN15UnsignedWrappercvjEv(%struct.UnsignedWrapper* {{[^,]*}} [[W]])
 // CHECK-NEXT: [[IDX_EXT:%.*]] = zext i32 [[IDX]] to i64
-// CHECK-NEXT: call void @llvm.matrix.column.major.store.v4i32(<4 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 2, i32 2)
+// CHECK-NEXT: call void @llvm.matrix.column.major.store.v4i32.i64(<4 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 2, i32 2)

   matrix_t<int, 2, 2> M1;
   __builtin_matrix_column_major_store(M1, Ptr, W);

@@ -239,7 +239,7 @@ void test_column_major_store_constexpr_stride_constexpr(int *Ptr) {
 // CHECK-NEXT: [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
 // CHECK-NEXT: [[IDX:%.*]] = call i32 @_Z10constexpr3v()
 // CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[IDX]] to i64
-// CHECK-NEXT: call void @llvm.matrix.column.major.store.v4i32(<4 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 2, i32 2)
+// CHECK-NEXT: call void @llvm.matrix.column.major.store.v4i32.i64(<4 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 2, i32 2)

   matrix_t<int, 2, 2> M;
   __builtin_matrix_column_major_store(M, Ptr, constexpr3());

clang/test/CodeGenObjC/matrix-type-builtins.m

Lines changed: 2 additions & 2 deletions
@@ -56,7 +56,7 @@ void test_column_major_load(PtrValue *Ptr, IntValue *Stride) {
 // CHECK: [[STRIDE:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)
 // CHECK-NEXT: [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64
 // CHECK: [[PTR:%.*]] = call i32* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32* (i8*, i8*)*)
-// CHECK-NEXT: call <12 x i32> @llvm.matrix.column.major.load.v12i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 3, i32 4)
+// CHECK-NEXT: call <12 x i32> @llvm.matrix.column.major.load.v12i32.i64(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 3, i32 4)

   u3x4 m = __builtin_matrix_column_major_load(Ptr.value, 3, 4, Stride.value);
 }

@@ -67,7 +67,7 @@ void test_column_major_store(UnsignedMatrixValue *M, PtrValue *Ptr, IntValue *St
 // CHECK: [[PTR:%.*]] = call i32* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32* (i8*, i8*)*)
 // CHECK: [[IDX:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)
 // CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[IDX]] to i64
-// CHECK-NEXT: call void @llvm.matrix.column.major.store.v12i32(<12 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 3, i32 4)
+// CHECK-NEXT: call void @llvm.matrix.column.major.store.v12i32.i64(<12 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX_EXT]], i1 false, i32 3, i32 4)

   __builtin_matrix_column_major_store(M.value, Ptr.value, Stride.value);
 }

llvm/docs/LangRef.rst

Lines changed: 8 additions & 6 deletions
@@ -17252,11 +17252,12 @@ Overview:

 The '``llvm.matrix.column.major.load.*``' intrinsics load a ``<Rows> x <Cols>``
 matrix using a stride of ``%Stride`` to compute the start address of the
-different columns. This allows for convenient loading of sub matrixes. If
-``<IsVolatile>`` is true, the intrinsic is considered a :ref:`volatile memory
-access <volatile>`. The result matrix is returned in the result vector. If the
-``%Ptr`` argument is known to be aligned to some boundary, this can be
-specified as an attribute on the argument.
+different columns. The offset is computed using ``%Stride``'s bitwidth. This
+allows for convenient loading of sub matrixes. If ``<IsVolatile>`` is true, the
+intrinsic is considered a :ref:`volatile memory access <volatile>`. The result
+matrix is returned in the result vector. If the ``%Ptr`` argument is known to
+be aligned to some boundary, this can be specified as an attribute on the
+argument.

 Arguments:
 """"""""""

@@ -17291,7 +17292,8 @@ Overview:

 The '``llvm.matrix.column.major.store.*``' intrinsics store the ``<Rows> x
 <Cols>`` matrix in ``%In`` to memory using a stride of ``%Stride`` between
-columns. If ``<IsVolatile>`` is true, the intrinsic is considered a
+columns. The offset is computed using ``%Stride``'s bitwidth. If
+``<IsVolatile>`` is true, the intrinsic is considered a
 :ref:`volatile memory access <volatile>`.

 If the ``%Ptr`` argument is known to be aligned to some boundary, this can be
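To restate the new rule concretely (my paraphrase, not LangRef wording): if ``%Stride`` has bitwidth w, the element offset of column j is computed with wrapping w-bit arithmetic,

\[
  \mathrm{start}_j = (\,j \cdot \mathrm{Stride}\,) \bmod 2^{w},
\]

and column j begins at ``%Ptr`` advanced by start_j elements. For an i32 stride this is the i32 multiply visible in the lowering tests below.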

llvm/include/llvm/IR/Intrinsics.td

Lines changed: 2 additions & 2 deletions
@@ -1668,7 +1668,7 @@ def int_matrix_multiply

 def int_matrix_column_major_load
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                            [LLVMPointerToElt<0>, llvm_i64_ty, llvm_i1_ty,
+                            [LLVMPointerToElt<0>, llvm_anyint_ty, llvm_i1_ty,
                              llvm_i32_ty, llvm_i32_ty],
                             [IntrNoSync, IntrWillReturn, IntrArgMemOnly, IntrReadMem,
                              NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,

@@ -1677,7 +1677,7 @@ def int_matrix_column_major_load
 def int_matrix_column_major_store
     : DefaultAttrsIntrinsic<[],
                             [llvm_anyvector_ty, LLVMPointerToElt<0>,
-                             llvm_i64_ty, llvm_i1_ty, llvm_i32_ty, llvm_i32_ty],
+                             llvm_anyint_ty, llvm_i1_ty, llvm_i32_ty, llvm_i32_ty],
                             [IntrNoSync, IntrWillReturn, IntrArgMemOnly, IntrWriteMem,
                              WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
                              ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
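Because llvm_anyint_ty makes the stride an overloaded operand, it now participates in intrinsic name mangling alongside the result vector type. A minimal sketch of materializing a declaration (the helper function is mine, not part of the patch):

// Sketch: obtaining the overloaded declaration. With llvm_anyint_ty in the
// signature, the stride type is mangled into the name, e.g. the pair
// {<8 x double>, i32} yields @llvm.matrix.column.major.load.v8f64.i32.
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

Function *getColumnMajorLoad(Module &M, VectorType *VecTy, Type *StrideTy) {
  // Overloaded types are passed in the order the "any" types appear:
  // the result vector first, then the stride integer.
  return Intrinsic::getDeclaration(&M, Intrinsic::matrix_column_major_load,
                                   {VecTy, StrideTy});
}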

llvm/include/llvm/IR/MatrixBuilder.h

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ template <class IRBuilderTy> class MatrixBuilder {

     Value *Ops[] = {DataPtr, Stride, B.getInt1(IsVolatile), B.getInt32(Rows),
                     B.getInt32(Columns)};
-    Type *OverloadedTypes[] = {RetType};
+    Type *OverloadedTypes[] = {RetType, Stride->getType()};

     Function *TheFn = Intrinsic::getDeclaration(
         getModule(), Intrinsic::matrix_column_major_load, OverloadedTypes);

@@ -97,7 +97,7 @@ template <class IRBuilderTy> class MatrixBuilder {
     Value *Ops[] = {Matrix, Ptr,
                     Stride, B.getInt1(IsVolatile),
                     B.getInt32(Rows), B.getInt32(Columns)};
-    Type *OverloadedTypes[] = {Matrix->getType()};
+    Type *OverloadedTypes[] = {Matrix->getType(), Stride->getType()};

     Function *TheFn = Intrinsic::getDeclaration(
         getModule(), Intrinsic::matrix_column_major_store, OverloadedTypes);
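A usage sketch, assuming CreateColumnMajorLoad keeps its (DataPtr, Alignment, Stride, IsVolatile, Rows, Columns) parameter order at this revision. Since Stride->getType() now flows into OverloadedTypes, an i32 stride selects the .i32 overload instead of producing a call whose operand type mismatches the fixed i64 parameter:

// Hedged sketch, not from the patch: emitting a strided load through
// MatrixBuilder with an i32 stride. This now declares and calls
// @llvm.matrix.column.major.load.v8f64.i32 (assuming DoublePtr is double*;
// at this revision the result type is derived from the pointee type and
// Rows * Columns).
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MatrixBuilder.h"

using namespace llvm;

Value *emitStridedLoad(IRBuilder<> &B, Value *DoublePtr, Value *StrideI32) {
  MatrixBuilder<IRBuilder<>> MB(B);
  // 4 rows x 2 columns of double -> <8 x double> result vector.
  return MB.CreateColumnMajorLoad(DoublePtr, Align(8), StrideI32,
                                  /*IsVolatile=*/false, /*Rows=*/4,
                                  /*Columns=*/2, "col.load");
}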

llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp

Lines changed: 8 additions & 5 deletions
@@ -981,8 +981,9 @@ class LowerMatrixIntrinsics {
     Value *EltPtr = createElementPtr(Ptr, EltTy, Builder);
     MatrixTy Result;
     for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
-      Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride,
-                                     Shape.getStride(), EltTy, Builder);
+      Value *GEP = computeVectorAddr(
+          EltPtr, Builder.getIntN(Stride->getType()->getScalarSizeInBits(), I),
+          Stride, Shape.getStride(), EltTy, Builder);
       Value *Vector = Builder.CreateAlignedLoad(
           VecTy, GEP, getAlignForIndex(I, Stride, EltTy, MAlign),
           IsVolatile, "col.load");

@@ -1071,9 +1072,11 @@ class LowerMatrixIntrinsics {
     auto VType = cast<VectorType>(Ty);
     Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
     for (auto Vec : enumerate(StoreVal.vectors())) {
-      Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()),
-                                     Stride, StoreVal.getStride(),
-                                     VType->getElementType(), Builder);
+      Value *GEP = computeVectorAddr(
+          EltPtr,
+          Builder.getIntN(Stride->getType()->getScalarSizeInBits(),
+                          Vec.index()),
+          Stride, StoreVal.getStride(), VType->getElementType(), Builder);
       Builder.CreateAlignedStore(Vec.value(), GEP,
                                  getAlignForIndex(Vec.index(), Stride,
                                                   VType->getElementType(),
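The key constraint is that IRBuilder's CreateMul requires both operands to have the same integer type, which is why the column index is now created via getIntN in the stride's width. A simplified, self-contained sketch of the per-column address computation (the helper name is hypothetical; the real logic lives in computeVectorAddr):

// Hypothetical, simplified version of the per-column address computation.
// The column index is materialized in the stride's bit width, so the multiply
// below is well-typed for both i32 and i64 strides; mixing an i64 index with
// an i32 stride would hit the "both operands must be the same type" assertion.
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

Value *computeColumnStart(IRBuilder<> &Builder, Value *EltPtr, Value *Stride,
                          unsigned ColumnIdx, Type *EltTy) {
  unsigned StrideBits = Stride->getType()->getScalarSizeInBits();
  Value *Idx = Builder.getIntN(StrideBits, ColumnIdx); // index in stride width
  Value *VecStart = Builder.CreateMul(Idx, Stride, "vec.start");
  return Builder.CreateGEP(EltTy, EltPtr, VecStart, "vec.gep");
}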

llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll

Lines changed: 25 additions & 8 deletions
@@ -23,11 +23,11 @@ define <9 x double> @strided_load_3x3(double* %in, i64 %stride) {
 ; CHECK-NEXT: ret <9 x double> [[TMP2]]
 ;
 entry:
-  %load = call <9 x double> @llvm.matrix.column.major.load(double* %in, i64 %stride, i1 false, i32 3, i32 3)
+  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(double* %in, i64 %stride, i1 false, i32 3, i32 3)
   ret <9 x double> %load
 }

-declare <9 x double> @llvm.matrix.column.major.load(double*, i64, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i64(double*, i64, i1, i32, i32)

 define <9 x double> @strided_load_9x1(double* %in, i64 %stride) {
 ; CHECK-LABEL: @strided_load_9x1(

@@ -39,12 +39,11 @@ define <9 x double> @strided_load_9x1(double* %in, i64 %stride) {
 ; CHECK-NEXT: ret <9 x double> [[COL_LOAD]]
 ;
 entry:
-  %load = call <9 x double> @llvm.matrix.column.major.load(double* %in, i64 %stride, i1 false, i32 9, i32 1)
+  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(double* %in, i64 %stride, i1 false, i32 9, i32 1)
   ret <9 x double> %load
 }

-declare <8 x double> @llvm.matrix.column.major.load.v8f64(double*, i64, i1, i32, i32)
-; CHECK: declare <8 x double> @llvm.matrix.column.major.load.v8f64(double* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) [[READONLY:#[0-9]]]
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i64(double*, i64, i1, i32, i32)

 define <8 x double> @strided_load_4x2(double* %in, i64 %stride) {
 ; CHECK-LABEL: @strided_load_4x2(

@@ -61,9 +60,27 @@ define <8 x double> @strided_load_4x2(double* %in, i64 %stride) {
 ; CHECK-NEXT: ret <8 x double> [[TMP0]]
 ;
 entry:
-  %load = call <8 x double> @llvm.matrix.column.major.load.v8f64(double* %in, i64 %stride, i1 false, i32 4, i32 2)
+  %load = call <8 x double> @llvm.matrix.column.major.load.v8f64.i64(double* %in, i64 %stride, i1 false, i32 4, i32 2)
   ret <8 x double> %load
 }

-; CHECK: declare <9 x double> @llvm.matrix.column.major.load.v9f64(double* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) [[READONLY]]
-; CHECK: attributes [[READONLY]] = { argmemonly nofree nosync nounwind readonly willreturn }
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(double*, i32, i1, i32, i32)
+
+define <8 x double> @strided_load_4x2_stride_i32(double* %in, i32 %stride) {
+; CHECK-LABEL: @strided_load_4x2_stride_i32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[IN:%.*]], i32 [[VEC_START]]
+; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <4 x double>*
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST]], align 8
+; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]]
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, double* [[IN]], i32 [[VEC_START1]]
+; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast double* [[VEC_GEP2]] to <4 x double>*
+; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x double>, <4 x double>* [[VEC_CAST3]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> [[COL_LOAD4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x double> [[TMP0]]
+;
+entry:
+  %load = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(double* %in, i32 %stride, i1 false, i32 4, i32 2)
+  ret <8 x double> %load
+}
