
Commit 67a2904

frasercrmck authored and tstellar committed
[VectorCombine] Insert addrspacecast when crossing address space boundaries
We cannot bitcast pointers across different address spaces. This was previously fixed in D89577, but D93229 then added an enhancement that peeks further through the pointer operand, opening up the possibility that address-space violations could be introduced. Instead of bailing out as the previous fix did, simply insert an addrspacecast instruction.

Reviewed By: lebedev.ri

Differential Revision: https://reviews.llvm.org/D121787

(cherry picked from commit 2e44b78)
1 parent 3530682 commit 67a2904
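The fix leans on IRBuilder::CreatePointerBitCastOrAddrSpaceCast rather than open-coding the address-space check. As a rough sketch (a hypothetical helper for illustration, not the LLVM source), the choice that call makes boils down to comparing the source and destination address spaces:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical sketch of what CreatePointerBitCastOrAddrSpaceCast decides:
// emit an addrspacecast when the cast crosses an address-space boundary,
// and a plain bitcast otherwise.
static Value *castPointer(IRBuilder<> &Builder, Value *SrcPtr,
                          PointerType *DstTy) {
  if (SrcPtr->getType()->getPointerAddressSpace() != DstTy->getAddressSpace())
    return Builder.CreateAddrSpaceCast(SrcPtr, DstTy); // crossing ASes
  return Builder.CreateBitCast(SrcPtr, DstTy);         // same AS: bitcast ok
}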

4 files changed: +21 lines, −12 lines

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 2 additions & 6 deletions
@@ -152,12 +152,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
 
-  // If original AS != Load's AS, we can't bitcast the original pointer and have
-  // to use Load's operand instead. Ideally we would want to strip pointer casts
-  // without changing AS, but there's no API to do that ATM.
   unsigned AS = Load->getPointerAddressSpace();
-  if (AS != SrcPtr->getType()->getPointerAddressSpace())
-    SrcPtr = Load->getPointerOperand();
 
   // We are potentially transforming byte-sized (8-bit) memory accesses, so make
   // sure we have all of our type-based constraints in place for this target.
@@ -245,7 +240,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
-  Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
+  Value *CastedPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      SrcPtr, MinVecTy->getPointerTo(AS));
   Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
   VecLd = Builder.CreateShuffleVector(VecLd, Mask);
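For illustration, a self-contained snippet (an assumption-laden sketch written for this note, not part of the commit) that exercises the new call site's behavior: casting an addrspace(5) alloca to a vector pointer in another address space, as in the AMDGPU tests below, should yield an addrspacecast rather than an invalid bitcast.

#include <cassert>
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  IRBuilder<> B(Ctx);
  auto *F = Function::Create(FunctionType::get(B.getVoidTy(), false),
                             Function::ExternalLinkage, "f", M);
  B.SetInsertPoint(BasicBlock::Create(Ctx, "bb", F));

  // An alloca in addrspace(5), like [[A]] in the AMDGPU tests below.
  Value *Src = B.CreateAlloca(B.getFloatTy(), /*AddrSpace=*/5);
  Type *VecTy = FixedVectorType::get(B.getFloatTy(), 1);

  // The helper must emit an addrspacecast here; a plain CreateBitCast
  // would produce IR that the verifier rejects.
  Value *Cast = B.CreatePointerBitCastOrAddrSpaceCast(
      Src, PointerType::get(VecTy, /*AddrSpace=*/0));
  assert(isa<AddrSpaceCastInst>(Cast) && "expected an addrspacecast");
  B.CreateRetVoid();
  return 0;
}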

llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll

Lines changed: 1 addition & 3 deletions
@@ -11,9 +11,7 @@ define protected amdgpu_kernel void @load_from_other_as(<4 x float>* nocapture n
 ; CHECK-LABEL: @load_from_other_as(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_HOGE:%.*]], align 4, addrspace(5)
-; CHECK-NEXT:    [[B:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to %struct.hoge*
-; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[B]], i64 0, i32 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[C]] to <1 x float>*
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to <1 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x float>, <1 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[E:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    store <4 x float> [[E]], <4 x float>* [[RESULTPTR:%.*]], align 16

llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll

Lines changed: 1 addition & 3 deletions
@@ -11,9 +11,7 @@ define protected amdgpu_kernel void @load_from_other_as(<4 x float>* nocapture n
 ; CHECK-LABEL: @load_from_other_as(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_HOGE:%.*]], align 4, addrspace(5)
-; CHECK-NEXT:    [[B:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to %struct.hoge*
-; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[B]], i64 0, i32 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[C]] to <1 x float>*
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to <1 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x float>, <1 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[E:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    store <4 x float> [[E]], <4 x float>* [[RESULTPTR:%.*]], align 16

llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll

Lines changed: 17 additions & 0 deletions
@@ -253,6 +253,23 @@ define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(
   ret <4 x float> %r
 }
 
+; Should work with addrspace even when peeking past unsafe loads through geps
+
+define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(i32* align 16 dereferenceable(16) %v3) {
+; CHECK-LABEL: @unsafe_load_i32_insert_v4i32_addrspace(
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast i32* [[V3:%.*]] to <4 x i32> addrspace(42)*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(42)* [[TMP1]], align 16
+; CHECK-NEXT:    [[INSELT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <4 x i32> [[INSELT]]
+;
+  %t0 = getelementptr inbounds i32, i32* %v3, i32 1
+  %t1 = addrspacecast i32* %t0 to i32 addrspace(42)*
+  %t2 = getelementptr inbounds i32, i32 addrspace(42)* %t1, i64 1
+  %val = load i32, i32 addrspace(42)* %t2, align 4
+  %inselt = insertelement <4 x i32> poison, i32 %val, i32 0
+  ret <4 x i32> %inselt
+}
+
 ; If there are enough dereferenceable bytes, we can offset the vector load.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) nofree nosync {
