[GVN] Handle scalable vectors with the same size in VNCoercion #123984
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: David Green (davemgreen)

Changes

This allows us to forward to a load even if the types do not match (nxv4i32 vs nxv2i64 for example). Scalable types are allowed in canCoerceMustAliasedValueToLoad so long as the size (minelts * scalarsize) is the same, and some follow-on code is adjusted to make sure it handles scalable sizes correctly. Methods like analyzeLoadFromClobberingWrite and analyzeLoadFromClobberingStore still do nothing for scalable vectors, as Offsets and mismatching types are not supported.

Patch is 43.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123984.diff

3 Files Affected:
diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
index 1e0ae280516410..b859feebe4ef9b 100644
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -21,6 +21,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
if (StoredTy == LoadTy)
return true;
+ if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(StoredTy) &&
+ StoredTy->getPrimitiveSizeInBits() == LoadTy->getPrimitiveSizeInBits())
+ return true;
+
// If the loaded/stored value is a first class array/struct, or scalable type,
// don't try to transform them. We need to be able to bitcast to integer.
if (isFirstClassAggregateOrScalableType(LoadTy) ||
@@ -83,8 +87,8 @@ Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
// If this is already the right type, just return it.
Type *StoredValTy = StoredVal->getType();
- uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedValue();
- uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedValue();
+ TypeSize StoredValSize = DL.getTypeSizeInBits(StoredValTy);
+ TypeSize LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
// If the store and reload are the same size, we can always reuse it.
if (StoredValSize == LoadedValSize) {
@@ -118,7 +122,8 @@ Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
// If the loaded value is smaller than the available value, then we can
// extract out a piece from it. If the available value is too small, then we
// can't do anything.
- assert(StoredValSize >= LoadedValSize &&
+ assert(!StoredValSize.isScalable() && !LoadedValSize.isScalable() &&
+ StoredValSize >= LoadedValSize &&
"canCoerceMustAliasedValueToLoad fail");
// Convert source pointers to integers, which can be manipulated.
@@ -303,6 +308,13 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
return SrcVal;
}
+ // Return scalable values directly to avoid needing to bitcast to integer
+ // types, as we do not support non-zero Offsets.
+ if (isa<ScalableVectorType>(LoadTy)) {
+ assert(Offset == 0 && "Expected a zero offset for scalable types");
+ return SrcVal;
+ }
+
uint64_t StoreSize =
(DL.getTypeSizeInBits(SrcVal->getType()).getFixedValue() + 7) / 8;
uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedValue() + 7) / 8;
@@ -333,11 +345,15 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
Instruction *InsertPt, const DataLayout &DL) {
-
#ifndef NDEBUG
- unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType()).getFixedValue();
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedValue();
- assert(Offset + LoadSize <= SrcValSize);
+ TypeSize SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
+ TypeSize LoadSize = DL.getTypeStoreSize(LoadTy);
+ assert(SrcValSize.isScalable() == LoadSize.isScalable());
+ assert((SrcValSize.isScalable() || Offset + LoadSize <= SrcValSize) &&
+ "Expected Offset + LoadSize <= SrcValSize");
+ assert(
+ (!SrcValSize.isScalable() || (Offset == 0 && LoadSize == SrcValSize)) &&
+ "Expected scalable type sizes to match");
#endif
IRBuilder<> Builder(InsertPt);
SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll
index 71adaed8e5722b..928fd77aaa8baa 100644
--- a/llvm/test/Transforms/GVN/vscale.ll
+++ b/llvm/test/Transforms/GVN/vscale.ll
@@ -387,3 +387,225 @@ if.then:
if.else:
ret void
}
+
+; Different sizes / types
+
+define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: @load_v16i8_store_v4i32_forward_load(
+; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[LOAD:%.*]] = bitcast <vscale x 4 x i32> [[X]] to <vscale x 16 x i8>
+; CHECK-NEXT: ret <vscale x 16 x i8> [[LOAD]]
+;
+ store <vscale x 4 x i32> %x, ptr %p
+ %load = load <vscale x 16 x i8>, ptr %p
+ ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 4 x float> @load_v4f32_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: @load_v4f32_store_v4i32_forward_load(
+; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[LOAD:%.*]] = bitcast <vscale x 4 x i32> [[X]] to <vscale x 4 x float>
+; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]]
+;
+ store <vscale x 4 x i32> %x, ptr %p
+ %load = load <vscale x 4 x float>, ptr %p
+ ret <vscale x 4 x float> %load
+}
+
+define <vscale x 4 x float> @load_v4f32_store_v16i8_forward_load(ptr %p, <vscale x 16 x i8> %x) {
+; CHECK-LABEL: @load_v4f32_store_v16i8_forward_load(
+; CHECK-NEXT: store <vscale x 16 x i8> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[LOAD:%.*]] = bitcast <vscale x 16 x i8> [[X]] to <vscale x 4 x float>
+; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]]
+;
+ store <vscale x 16 x i8> %x, ptr %p
+ %load = load <vscale x 4 x float>, ptr %p
+ ret <vscale x 4 x float> %load
+}
+
+define <vscale x 4 x i32> @load_v4i32_store_v4f32_forward_load(ptr %p, <vscale x 4 x float> %x) {
+; CHECK-LABEL: @load_v4i32_store_v4f32_forward_load(
+; CHECK-NEXT: store <vscale x 4 x float> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[LOAD:%.*]] = bitcast <vscale x 4 x float> [[X]] to <vscale x 4 x i32>
+; CHECK-NEXT: ret <vscale x 4 x i32> [[LOAD]]
+;
+ store <vscale x 4 x float> %x, ptr %p
+ %load = load <vscale x 4 x i32>, ptr %p
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x i32> @load_v4i32_store_v4i64_forward_load(ptr %p, <vscale x 4 x i64> %x) {
+; CHECK-LABEL: @load_v4i32_store_v4i64_forward_load(
+; CHECK-NEXT: store <vscale x 4 x i64> [[X:%.*]], ptr [[P:%.*]], align 32
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16
+; CHECK-NEXT: ret <vscale x 4 x i32> [[LOAD]]
+;
+ store <vscale x 4 x i64> %x, ptr %p
+ %load = load <vscale x 4 x i32>, ptr %p
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x i64> @load_v4i64_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: @load_v4i64_store_v4i32_forward_load(
+; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[P]], align 32
+; CHECK-NEXT: ret <vscale x 4 x i64> [[LOAD]]
+;
+ store <vscale x 4 x i32> %x, ptr %p
+ %load = load <vscale x 4 x i64>, ptr %p
+ ret <vscale x 4 x i64> %load
+}
+
+define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load(
+; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[P]], align 8
+; CHECK-NEXT: ret <vscale x 2 x i32> [[LOAD]]
+;
+ store <vscale x 4 x i32> %x, ptr %p
+ %load = load <vscale x 2 x i32>, ptr %p
+ ret <vscale x 2 x i32> %load
+}
+
+define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load_offsets(ptr %p, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load_offsets(
+; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[Q:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[P]], i64 1
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[Q]], align 8
+; CHECK-NEXT: ret <vscale x 2 x i32> [[LOAD]]
+;
+ store <vscale x 4 x i32> %x, ptr %p
+ %q = getelementptr <vscale x 2 x i32>, ptr %p, i64 1
+ %load = load <vscale x 2 x i32>, ptr %q
+ ret <vscale x 2 x i32> %load
+}
+
+define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load_offsetc(ptr %p, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load_offsetc(
+; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[Q:%.*]] = getelementptr <2 x i32>, ptr [[P]], i64 1
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[Q]], align 8
+; CHECK-NEXT: ret <vscale x 2 x i32> [[LOAD]]
+;
+ store <vscale x 4 x i32> %x, ptr %p
+ %q = getelementptr <2 x i32>, ptr %p, i64 1
+ %load = load <vscale x 2 x i32>, ptr %q
+ ret <vscale x 2 x i32> %load
+}
+
+define <vscale x 2 x ptr> @load_v2p0_store_v4i32_forward_load_offsetc(ptr %p, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: @load_v2p0_store_v4i32_forward_load_offsetc(
+; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[Q:%.*]] = getelementptr <2 x ptr>, ptr [[P]], i64 1
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[Q]], align 16
+; CHECK-NEXT: ret <vscale x 2 x ptr> [[LOAD]]
+;
+ store <vscale x 4 x i32> %x, ptr %p
+ %q = getelementptr <2 x ptr>, ptr %p, i64 1
+ %load = load <vscale x 2 x ptr>, ptr %q
+ ret <vscale x 2 x ptr> %load
+}
+
+define <vscale x 16 x i8> @load_nxv16i8_store_v4i32_forward_load(ptr %p, <4 x i32> %x) {
+; CHECK-LABEL: @load_nxv16i8_store_v4i32_forward_load(
+; CHECK-NEXT: store <4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT: ret <vscale x 16 x i8> [[LOAD]]
+;
+ store <4 x i32> %x, ptr %p
+ %load = load <vscale x 16 x i8>, ptr %p
+ ret <vscale x 16 x i8> %load
+}
+
+define <16 x i8> @load_v16i8_store_nxv4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: @load_v16i8_store_nxv4i32_forward_load(
+; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT: ret <16 x i8> [[LOAD]]
+;
+ store <vscale x 4 x i32> %x, ptr %p
+ %load = load <16 x i8>, ptr %p
+ ret <16 x i8> %load
+}
+
+define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_constant(ptr %p) {
+; CHECK-LABEL: @load_v16i8_store_v4i32_forward_constant(
+; CHECK-NEXT: store <vscale x 4 x i32> splat (i32 4), ptr [[P:%.*]], align 16
+; CHECK-NEXT: ret <vscale x 16 x i8> bitcast (<vscale x 4 x i32> splat (i32 4) to <vscale x 16 x i8>)
+;
+ store <vscale x 4 x i32> splat (i32 4), ptr %p
+ %load = load <vscale x 16 x i8>, ptr %p
+ ret <vscale x 16 x i8> %load
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @bigexample({ <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a) vscale_range(1,16) {
+; CHECK-LABEL: @bigexample(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[REF_TMP:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[REF_TMP]])
+; CHECK-NEXT: [[A_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A:%.*]], 0
+; CHECK-NEXT: store <vscale x 4 x i32> [[A_ELT]], ptr [[REF_TMP]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT: [[REF_TMP_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP1]]
+; CHECK-NEXT: [[A_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 1
+; CHECK-NEXT: store <vscale x 4 x i32> [[A_ELT2]], ptr [[REF_TMP_REPACK1]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP0]], 5
+; CHECK-NEXT: [[REF_TMP_REPACK3:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP3]]
+; CHECK-NEXT: [[A_ELT4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 2
+; CHECK-NEXT: store <vscale x 4 x i32> [[A_ELT4]], ptr [[REF_TMP_REPACK3]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP0]], 48
+; CHECK-NEXT: [[REF_TMP_REPACK5:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP5]]
+; CHECK-NEXT: [[A_ELT6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 3
+; CHECK-NEXT: store <vscale x 4 x i32> [[A_ELT6]], ptr [[REF_TMP_REPACK5]], align 16
+; CHECK-NEXT: [[DOTUNPACK:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT]] to <vscale x 16 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[DOTUNPACK]], 0
+; CHECK-NEXT: [[DOTUNPACK8:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT2]] to <vscale x 16 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[DOTUNPACK8]], 1
+; CHECK-NEXT: [[DOTUNPACK10:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT4]] to <vscale x 16 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP9]], <vscale x 16 x i8> [[DOTUNPACK10]], 2
+; CHECK-NEXT: [[DOTUNPACK12:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT6]] to <vscale x 16 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP12]], <vscale x 16 x i8> [[DOTUNPACK12]], 3
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[REF_TMP]])
+; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP15]]
+;
+entry:
+ %ref.tmp = alloca { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
+ call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %ref.tmp)
+ %a.elt = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 0
+ store <vscale x 4 x i32> %a.elt, ptr %ref.tmp, align 16
+ %0 = call i64 @llvm.vscale.i64()
+ %1 = shl i64 %0, 4
+ %ref.tmp.repack1 = getelementptr inbounds i8, ptr %ref.tmp, i64 %1
+ %a.elt2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 1
+ store <vscale x 4 x i32> %a.elt2, ptr %ref.tmp.repack1, align 16
+ %2 = call i64 @llvm.vscale.i64()
+ %3 = shl i64 %2, 5
+ %ref.tmp.repack3 = getelementptr inbounds i8, ptr %ref.tmp, i64 %3
+ %a.elt4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 2
+ store <vscale x 4 x i32> %a.elt4, ptr %ref.tmp.repack3, align 16
+ %4 = call i64 @llvm.vscale.i64()
+ %5 = mul i64 %4, 48
+ %ref.tmp.repack5 = getelementptr inbounds i8, ptr %ref.tmp, i64 %5
+ %a.elt6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 3
+ store <vscale x 4 x i32> %a.elt6, ptr %ref.tmp.repack5, align 16
+ %.unpack = load <vscale x 16 x i8>, ptr %ref.tmp, align 16
+ %6 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %.unpack, 0
+ %7 = call i64 @llvm.vscale.i64()
+ %8 = shl i64 %7, 4
+ %.elt7 = getelementptr inbounds i8, ptr %ref.tmp, i64 %8
+ %.unpack8 = load <vscale x 16 x i8>, ptr %.elt7, align 16
+ %9 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, <vscale x 16 x i8> %.unpack8, 1
+ %10 = call i64 @llvm.vscale.i64()
+ %11 = shl i64 %10, 5
+ %.elt9 = getelementptr inbounds i8, ptr %ref.tmp, i64 %11
+ %.unpack10 = load <vscale x 16 x i8>, ptr %.elt9, align 16
+ %12 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %9, <vscale x 16 x i8> %.unpack10, 2
+ %13 = call i64 @llvm.vscale.i64()
+ %14 = mul i64 %13, 48
+ %.elt11 = getelementptr inbounds i8, ptr %ref.tmp, i64 %14
+ %.unpack12 = load <vscale x 16 x i8>, ptr %.elt11, align 16
+ %15 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %12, <vscale x 16 x i8> %.unpack12, 3
+ call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %ref.tmp)
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %15
+}
diff --git a/llvm/test/Transforms/NewGVN/vscale.ll b/llvm/test/Transforms/NewGVN/vscale.ll
new file mode 100644
index 00000000000000..fcfe57bbf02826
--- /dev/null
+++ b/llvm/test/Transforms/NewGVN/vscale.ll
@@ -0,0 +1,617 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -passes=newgvn,dce | FileCheck %s
+
+; Analyze Load from clobbering Load.
+
+define <vscale x 4 x i32> @load_store_clobber_load(ptr %p) {
+; CHECK-LABEL: @load_store_clobber_load(
+; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[P:%.*]], align 16
+; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, ptr undef, align 16
+; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD1]]
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+ %load1 = load <vscale x 4 x i32>, ptr %p
+ store <vscale x 4 x i32> zeroinitializer, ptr undef
+ %load2 = load <vscale x 4 x i32>, ptr %p ; <- load to be eliminated
+ %add = add <vscale x 4 x i32> %load1, %load2
+ ret <vscale x 4 x i32> %add
+}
+
+define <vscale x 4 x i32> @load_store_clobber_load_mayalias(ptr %p, ptr %p2) {
+; CHECK-LABEL: @load_store_clobber_load_mayalias(
+; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[P:%.*]], align 16
+; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, ptr [[P2:%.*]], align 16
+; CHECK-NEXT: [[LOAD2:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16
+; CHECK-NEXT: [[SUB:%.*]] = sub <vscale x 4 x i32> [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: ret <vscale x 4 x i32> [[SUB]]
+;
+ %load1 = load <vscale x 4 x i32>, ptr %p
+ store <vscale x 4 x i32> zeroinitializer, ptr %p2
+ %load2 = load <vscale x 4 x i32>, ptr %p
+ %sub = sub <vscale x 4 x i32> %load1, %load2
+ ret <vscale x 4 x i32> %sub
+}
+
+define <vscale x 4 x i32> @load_store_clobber_load_noalias(ptr noalias %p, ptr noalias %p2) {
+; CHECK-LABEL: @load_store_clobber_load_noalias(
+; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[P:%.*]], align 16
+; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, ptr [[P2:%.*]], align 16
+; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD1]]
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+ %load1 = load <vscale x 4 x i32>, ptr %p
+ store <vscale x 4 x i32> zeroinitializer, ptr %p2
+ %load2 = load <vscale x 4 x i32>, ptr %p ; <- load to be eliminated
+ %add = add <vscale x 4 x i32> %load1, %load2
+ ret <vscale x 4 x i32> %add
+}
+
+; BasicAA return MayAlias for %gep1,%gep2, could improve as MustAlias.
+define i32 @load_clobber_load_gep1(ptr %p) {
+; CHECK-LABEL: @load_clobber_load_gep1(
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[P]], i64 1
+; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[GEP2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %gep1 = getelementptr <vscale x 4 x i32>, ptr %p, i64 0, i64 1
+ %load1 = load i32, ptr %gep1
+ %gep2 = getelementptr i32, ptr %p, i64 1
+ %load2 = load i32, ptr %gep2 ; <- load could be eliminated
+ %add = add i32 %load1, %load2
+ ret i32 %add
+}
+
+define i32 @load_clobber_load_gep2(ptr %p) {
+; CHECK-LABEL: @load_clobber_load_gep2(
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P:%.*]], i64 1, i64 0
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[P]], i64 4
+; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[GEP2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %gep1 = getelementptr <vscale x 4 x i32>, ptr %p, i64 1, i64 0
+ %load1 = load i32, ptr %gep1
+ %gep2 = getelementptr i32, ptr %p, i64 4
+ %load2 = load i32, ptr %gep2 ; <- can not determine at compile-time if %load1 and %load2 are same addr
+ %add = add i32 %load1, %load2
+ ret i32 %add
+}
+
+; TODO: BasicAA return MayAlias for %gep1,%gep2, could improve as MustAlias.
+define i32 @load_clobber_load_gep3(ptr %p) {
+; CHECK-LABE...
[truncated]
✅ With the latest revision this PR passed the undef deprecator.
@@ -21,6 +21,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
if (StoredTy == LoadTy)
return true;

+ if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(StoredTy) &&
+ StoredTy->getPrimitiveSizeInBits() == LoadTy->getPrimitiveSizeInBits())
getPrimitiveSizeInBits doesn't support pointers, it's better to use DL. (In that case we should test proper generation of ptrtoint/inttoptr.)
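To make the concern concrete, here is a minimal standalone sketch (not part of the patch; the data layout string and the specific <vscale x 2 x ptr> type are illustrative assumptions): getPrimitiveSizeInBits() reports a known-min size of 0 bits for a scalable vector of pointers, while the DataLayout-based query returns the real size.

```c++
// Sketch: why DataLayout-based sizes are preferred over getPrimitiveSizeInBits()
// when pointer elements may be involved (e.g. <vscale x 2 x ptr>).
#include <llvm/IR/DataLayout.h>
#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/Support/TypeSize.h>
#include <llvm/Support/raw_ostream.h>
using namespace llvm;

int main() {
  LLVMContext Ctx;
  DataLayout DL("e-p:64:64");                       // assumed 64-bit pointers
  Type *PtrTy = PointerType::get(Ctx, /*AddressSpace=*/0);
  Type *NXV2P0 = ScalableVectorType::get(PtrTy, 2); // <vscale x 2 x ptr>

  // Pointers have no "primitive" size, so this reports vscale x 0 bits.
  TypeSize Prim = NXV2P0->getPrimitiveSizeInBits();
  // DataLayout knows the pointer width, so this reports vscale x 128 bits.
  TypeSize FromDL = DL.getTypeSizeInBits(NXV2P0);

  outs() << "primitive known-min bits:  " << Prim.getKnownMinValue() << "\n";   // 0
  outs() << "datalayout known-min bits: " << FromDL.getKnownMinValue() << "\n"; // 128
  return 0;
}
```

That is presumably why the comment also asks for ptrtoint/inttoptr test coverage: pointer values are coerced through integer casts rather than a plain bitcast.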
@@ -118,7 +122,8 @@ Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
// If the loaded value is smaller than the available value, then we can
// extract out a piece from it. If the available value is too small, then we
// can't do anything.
- assert(StoredValSize >= LoadedValSize &&
+ assert(!StoredValSize.isScalable() && !LoadedValSize.isScalable() &&
+ StoredValSize >= LoadedValSize &&
Can you use TypeSize::isKnownGE() here instead of the scalable checks?
The intent is to check that we do not see any scalable vectors at this point, they should have been handled above.
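For clarity, a rough sketch of the two variants being weighed (illustrative helper names; neither function is from the patch): the first spells out the invariant that only fixed sizes reach this point, the second uses TypeSize::isKnownGE and would also accept scalable operands rather than flag them.

```c++
// Sketch of the two options discussed for the assert's condition.
#include <llvm/Support/TypeSize.h>
using namespace llvm;

// Option 1: document the invariant -- scalable values were already handled
// (and returned) above, so both sizes must be fixed by the time we get here.
bool checkInvariant(TypeSize StoredValSize, TypeSize LoadedValSize) {
  return !StoredValSize.isScalable() && !LoadedValSize.isScalable() &&
         StoredValSize.getFixedValue() >= LoadedValSize.getFixedValue();
}

// Option 2: TypeSize::isKnownGE answers "known greater-or-equal" even for
// scalable sizes, but it would silently tolerate scalable operands instead of
// flagging that the earlier handling was bypassed.
bool checkWithIsKnownGE(TypeSize StoredValSize, TypeSize LoadedValSize) {
  return TypeSize::isKnownGE(StoredValSize, LoadedValSize);
}
```

Keeping the explicit isScalable() checks documents that the scalable path returned earlier, which is the intent stated above.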
504d6cf to fda803c
LGTM, thanks!
@@ -21,6 +21,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
if (StoredTy == LoadTy)
return true;

+ if (isa<ScalableVectorType>(StoredTy) &&
Do we have to check LoadTy as well? To e.g. make sure it doesn't have an aggregate wrapper around a single scalable type?
Thanks - I added a couple of extra tests for it too.
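For reference, a rough standalone sketch of the shape of that check (not the patch's code; the helper name sameSizeScalableVectors and the "e" data layout are made up for illustration): an aggregate such as { <vscale x 4 x i32> } is not a scalable vector and cannot be bitcast, so requiring isa<ScalableVectorType> on both the stored and the loaded type keeps such wrappers out even when the sizes would match.

```c++
// Sketch of a both-sides check: only scalable *vectors* of equal size qualify;
// an aggregate wrapper around a scalable vector is rejected by the isa<> test.
#include <llvm/IR/DataLayout.h>
#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/LLVMContext.h>
using namespace llvm;

// Hypothetical helper, not from VNCoercion.cpp.
static bool sameSizeScalableVectors(Type *StoredTy, Type *LoadTy,
                                    const DataLayout &DL) {
  return isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(LoadTy) &&
         DL.getTypeSizeInBits(StoredTy) == DL.getTypeSizeInBits(LoadTy);
}

int main() {
  LLVMContext Ctx;
  DataLayout DL("e"); // illustrative layout string
  Type *NXV4I32 = ScalableVectorType::get(Type::getInt32Ty(Ctx), 4);
  Type *Wrapped = StructType::get(Ctx, {NXV4I32}); // { <vscale x 4 x i32> }

  bool VectorToVector = sameSizeScalableVectors(NXV4I32, NXV4I32, DL); // true
  bool VectorToStruct = sameSizeScalableVectors(NXV4I32, Wrapped, DL); // false
  return VectorToVector && !VectorToStruct ? 0 : 1;
}
```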
fda803c to 7171d85

This allows us to forward to a load even if the types do not match (nxv4i32 vs nxv2i64 for example). Scalable types are allowed in canCoerceMustAliasedValueToLoad so long as the size (minelts * scalarsize) is the same, and some follow-on code is adjusted to make sure it handles scalable sizes correctly. Methods like analyzeLoadFromClobberingWrite and analyzeLoadFromClobberingStore still do nothing for scalable vectors, as Offsets and mismatching types are not supported.
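As a footnote on the "same size (minelts * scalarsize)" condition, a minimal standalone sketch (illustrative only; the data layout string is an assumption and nothing here is taken from the patch) showing that nxv4i32 and nxv2i64 have the same known-minimum size, which is what lets the stored value be forwarded through a bitcast in the tests above.

```c++
// Sketch: the "same size" condition for scalable vectors compares the
// known-minimum size, i.e. minelts * scalarsize: nxv4i32 and nxv2i64 are both
// vscale x 128 bits.
#include <llvm/IR/DataLayout.h>
#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/Support/TypeSize.h>
#include <llvm/Support/raw_ostream.h>
using namespace llvm;

int main() {
  LLVMContext Ctx;
  DataLayout DL("e"); // illustrative layout; these sizes do not depend on it

  auto *NXV4I32 = ScalableVectorType::get(Type::getInt32Ty(Ctx), 4); // <vscale x 4 x i32>
  auto *NXV2I64 = ScalableVectorType::get(Type::getInt64Ty(Ctx), 2); // <vscale x 2 x i64>

  TypeSize A = DL.getTypeSizeInBits(NXV4I32); // vscale x (4 * 32) = vscale x 128 bits
  TypeSize B = DL.getTypeSizeInBits(NXV2I64); // vscale x (2 * 64) = vscale x 128 bits

  // Equal known-min sizes and matching scalability: the store can be forwarded
  // to the load by bitcasting the stored value to the load's type.
  bool Forwardable = A.isScalable() == B.isScalable() &&
                     A.getKnownMinValue() == B.getKnownMinValue();
  outs() << "forwardable: " << (Forwardable ? "yes" : "no") << "\n"; // yes
  return 0;
}
```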