[GVN] Handle scalable vectors with the same size in VNCoercion #123984

Merged: 1 commit merged into llvm:main on Jan 23, 2025

Conversation

davemgreen
Collaborator

This allows us to forward to a load even if the types do not match (nxv4i32 vs nxv2i64 for example). Scalable types are allowed in canCoerceMustAliasedValueToLoad so long as the size (minelts * scalarsize) is the same, and some follow-on code is adjusted to make sure it handles scalable sizes correctly. Methods like analyzeLoadFromClobberingWrite and analyzeLoadFromClobberingStore still do nothing for scalable vectors, as Offsets and mismatching types are not supported.
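To make the effect concrete, here is a minimal sketch drawn from the tests added below (illustrative only, not extra patch code; the function name is just for the example): a store of a <vscale x 4 x i32> followed by a load of a <vscale x 16 x i8> from the same pointer now has the load forwarded as a bitcast of the stored value.

  define <vscale x 16 x i8> @example(ptr %p, <vscale x 4 x i32> %x) {
    store <vscale x 4 x i32> %x, ptr %p
    ; GVN forwards the store, so this load becomes:
    ;   %load = bitcast <vscale x 4 x i32> %x to <vscale x 16 x i8>
    %load = load <vscale x 16 x i8>, ptr %p
    ret <vscale x 16 x i8> %load
  }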

@llvmbot
Member

llvmbot commented Jan 22, 2025

@llvm/pr-subscribers-llvm-transforms

Author: David Green (davemgreen)

Changes

This allows us to forward to a load even if the types do not match (nxv4i32 vs nxv2i64 for example). Scalable types are allowed in canCoerceMustAliasedValueToLoad so long as the size (minelts * scalarsize) is the same, and some follow-on code is adjusted to make sure it handles scalable sizes correctly. Methods like analyzeLoadFromClobberingWrite and analyzeLoadFromClobberingStore still do nothing for scalable vectors, as Offsets and mismatching types are not supported.


Patch is 43.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123984.diff

3 Files Affected:

  • (modified) llvm/lib/Transforms/Utils/VNCoercion.cpp (+23-7)
  • (modified) llvm/test/Transforms/GVN/vscale.ll (+222)
  • (added) llvm/test/Transforms/NewGVN/vscale.ll (+617)
diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
index 1e0ae280516410..b859feebe4ef9b 100644
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -21,6 +21,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
   if (StoredTy == LoadTy)
     return true;
 
+  if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(StoredTy) &&
+      StoredTy->getPrimitiveSizeInBits() == LoadTy->getPrimitiveSizeInBits())
+    return true;
+
   // If the loaded/stored value is a first class array/struct, or scalable type,
   // don't try to transform them. We need to be able to bitcast to integer.
   if (isFirstClassAggregateOrScalableType(LoadTy) ||
@@ -83,8 +87,8 @@ Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
   // If this is already the right type, just return it.
   Type *StoredValTy = StoredVal->getType();
 
-  uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedValue();
-  uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedValue();
+  TypeSize StoredValSize = DL.getTypeSizeInBits(StoredValTy);
+  TypeSize LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
 
   // If the store and reload are the same size, we can always reuse it.
   if (StoredValSize == LoadedValSize) {
@@ -118,7 +122,8 @@ Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
   // If the loaded value is smaller than the available value, then we can
   // extract out a piece from it.  If the available value is too small, then we
   // can't do anything.
-  assert(StoredValSize >= LoadedValSize &&
+  assert(!StoredValSize.isScalable() && !LoadedValSize.isScalable() &&
+         StoredValSize >= LoadedValSize &&
          "canCoerceMustAliasedValueToLoad fail");
 
   // Convert source pointers to integers, which can be manipulated.
@@ -303,6 +308,13 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
     return SrcVal;
   }
 
+  // Return scalable values directly to avoid needing to bitcast to integer
+  // types, as we do not support non-zero Offsets.
+  if (isa<ScalableVectorType>(LoadTy)) {
+    assert(Offset == 0 && "Expected a zero offset for scalable types");
+    return SrcVal;
+  }
+
   uint64_t StoreSize =
       (DL.getTypeSizeInBits(SrcVal->getType()).getFixedValue() + 7) / 8;
   uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedValue() + 7) / 8;
@@ -333,11 +345,15 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
 
 Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
                        Instruction *InsertPt, const DataLayout &DL) {
-
 #ifndef NDEBUG
-  unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType()).getFixedValue();
-  unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedValue();
-  assert(Offset + LoadSize <= SrcValSize);
+  TypeSize SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
+  TypeSize LoadSize = DL.getTypeStoreSize(LoadTy);
+  assert(SrcValSize.isScalable() == LoadSize.isScalable());
+  assert((SrcValSize.isScalable() || Offset + LoadSize <= SrcValSize) &&
+         "Expected Offset + LoadSize <= SrcValSize");
+  assert(
+      (!SrcValSize.isScalable() || (Offset == 0 && LoadSize == SrcValSize)) &&
+      "Expected scalable type sizes to match");
 #endif
   IRBuilder<> Builder(InsertPt);
   SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll
index 71adaed8e5722b..928fd77aaa8baa 100644
--- a/llvm/test/Transforms/GVN/vscale.ll
+++ b/llvm/test/Transforms/GVN/vscale.ll
@@ -387,3 +387,225 @@ if.then:
 if.else:
   ret void
 }
+
+; Different sizes / types
+
+define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x)  {
+; CHECK-LABEL: @load_v16i8_store_v4i32_forward_load(
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = bitcast <vscale x 4 x i32> [[X]] to <vscale x 16 x i8>
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[LOAD]]
+;
+  store <vscale x 4 x i32> %x, ptr %p
+  %load = load <vscale x 16 x i8>, ptr %p
+  ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 4 x float> @load_v4f32_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x)  {
+; CHECK-LABEL: @load_v4f32_store_v4i32_forward_load(
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = bitcast <vscale x 4 x i32> [[X]] to <vscale x 4 x float>
+; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
+;
+  store <vscale x 4 x i32> %x, ptr %p
+  %load = load <vscale x 4 x float>, ptr %p
+  ret <vscale x 4 x float> %load
+}
+
+define <vscale x 4 x float> @load_v4f32_store_v16i8_forward_load(ptr %p, <vscale x 16 x i8> %x)  {
+; CHECK-LABEL: @load_v4f32_store_v16i8_forward_load(
+; CHECK-NEXT:    store <vscale x 16 x i8> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = bitcast <vscale x 16 x i8> [[X]] to <vscale x 4 x float>
+; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
+;
+  store <vscale x 16 x i8> %x, ptr %p
+  %load = load <vscale x 4 x float>, ptr %p
+  ret <vscale x 4 x float> %load
+}
+
+define <vscale x 4 x i32> @load_v4i32_store_v4f32_forward_load(ptr %p, <vscale x 4 x float> %x)  {
+; CHECK-LABEL: @load_v4i32_store_v4f32_forward_load(
+; CHECK-NEXT:    store <vscale x 4 x float> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = bitcast <vscale x 4 x float> [[X]] to <vscale x 4 x i32>
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[LOAD]]
+;
+  store <vscale x 4 x float> %x, ptr %p
+  %load = load <vscale x 4 x i32>, ptr %p
+  ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x i32> @load_v4i32_store_v4i64_forward_load(ptr %p, <vscale x 4 x i64> %x)  {
+; CHECK-LABEL: @load_v4i32_store_v4i64_forward_load(
+; CHECK-NEXT:    store <vscale x 4 x i64> [[X:%.*]], ptr [[P:%.*]], align 32
+; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[LOAD]]
+;
+  store <vscale x 4 x i64> %x, ptr %p
+  %load = load <vscale x 4 x i32>, ptr %p
+  ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x i64> @load_v4i64_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x)  {
+; CHECK-LABEL: @load_v4i64_store_v4i32_forward_load(
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[P]], align 32
+; CHECK-NEXT:    ret <vscale x 4 x i64> [[LOAD]]
+;
+  store <vscale x 4 x i32> %x, ptr %p
+  %load = load <vscale x 4 x i64>, ptr %p
+  ret <vscale x 4 x i64> %load
+}
+
+define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x)  {
+; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load(
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[P]], align 8
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[LOAD]]
+;
+  store <vscale x 4 x i32> %x, ptr %p
+  %load = load <vscale x 2 x i32>, ptr %p
+  ret <vscale x 2 x i32> %load
+}
+
+define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load_offsets(ptr %p, <vscale x 4 x i32> %x)  {
+; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load_offsets(
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[Q:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[P]], i64 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[Q]], align 8
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[LOAD]]
+;
+  store <vscale x 4 x i32> %x, ptr %p
+  %q = getelementptr <vscale x 2 x i32>, ptr %p, i64 1
+  %load = load <vscale x 2 x i32>, ptr %q
+  ret <vscale x 2 x i32> %load
+}
+
+define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load_offsetc(ptr %p, <vscale x 4 x i32> %x)  {
+; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load_offsetc(
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[Q:%.*]] = getelementptr <2 x i32>, ptr [[P]], i64 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[Q]], align 8
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[LOAD]]
+;
+  store <vscale x 4 x i32> %x, ptr %p
+  %q = getelementptr <2 x i32>, ptr %p, i64 1
+  %load = load <vscale x 2 x i32>, ptr %q
+  ret <vscale x 2 x i32> %load
+}
+
+define <vscale x 2 x ptr> @load_v2p0_store_v4i32_forward_load_offsetc(ptr %p, <vscale x 4 x i32> %x)  {
+; CHECK-LABEL: @load_v2p0_store_v4i32_forward_load_offsetc(
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[Q:%.*]] = getelementptr <2 x ptr>, ptr [[P]], i64 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[Q]], align 16
+; CHECK-NEXT:    ret <vscale x 2 x ptr> [[LOAD]]
+;
+  store <vscale x 4 x i32> %x, ptr %p
+  %q = getelementptr <2 x ptr>, ptr %p, i64 1
+  %load = load <vscale x 2 x ptr>, ptr %q
+  ret <vscale x 2 x ptr> %load
+}
+
+define <vscale x 16 x i8> @load_nxv16i8_store_v4i32_forward_load(ptr %p, <4 x i32> %x)  {
+; CHECK-LABEL: @load_nxv16i8_store_v4i32_forward_load(
+; CHECK-NEXT:    store <4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[LOAD]]
+;
+  store <4 x i32> %x, ptr %p
+  %load = load <vscale x 16 x i8>, ptr %p
+  ret <vscale x 16 x i8> %load
+}
+
+define <16 x i8> @load_v16i8_store_nxv4i32_forward_load(ptr %p, <vscale x 4 x i32> %x)  {
+; CHECK-LABEL: @load_v16i8_store_nxv4i32_forward_load(
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT:    ret <16 x i8> [[LOAD]]
+;
+  store <vscale x 4 x i32> %x, ptr %p
+  %load = load <16 x i8>, ptr %p
+  ret <16 x i8> %load
+}
+
+define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_constant(ptr %p)  {
+; CHECK-LABEL: @load_v16i8_store_v4i32_forward_constant(
+; CHECK-NEXT:    store <vscale x 4 x i32> splat (i32 4), ptr [[P:%.*]], align 16
+; CHECK-NEXT:    ret <vscale x 16 x i8> bitcast (<vscale x 4 x i32> splat (i32 4) to <vscale x 16 x i8>)
+;
+  store <vscale x 4 x i32> splat (i32 4), ptr %p
+  %load = load <vscale x 16 x i8>, ptr %p
+  ret <vscale x 16 x i8> %load
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @bigexample({ <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a) vscale_range(1,16) {
+; CHECK-LABEL: @bigexample(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[REF_TMP:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[REF_TMP]])
+; CHECK-NEXT:    [[A_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A:%.*]], 0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A_ELT]], ptr [[REF_TMP]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[REF_TMP_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP1]]
+; CHECK-NEXT:    [[A_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A_ELT2]], ptr [[REF_TMP_REPACK1]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP0]], 5
+; CHECK-NEXT:    [[REF_TMP_REPACK3:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP3]]
+; CHECK-NEXT:    [[A_ELT4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 2
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A_ELT4]], ptr [[REF_TMP_REPACK3]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP0]], 48
+; CHECK-NEXT:    [[REF_TMP_REPACK5:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP5]]
+; CHECK-NEXT:    [[A_ELT6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 3
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A_ELT6]], ptr [[REF_TMP_REPACK5]], align 16
+; CHECK-NEXT:    [[DOTUNPACK:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[DOTUNPACK]], 0
+; CHECK-NEXT:    [[DOTUNPACK8:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT2]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[DOTUNPACK8]], 1
+; CHECK-NEXT:    [[DOTUNPACK10:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT4]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP9]], <vscale x 16 x i8> [[DOTUNPACK10]], 2
+; CHECK-NEXT:    [[DOTUNPACK12:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT6]] to <vscale x 16 x i8>
+; CHECK-NEXT:    [[TMP15:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP12]], <vscale x 16 x i8> [[DOTUNPACK12]], 3
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[REF_TMP]])
+; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP15]]
+;
+entry:
+  %ref.tmp = alloca { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
+  call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %ref.tmp)
+  %a.elt = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 0
+  store <vscale x 4 x i32> %a.elt, ptr %ref.tmp, align 16
+  %0 = call i64 @llvm.vscale.i64()
+  %1 = shl i64 %0, 4
+  %ref.tmp.repack1 = getelementptr inbounds i8, ptr %ref.tmp, i64 %1
+  %a.elt2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 1
+  store <vscale x 4 x i32> %a.elt2, ptr %ref.tmp.repack1, align 16
+  %2 = call i64 @llvm.vscale.i64()
+  %3 = shl i64 %2, 5
+  %ref.tmp.repack3 = getelementptr inbounds i8, ptr %ref.tmp, i64 %3
+  %a.elt4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 2
+  store <vscale x 4 x i32> %a.elt4, ptr %ref.tmp.repack3, align 16
+  %4 = call i64 @llvm.vscale.i64()
+  %5 = mul i64 %4, 48
+  %ref.tmp.repack5 = getelementptr inbounds i8, ptr %ref.tmp, i64 %5
+  %a.elt6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 3
+  store <vscale x 4 x i32> %a.elt6, ptr %ref.tmp.repack5, align 16
+  %.unpack = load <vscale x 16 x i8>, ptr %ref.tmp, align 16
+  %6 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %.unpack, 0
+  %7 = call i64 @llvm.vscale.i64()
+  %8 = shl i64 %7, 4
+  %.elt7 = getelementptr inbounds i8, ptr %ref.tmp, i64 %8
+  %.unpack8 = load <vscale x 16 x i8>, ptr %.elt7, align 16
+  %9 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, <vscale x 16 x i8> %.unpack8, 1
+  %10 = call i64 @llvm.vscale.i64()
+  %11 = shl i64 %10, 5
+  %.elt9 = getelementptr inbounds i8, ptr %ref.tmp, i64 %11
+  %.unpack10 = load <vscale x 16 x i8>, ptr %.elt9, align 16
+  %12 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %9, <vscale x 16 x i8> %.unpack10, 2
+  %13 = call i64 @llvm.vscale.i64()
+  %14 = mul i64 %13, 48
+  %.elt11 = getelementptr inbounds i8, ptr %ref.tmp, i64 %14
+  %.unpack12 = load <vscale x 16 x i8>, ptr %.elt11, align 16
+  %15 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %12, <vscale x 16 x i8> %.unpack12, 3
+  call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %ref.tmp)
+  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %15
+}
diff --git a/llvm/test/Transforms/NewGVN/vscale.ll b/llvm/test/Transforms/NewGVN/vscale.ll
new file mode 100644
index 00000000000000..fcfe57bbf02826
--- /dev/null
+++ b/llvm/test/Transforms/NewGVN/vscale.ll
@@ -0,0 +1,617 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -passes=newgvn,dce | FileCheck %s
+
+; Analyze Load from clobbering Load.
+
+define <vscale x 4 x i32> @load_store_clobber_load(ptr %p)  {
+; CHECK-LABEL: @load_store_clobber_load(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[P:%.*]], align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, ptr undef, align 16
+; CHECK-NEXT:    [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD1]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
+;
+  %load1 = load <vscale x 4 x i32>, ptr %p
+  store <vscale x 4 x i32> zeroinitializer, ptr undef
+  %load2 = load <vscale x 4 x i32>, ptr %p ; <- load to be eliminated
+  %add = add <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %add
+}
+
+define <vscale x 4 x i32> @load_store_clobber_load_mayalias(ptr %p, ptr %p2) {
+; CHECK-LABEL: @load_store_clobber_load_mayalias(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[P:%.*]], align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, ptr [[P2:%.*]], align 16
+; CHECK-NEXT:    [[LOAD2:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16
+; CHECK-NEXT:    [[SUB:%.*]] = sub <vscale x 4 x i32> [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[SUB]]
+;
+  %load1 = load <vscale x 4 x i32>, ptr %p
+  store <vscale x 4 x i32> zeroinitializer, ptr %p2
+  %load2 = load <vscale x 4 x i32>, ptr %p
+  %sub = sub <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %sub
+}
+
+define <vscale x 4 x i32> @load_store_clobber_load_noalias(ptr noalias %p, ptr noalias %p2) {
+; CHECK-LABEL: @load_store_clobber_load_noalias(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[P:%.*]], align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, ptr [[P2:%.*]], align 16
+; CHECK-NEXT:    [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD1]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
+;
+  %load1 = load <vscale x 4 x i32>, ptr %p
+  store <vscale x 4 x i32> zeroinitializer, ptr %p2
+  %load2 = load <vscale x 4 x i32>, ptr %p ; <- load to be eliminated
+  %add = add <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %add
+}
+
+; BasicAA return MayAlias for %gep1,%gep2, could improve as MustAlias.
+define i32 @load_clobber_load_gep1(ptr %p) {
+; CHECK-LABEL: @load_clobber_load_gep1(
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, ptr [[P]], i64 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[GEP2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %gep1 = getelementptr <vscale x 4 x i32>, ptr %p, i64 0, i64 1
+  %load1 = load i32, ptr %gep1
+  %gep2 = getelementptr i32, ptr %p, i64 1
+  %load2 = load i32, ptr %gep2 ; <- load could be eliminated
+  %add = add i32 %load1, %load2
+  ret i32 %add
+}
+
+define i32 @load_clobber_load_gep2(ptr %p) {
+; CHECK-LABEL: @load_clobber_load_gep2(
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P:%.*]], i64 1, i64 0
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, ptr [[P]], i64 4
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[GEP2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %gep1 = getelementptr <vscale x 4 x i32>, ptr %p, i64 1, i64 0
+  %load1 = load i32, ptr %gep1
+  %gep2 = getelementptr i32, ptr %p, i64 4
+  %load2 = load i32, ptr %gep2 ; <- can not determine at compile-time if %load1 and %load2 are same addr
+  %add = add i32 %load1, %load2
+  ret i32 %add
+}
+
+; TODO: BasicAA return MayAlias for %gep1,%gep2, could improve as MustAlias.
+define i32 @load_clobber_load_gep3(ptr %p) {
+; CHECK-LABE...
[truncated]


github-actions bot commented Jan 22, 2025

✅ With the latest revision this PR passed the undef deprecator.

@@ -21,6 +21,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
if (StoredTy == LoadTy)
return true;

if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(StoredTy) &&
StoredTy->getPrimitiveSizeInBits() == LoadTy->getPrimitiveSizeInBits())
Contributor

getPrimitiveSizeInBits doesn't support pointers, it's better to use DL. (In that case we should test proper generation of ptrtoint/inttoptr.)
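A hedged sketch of the DataLayout-based variant being suggested (illustrative only, not code from this PR; it assumes a DataLayout DL is reachable at this point in canCoerceMustAliasedValueToLoad):

  // Compare sizes via the DataLayout, which also gives a size for pointer
  // element types (getPrimitiveSizeInBits returns 0 for pointers).
  if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(LoadTy) &&
      DL.getTypeSizeInBits(StoredTy) == DL.getTypeSizeInBits(LoadTy))
    return true;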

@@ -118,7 +122,8 @@ Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
// If the loaded value is smaller than the available value, then we can
// extract out a piece from it. If the available value is too small, then we
// can't do anything.
assert(StoredValSize >= LoadedValSize &&
assert(!StoredValSize.isScalable() && !LoadedValSize.isScalable() &&
StoredValSize >= LoadedValSize &&
Contributor

Can you use TypeSize::isKnownGE() here instead of the scalable checks?

Collaborator Author

The intent is to check that we do not see any scalable vectors at this point; they should have been handled above.
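As a sketch of the distinction (illustrative, not code from the patch): TypeSize::isKnownGE would tolerate scalable sizes, whereas the explicit checks assert that no scalable values reach the bit-extraction path at all, since same-size scalable cases were already handled earlier in the function.

  // Alternative: would also accept scalable sizes.
  assert(TypeSize::isKnownGE(StoredValSize, LoadedValSize));
  // The patch instead documents that only fixed sizes get this far.
  assert(!StoredValSize.isScalable() && !LoadedValSize.isScalable() &&
         StoredValSize >= LoadedValSize);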

@davemgreen force-pushed the gh-gvn-scalablecoerce branch from 504d6cf to fda803c on January 23, 2025 at 08:06
Contributor

@nikic left a comment

LGTM, thanks!

@@ -21,6 +21,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
if (StoredTy == LoadTy)
return true;

if (isa<ScalableVectorType>(StoredTy) &&
Contributor

Do we have to check LoadTy as well? To e.g. make sure it doesn't have an aggregate wrapper around a single scalable type?

Collaborator Author

Thanks - I added a couple of extra tests for it too.
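For illustration of the concern (a hypothetical case, not necessarily one of the tests that were added): a load whose type merely wraps a scalable vector in an aggregate has the same store size as the vector itself, but it cannot be forwarded as a plain bitcast, since bitcast does not apply to aggregates.

  store <vscale x 4 x i32> %x, ptr %p
  %load = load { <vscale x 4 x i32> }, ptr %p   ; aggregate wrapper: not coercible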

This allows us to forward to a load even if the types do not match (nxv4i32 vs
nxv2i64 for example). Scalable types are allowed in
canCoerceMustAliasedValueToLoad so long as the size (minelts * scalarsize) is
the same, and some follow-on code is adjusted to make sure it handles scalable
sizes correctly. Methods like analyzeLoadFromClobberingWrite and
analyzeLoadFromClobberingStore still do nothing for scalable vectors, as
Offsets and mismatching types are not supported.
@davemgreen force-pushed the gh-gvn-scalablecoerce branch from fda803c to 7171d85 on January 23, 2025 at 17:59
@davemgreen merged commit 775d0f3 into llvm:main on Jan 23, 2025
5 of 7 checks passed
@davemgreen deleted the gh-gvn-scalablecoerce branch on January 23, 2025 at 18:43