Improvements to RS4GC BDV Algorithm (#69795)

zduka · Petr Maj · web-flow · commit a1358225c5b7 · 2023-11-02T20:19:40.000-04:00
Previously, after the algorithm fixpointed, the state was manually
patched by emitting BDVs for EE instructions earlier, while marking some
(but not all) vector and vector&lt;-&gt;scalar instructions as conflict. This
causes issues as not all instructions that required BDVs had them
emitted and due to after-fixpoint patching, the extra BDVs did not
propagate to their users.

This change fixes both by rewriting the logic for BDV insertion &amp;
patching. Instead of inserting the BDV for EE earlier, it merely marks
every EE instruction as a conflict. The two phase insertion algorithm
(first insert empty instructions and patch the BDVState, then actually
connect the BDV instructions to their input bases) then ensures correct
propagation to all its users. Furthermore the shufflevector instruction
as well as all instances of IE instruction are conservatively marked as
conflicts as well, fixing the second problem.

This change does not fix the handling of constant values and vectors in
the BDV. 

---------

Co-authored-by: Petr Maj &lt;pmaj@azul.com&gt;
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -993,7 +993,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache,
         NewState.meet(OpState);
       });
 
-      BDVState OldState = States[BDV];
+      BDVState OldState = Pair.second;
       if (OldState != NewState) {
         Progress = true;
         States[BDV] = NewState;
@@ -1012,8 +1012,44 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache,
   }
 #endif
 
-  // Handle all instructions that have a vector BDV, but the instruction itself
-  // is of scalar type.
+  // Even though we have identified a concrete base (or a conflict) for all live
+  // pointers at this point, there are cases where the base is of an
+  // incompatible type compared to the original instruction. We conservatively
+  // mark those as conflicts to ensure that corresponding BDVs will be generated
+  // in the next steps.
+
+  // this is a rather explicit check for all cases where we should mark the
+  // state as a conflict to force the latter stages of the algorithm to emit
+  // the BDVs.
+  // TODO: in many cases the instructions emited for the conflicting states
+  // will be identical to the I itself (if the I's operate on their BDVs
+  // themselves). We should expoit this, but can't do it here since it would
+  // break the invariant about the BDVs not being known to be a base.
+  // TODO: the code also does not handle constants at all - the algorithm relies
+  // on all constants having the same BDV and therefore constant-only insns
+  // will never be in conflict, but this check is ignored here. If the
+  // constant conflicts will be to BDVs themselves, they will be identical
+  // instructions and will get optimized away (as in the above TODO)
+  auto MarkConflict = [&](Instruction *I, Value *BaseValue) {
+    // II and EE mixes vector & scalar so is always a conflict
+    if (isa<InsertElementInst>(I) || isa<ExtractElementInst>(I))
+      return true;
+    // Shuffle vector is always a conflict as it creates new vector from
+    // existing ones.
+    if (isa<ShuffleVectorInst>(I))
+      return true;
+    // Any  instructions where the computed base type differs from the
+    // instruction type. An example is where an extract instruction is used by a
+    // select. Here the select's BDV is a vector (because of extract's BDV),
+    // while the select itself is a scalar type. Note that the IE and EE
+    // instruction check is not fully subsumed by the vector<->scalar check at
+    // the end, this is due to the BDV algorithm being ignorant of BDV types at
+    // this junction.
+    if (!areBothVectorOrScalar(BaseValue, I))
+      return true;
+    return false;
+  };
+
   for (auto Pair : States) {
     Instruction *I = cast<Instruction>(Pair.first);
     BDVState State = Pair.second;
@@ -1026,30 +1062,13 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache,
         "why did it get added?");
     assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
 
-    if (!State.isBase() || !isa<VectorType>(BaseValue->getType()))
+    // since we only mark vec-scalar insns as conflicts in the pass, our work is
+    // done if the instruction already conflicts
+    if (State.isConflict())
       continue;
-    // extractelement instructions are a bit special in that we may need to
-    // insert an extract even when we know an exact base for the instruction.
-    // The problem is that we need to convert from a vector base to a scalar
-    // base for the particular indice we're interested in.
-    if (isa<ExtractElementInst>(I)) {
-      auto *EE = cast<ExtractElementInst>(I);
-      // TODO: In many cases, the new instruction is just EE itself.  We should
-      // exploit this, but can't do it here since it would break the invariant
-      // about the BDV not being known to be a base.
-      auto *BaseInst = ExtractElementInst::Create(
-          State.getBaseValue(), EE->getIndexOperand(), "base_ee", EE);
-      BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
-      States[I] = BDVState(I, BDVState::Base, BaseInst);
-      setKnownBase(BaseInst, /* IsKnownBase */true, KnownBases);
-    } else if (!isa<VectorType>(I->getType())) {
-      // We need to handle cases that have a vector base but the instruction is
-      // a scalar type (these could be phis or selects or any instruction that
-      // are of scalar type, but the base can be a vector type).  We
-      // conservatively set this as conflict.  Setting the base value for these
-      // conflicts is handled in the next loop which traverses States.
+
+    if (MarkConflict(I, BaseValue))
       States[I] = BDVState(I, BDVState::Conflict);
-    }
   }
 
 #ifndef NDEBUG
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/base-vector.ll b/llvm/test/Transforms/RewriteStatepointsForGC/base-vector.ll
@@ -352,3 +352,36 @@ entry:
   call void @use_vec(<4 x ptr addrspace(1)> %vec)
   ret void
 }
+
+define i32 @test13() gc "statepoint-example" {
+; CHECK-LABEL: define i32 @test13() gc "statepoint-example" {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i8, <2 x ptr addrspace(1)> zeroinitializer, <2 x i64> <i64 948, i64 896>
+; CHECK-NEXT:    [[SHUFFLEVECTOR_BASE:%.*]] = shufflevector <2 x ptr addrspace(1)> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer, <4 x i32> <i32 0, i32 3, i32 1, i32 2>, !is_base_value !0
+; CHECK-NEXT:    [[SHUFFLEVECTOR:%.*]] = shufflevector <2 x ptr addrspace(1)> zeroinitializer, <2 x ptr addrspace(1)> [[GETELEMENTPTR]], <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[PHI_BASE:%.*]] = phi <4 x ptr addrspace(1)> [ [[SHUFFLEVECTOR_BASE]], [[BB:%.*]] ], [ zeroinitializer, [[BB1]] ], !is_base_value !0
+; CHECK-NEXT:    [[PHI:%.*]] = phi <4 x ptr addrspace(1)> [ [[SHUFFLEVECTOR]], [[BB]] ], [ zeroinitializer, [[BB1]] ]
+; CHECK-NEXT:    [[EXTRACTELEMENT_BASE:%.*]] = extractelement <4 x ptr addrspace(1)> [[PHI_BASE]], i32 0, !is_base_value !0
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(1)> [[PHI]], i32 0
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(i32 ()) @spam, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(ptr addrspace(1) [[EXTRACTELEMENT]], ptr addrspace(1) [[EXTRACTELEMENT_BASE]]) ]
+; CHECK-NEXT:    [[EXTRACTELEMENT_RELOCATED:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN]], i32 1, i32 0)
+; CHECK-NEXT:    [[EXTRACTELEMENT_BASE_RELOCATED:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN]], i32 1, i32 1)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[EXTRACTELEMENT_RELOCATED]], align 4
+; CHECK-NEXT:    br label [[BB1]]
+;
+bb:
+  %getelementptr = getelementptr i8, <2 x ptr addrspace(1)> zeroinitializer, <2 x i64> <i64 948, i64 896>
+  %shufflevector = shufflevector <2 x ptr addrspace(1)> zeroinitializer, <2 x ptr addrspace(1)> %getelementptr, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %phi = phi <4 x ptr addrspace(1)> [ %shufflevector, %bb ], [ zeroinitializer, %bb1 ]
+  %extractelement = extractelement <4 x ptr addrspace(1)> %phi, i32 0
+  %call = call i32 @spam()
+  %load = load i32, ptr addrspace(1) %extractelement, align 4
+  br label %bb1
+}
+
+declare i32 @spam() gc "statepoint-example"
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/constants.ll b/llvm/test/Transforms/RewriteStatepointsForGC/constants.ll
@@ -262,3 +262,16 @@ entry:
   ; CHECK-DAG: call {{.*}}gc.relocate{{.*}}(%val.base, %val)
   ret <2 x ptr addrspace(1)> %val
 }
+
+define ptr addrspace(1) @test14() gc "statepoint-example" {
+; CHECK-LABEL: @test14
+; FIXME: this is a extract element of constant vector and as such should not be relocated, but our current code relocates it since the extractelement will conservatively be marked as conflict during the BDV insertion algorithm. In many cases the extra instruction will be identical to the one already present and will be removed by later phases. 
+entry:
+  %val = extractelement <2 x ptr addrspace(1)> <ptr addrspace(1) inttoptr (i64 5 to ptr addrspace(1)),
+                                                ptr addrspace(1) inttoptr (i64 15 to ptr addrspace(1))>, i32 0
+  ; CHECK: gc.statepoint
+  call void @foo() [ "deopt"() ]
+  ; CHECK-DAG: call {{.*}}gc.relocate{{.*}}(%val.base, %val.base)
+  ; CHECK-DAG: call {{.*}}gc.relocate{{.*}}(%val.base, %val)
+  ret ptr addrspace(1) %val
+}
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/insert-extract.ll b/llvm/test/Transforms/RewriteStatepointsForGC/insert-extract.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S 2>&1 | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @barney() gc "statepoint-example" {
+; CHECK-LABEL: define void @barney() gc "statepoint-example" {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i8, <2 x ptr addrspace(1)> zeroinitializer, <2 x i64> <i64 16, i64 8>
+; CHECK-NEXT:    [[EXTRACTELEMENT_BASE:%.*]] = extractelement <2 x ptr addrspace(1)> zeroinitializer, i32 0, !is_base_value !0
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <2 x ptr addrspace(1)> [[GETELEMENTPTR]], i32 0
+; CHECK-NEXT:    [[INSERTELEMENT_BASE:%.*]] = insertelement <2 x ptr addrspace(1)> zeroinitializer, ptr addrspace(1) [[EXTRACTELEMENT_BASE]], i32 0, !is_base_value !0
+; CHECK-NEXT:    [[INSERTELEMENT:%.*]] = insertelement <2 x ptr addrspace(1)> zeroinitializer, ptr addrspace(1) [[EXTRACTELEMENT]], i32 0
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(i8 ()) @foo, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(<2 x ptr addrspace(1)> [[INSERTELEMENT]], <2 x ptr addrspace(1)> [[INSERTELEMENT_BASE]]) ]
+; CHECK-NEXT:    [[INSERTELEMENT_RELOCATED:%.*]] = call coldcc <2 x ptr addrspace(1)> @llvm.experimental.gc.relocate.v2p1(token [[STATEPOINT_TOKEN]], i32 1, i32 0)
+; CHECK-NEXT:    [[INSERTELEMENT_BASE_RELOCATED:%.*]] = call coldcc <2 x ptr addrspace(1)> @llvm.experimental.gc.relocate.v2p1(token [[STATEPOINT_TOKEN]], i32 1, i32 1)
+; CHECK-NEXT:    [[EXTRACTELEMENT1:%.*]] = extractelement <2 x ptr addrspace(1)> [[INSERTELEMENT_RELOCATED]], i32 0
+; CHECK-NEXT:    ret void
+;
+bb:
+  %getelementptr = getelementptr i8, <2 x ptr addrspace(1)> zeroinitializer, <2 x i64> <i64 16, i64 8>
+  %extractelement = extractelement <2 x ptr addrspace(1)> %getelementptr, i32 0
+  %insertelement = insertelement <2 x ptr addrspace(1)> zeroinitializer, ptr addrspace(1) %extractelement, i32 0
+  %call = call i8 @foo()
+  %extractelement1 = extractelement <2 x ptr addrspace(1)> %insertelement, i32 0
+  ret void
+}
+
+
+; same as above, but ensures that transitive uses of insertelement emit proper code as well
+define void @bart() gc "statepoint-example" {
+; CHECK-LABEL: define void @bart() gc "statepoint-example" {
+; CHECK-NEXT:  always_continue:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, <2 x ptr addrspace(1)> zeroinitializer, <2 x i64> <i64 16, i64 8>
+; CHECK-NEXT:    [[BASE_EE:%.*]] = extractelement <2 x ptr addrspace(1)> zeroinitializer, i32 0, !is_base_value !0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x ptr addrspace(1)> [[TMP0]], i32 0
+; CHECK-NEXT:    [[BASE_IE:%.*]] = insertelement <2 x ptr addrspace(1)> zeroinitializer, ptr addrspace(1) [[BASE_EE]], i32 0, !is_base_value !0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x ptr addrspace(1)> zeroinitializer, ptr addrspace(1) [[TMP1]], i32 0
+; CHECK-NEXT:    [[OTHER_BASE:%.*]] = insertelement <2 x ptr addrspace(1)> [[BASE_IE]], ptr addrspace(1) [[BASE_EE]], i32 0, !is_base_value !0
+; CHECK-NEXT:    [[OTHER:%.*]] = insertelement <2 x ptr addrspace(1)> [[TMP2]], ptr addrspace(1) [[TMP1]], i32 0
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(i8 ()) @foo, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(<2 x ptr addrspace(1)> [[OTHER]], <2 x ptr addrspace(1)> [[OTHER_BASE]]) ]
+; CHECK-NEXT:    [[OTHER_RELOCATED:%.*]] = call coldcc <2 x ptr addrspace(1)> @llvm.experimental.gc.relocate.v2p1(token [[STATEPOINT_TOKEN]], i32 1, i32 0)
+; CHECK-NEXT:    [[OTHER_BASE_RELOCATED:%.*]] = call coldcc <2 x ptr addrspace(1)> @llvm.experimental.gc.relocate.v2p1(token [[STATEPOINT_TOKEN]], i32 1, i32 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x ptr addrspace(1)> [[OTHER_RELOCATED]], i32 0
+; CHECK-NEXT:    ret void
+;
+always_continue:
+  %0 = getelementptr i8, <2 x ptr addrspace(1)> zeroinitializer, <2 x i64> <i64 16, i64 8>
+  %1 = extractelement <2 x ptr addrspace(1)> %0, i32 0
+  %2 = insertelement <2 x ptr addrspace(1)> zeroinitializer, ptr addrspace(1) %1, i32 0
+  %other = insertelement <2 x ptr addrspace(1)> %2, ptr addrspace(1) %1, i32 0
+  %3 = call i8 @foo()
+  %4 = extractelement <2 x ptr addrspace(1)> %other, i32 0
+  ret void
+}
+
+declare i8 @foo()
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/vector-bitcast.ll b/llvm/test/Transforms/RewriteStatepointsForGC/vector-bitcast.ll
@@ -14,7 +14,7 @@ define i32 @test() gc "statepoint-example" {
 entry:
 ; CHECK-LABEL: entry
 ; CHECK: %bc = bitcast
-; CHECK: %[[p1:[A-Za-z0-9_]+]] = extractelement
+; CHECK: %[[p1:[A-Za-z0-9_.]+]] = extractelement
 ; CHECK: %[[p2:[A-Za-z0-9_]+]] = extractelement
 ; CHECK: llvm.experimental.gc.statepoint
 ; CHECK: %[[p2]].relocated = {{.+}} @llvm.experimental.gc.relocate