[NFCi][MergeFunctions] Consolidate Hashing Functions

boomanaiden154 · boomanaiden154 · commit 28134a29fded · 2023-08-18T13:15:12.000-07:00
A couple years ago, StructuralHash was created, copying the exact hashing implementation from FunctionComparator (minus a couple small details/refactorings). Since then, the hashing implementation has not diverged, but several other areas, like unit testing, have diverged significantly, with StructuralHash getting more attention in these areas. This patch aims to consolidate the two hashing functions into StructuralHash given they do the exact same thing and having less divergence in areas like unit testing would be beneficial. The original aim at creating a separate StructuralHash was to make the implementation divergent and capture additional details like instruction operands (which neither hashing implementation does currently). The MergeFunctions pass doesn't need these detaisl, but verification of pass return values would benefit from this additional data. Setting an option to calculate these values would allow for divergent behavior where appropriate while reducing code duplication with little runtime overhead. Reviewed By: aeubanks Differential Revision: https://reviews.llvm.org/D158217
diff --git a/llvm/include/llvm/IR/StructuralHash.h b/llvm/include/llvm/IR/StructuralHash.h
@@ -21,8 +21,10 @@ namespace llvm {
 class Function;
 class Module;
 
-uint64_t StructuralHash(const Function &F);
-uint64_t StructuralHash(const Module &M);
+using IRHash = uint64_t;
+
+IRHash StructuralHash(const Function &F);
+IRHash StructuralHash(const Module &M);
 
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
@@ -99,11 +99,6 @@ class FunctionComparator {
   /// Test whether the two functions have equivalent behaviour.
   int compare();
 
-  /// Hash a function. Equivalent functions will have the same hash, and unequal
-  /// functions will have different hashes with high probability.
-  using FunctionHash = uint64_t;
-  static FunctionHash functionHash(Function &);
-
 protected:
   /// Start the comparison.
   void beginCompare() {
diff --git a/llvm/lib/IR/StructuralHash.cpp b/llvm/lib/IR/StructuralHash.cpp
@@ -27,24 +27,47 @@ class StructuralHashImpl {
 public:
   StructuralHashImpl() : Hash(4) {}
 
+  // A function hash is calculated by considering only the number of arguments
+  // and whether a function is varargs, the order of basic blocks (given by the
+  // successors of each basic block in depth first order), and the order of
+  // opcodes of each instruction within each of these basic blocks. This mirrors
+  // the strategy FunctionComparator::compare() uses to compare functions by
+  // walking the BBs in depth first order and comparing each instruction in
+  // sequence. Because this hash currently does not look at the operands, it is
+  // insensitive to things such as the target of calls and the constants used in
+  // the function, which makes it useful when possibly merging functions which
+  // are the same modulo constants and call targets.
+  //
+  // Note that different users of StructuralHash will want different behavior
+  // out of it (i.e., MergeFunctions will want something different from PM
+  // expensive checks for pass modification status). When modifying this
+  // function, most changes should be gated behind an option and enabled
+  // selectively.
   void update(const Function &F) {
     // Declarations don't affect analyses.
     if (F.isDeclaration())
       return;
 
-    hash(12345); // Function header
+    hash(0x6acaa36bef8325c5ULL); // Function header
 
     hash(F.isVarArg());
     hash(F.arg_size());
 
     SmallVector<const BasicBlock *, 8> BBs;
     SmallPtrSet<const BasicBlock *, 16> VisitedBBs;
 
+    // Walk the blocks in the same order as
+    // FunctionComparator::cmpBasicBlocks(), accumulating the hash of the
+    // function "structure." (BB and opcode sequence)
     BBs.push_back(&F.getEntryBlock());
     VisitedBBs.insert(BBs[0]);
     while (!BBs.empty()) {
       const BasicBlock *BB = BBs.pop_back_val();
-      hash(45798); // Block header
+
+      // This random value acts as a block header, as otherwise the partition of
+      // opcodes into BBs wouldn't affect the hash, only the order of the
+      // opcodes
+      hash(45798);
       for (auto &Inst : *BB)
         hash(Inst.getOpcode());
 
@@ -79,13 +102,13 @@ class StructuralHashImpl {
 
 } // namespace
 
-uint64_t llvm::StructuralHash(const Function &F) {
+IRHash llvm::StructuralHash(const Function &F) {
   StructuralHashImpl H;
   H.update(F);
   return H.getHash();
 }
 
-uint64_t llvm::StructuralHash(const Module &M) {
+IRHash llvm::StructuralHash(const Module &M) {
   StructuralHashImpl H;
   H.update(M);
   return H.getHash();
diff --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -107,6 +107,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/StructuralHash.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Use.h"
 #include "llvm/IR/User.h"
@@ -171,15 +172,14 @@ namespace {
 
 class FunctionNode {
   mutable AssertingVH<Function> F;
-  FunctionComparator::FunctionHash Hash;
+  IRHash Hash;
 
 public:
   // Note the hash is recalculated potentially multiple times, but it is cheap.
-  FunctionNode(Function *F)
-    : F(F), Hash(FunctionComparator::functionHash(*F))  {}
+  FunctionNode(Function *F) : F(F), Hash(StructuralHash(*F)) {}
 
   Function *getFunc() const { return F; }
-  FunctionComparator::FunctionHash getHash() const { return Hash; }
+  IRHash getHash() const { return Hash; }
 
   /// Replace the reference to the function F by the function G, assuming their
   /// implementations are equal.
@@ -390,11 +390,10 @@ bool MergeFunctions::runOnModule(Module &M) {
 
   // All functions in the module, ordered by hash. Functions with a unique
   // hash value are easily eliminated.
-  std::vector<std::pair<FunctionComparator::FunctionHash, Function *>>
-    HashedFuncs;
+  std::vector<std::pair<IRHash, Function *>> HashedFuncs;
   for (Function &Func : M) {
     if (isEligibleForMerging(Func)) {
-      HashedFuncs.push_back({FunctionComparator::functionHash(Func), &Func});
+      HashedFuncs.push_back({StructuralHash(Func), &Func});
     }
   }
 
diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -958,67 +958,3 @@ int FunctionComparator::compare() {
   }
   return 0;
 }
-
-namespace {
-
-// Accumulate the hash of a sequence of 64-bit integers. This is similar to a
-// hash of a sequence of 64bit ints, but the entire input does not need to be
-// available at once. This interface is necessary for functionHash because it
-// needs to accumulate the hash as the structure of the function is traversed
-// without saving these values to an intermediate buffer. This form of hashing
-// is not often needed, as usually the object to hash is just read from a
-// buffer.
-class HashAccumulator64 {
-  uint64_t Hash;
-
-public:
-  // Initialize to random constant, so the state isn't zero.
-  HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; }
-
-  void add(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); }
-
-  // No finishing is required, because the entire hash value is used.
-  uint64_t getHash() { return Hash; }
-};
-
-} // end anonymous namespace
-
-// A function hash is calculated by considering only the number of arguments and
-// whether a function is varargs, the order of basic blocks (given by the
-// successors of each basic block in depth first order), and the order of
-// opcodes of each instruction within each of these basic blocks. This mirrors
-// the strategy compare() uses to compare functions by walking the BBs in depth
-// first order and comparing each instruction in sequence. Because this hash
-// does not look at the operands, it is insensitive to things such as the
-// target of calls and the constants used in the function, which makes it useful
-// when possibly merging functions which are the same modulo constants and call
-// targets.
-FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
-  HashAccumulator64 H;
-  H.add(F.isVarArg());
-  H.add(F.arg_size());
-
-  SmallVector<const BasicBlock *, 8> BBs;
-  SmallPtrSet<const BasicBlock *, 16> VisitedBBs;
-
-  // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(),
-  // accumulating the hash of the function "structure." (BB and opcode sequence)
-  BBs.push_back(&F.getEntryBlock());
-  VisitedBBs.insert(BBs[0]);
-  while (!BBs.empty()) {
-    const BasicBlock *BB = BBs.pop_back_val();
-    // This random value acts as a block header, as otherwise the partition of
-    // opcodes into BBs wouldn't affect the hash, only the order of the opcodes
-    H.add(45798);
-    for (const auto &Inst : *BB) {
-      H.add(Inst.getOpcode());
-    }
-    const Instruction *Term = BB->getTerminator();
-    for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
-      if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
-        continue;
-      BBs.push_back(Term->getSuccessor(i));
-    }
-  }
-  return H.getHash();
-}