|
| 1 | +//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | +// |
| 9 | +// \file |
| 10 | +// Interface file for the IRSimilarityIdentifier for identifying similarities in |
| 11 | +// IR including the IRInstructionMapper, which maps an Instruction to unsigned |
| 12 | +// integers. |
| 13 | +// |
| 14 | +// Two sequences of instructions are called "similar" if they perform the same |
| 15 | +// series of operations for all inputs. |
| 16 | +// |
| 17 | +// \code |
| 18 | +// %1 = add i32 %a, 10 |
| 19 | +// %2 = add i32 %a, %1 |
| 20 | +// %3 = icmp slt i32 %1, %2 |
| 21 | +// \endcode |
| 22 | +// |
| 23 | +// and |
| 24 | +// |
| 25 | +// \code |
| 26 | +// %1 = add i32 11, %a |
| 27 | +// %2 = sub i32 %a, %1 |
| 28 | +// %3 = icmp sgt i32 %2, %1 |
| 29 | +// \endcode |
| 30 | +// |
| 31 | +// ultimately have the same result, even if the inputs, and structure are |
| 32 | +// slightly different. |
| 33 | +// |
| 34 | +// For instructions, we do not worry about operands that do not have fixed |
| 35 | +// semantic meaning to the program. We consider the opcode that the instruction |
| 36 | +// has, the types, parameters, and extra information such as the function name, |
| 37 | +// or comparison predicate. These are used to create a hash to map instructions |
| 38 | +// to integers to be used in similarity matching in sequences of instructions. |
| 39 | +// |
| 40 | +//===----------------------------------------------------------------------===// |
| 41 | + |
| 42 | +#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H |
| 43 | +#define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H |
| 44 | + |
| 45 | +#include "llvm/IR/InstVisitor.h" |
| 46 | +#include "llvm/IR/Instructions.h" |
| 47 | +#include "llvm/IR/Module.h" |
| 48 | +#include "llvm/Support/Allocator.h" |
| 49 | + |
| 50 | +namespace llvm { |
| 51 | +namespace IRSimilarity { |
| 52 | + |
/// Classification of an Instruction for the purposes of similarity detection.
///
/// Every Instruction falls into exactly one of the three categories below;
/// the category decides how (and whether) it takes part in similarity
/// matching.
// TODO: Shared with MachineOutliner
enum InstrType {
  /// The Instruction is considered when looking at similarity between
  /// Instructions.
  Legal,
  /// The Instruction cannot be considered when looking for similarity between
  /// Instructions. It acts as a boundary between similarity regions.
  Illegal,
  /// The Instruction is skipped over during analysis.
  Invisible
};
| 65 | + |
| 66 | +/// This provides the utilities for hashing an Instruction to an unsigned |
| 67 | +/// integer. Two IRInstructionDatas produce the same hash value when their |
| 68 | +/// underlying Instructions perform the same operation (even if they don't have |
| 69 | +/// the same input operands.) |
| 70 | +/// As a more concrete example, consider the following: |
| 71 | +/// |
| 72 | +/// \code |
| 73 | +/// %add1 = add i32 %a, %b |
| 74 | +/// %add2 = add i32 %c, %d |
| 75 | +/// %add3 = add i64 %e, %f |
| 76 | +/// \endcode |
| 77 | +/// |
| 78 | +// Then the IRInstructionData wrappers for these Instructions may be hashed like |
| 79 | +/// so: |
| 80 | +/// |
| 81 | +/// \code |
| 82 | +/// ; These two adds have the same types and operand types, so they hash to the |
| 83 | +/// ; same number. |
| 84 | +/// %add1 = add i32 %a, %b ; Hash: 1 |
| 85 | +/// %add2 = add i32 %c, %d ; Hash: 1 |
| 86 | +/// ; This add produces an i64. This differentiates it from %add1 and %add2. So, |
| 87 | +/// ; it hashes to a different number. |
| 88 | +/// %add3 = add i64 %e, %f; Hash: 2 |
| 89 | +/// \endcode |
| 90 | +/// |
| 91 | +/// |
| 92 | +/// This hashing scheme will be used to represent the program as a very long |
| 93 | +/// string. This string can then be placed in a data structure which can be used |
| 94 | +/// for similarity queries. |
| 95 | +/// |
| 96 | +/// TODO: Handle types of Instructions which can be equal even with different |
| 97 | +/// operands. (E.g. comparisons with swapped predicates.) |
| 98 | +/// TODO: Handle CallInsts, which are only checked for function type |
| 99 | +/// by \ref isSameOperationAs. |
| 100 | +/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the |
| 101 | +/// exact same, and some do not. |
| 102 | +struct IRInstructionData : ilist_node<IRInstructionData> { |
| 103 | + |
| 104 | + /// The source Instruction that is being wrapped. |
| 105 | + Instruction *Inst = nullptr; |
| 106 | + /// The values of the operands in the Instruction. |
| 107 | + SmallVector<Value *, 4> OperVals; |
| 108 | + /// The legality of the wrapped instruction. This is informed by InstrType, |
| 109 | + /// and is used when checking when two instructions are considered similar. |
| 110 | + /// If either instruction is not legal, the instructions are automatically not |
| 111 | + /// considered similar. |
| 112 | + bool Legal; |
| 113 | + |
| 114 | + /// Gather the information that is difficult to gather for an Instruction, or |
| 115 | + /// is changed. i.e. the operands of an Instruction and the Types of those |
| 116 | + /// operands. This extra information allows for similarity matching to make |
| 117 | + /// assertions that allow for more flexibility when checking for whether an |
| 118 | + /// Instruction performs the same operation. |
| 119 | + IRInstructionData(Instruction &I, bool Legality); |
| 120 | + |
| 121 | + /// Hashes \p Value based on its opcode, types, and operand types. |
| 122 | + /// Two IRInstructionData instances produce the same hash when they perform |
| 123 | + /// the same operation. |
| 124 | + /// |
| 125 | + /// As a simple example, consider the following instructions. |
| 126 | + /// |
| 127 | + /// \code |
| 128 | + /// %add1 = add i32 %x1, %y1 |
| 129 | + /// %add2 = add i32 %x2, %y2 |
| 130 | + /// |
| 131 | + /// %sub = sub i32 %x1, %y1 |
| 132 | + /// |
| 133 | + /// %add_i64 = add i64 %x2, %y2 |
| 134 | + /// \endcode |
| 135 | + /// |
| 136 | + /// Because the first two adds operate the same types, and are performing the |
| 137 | + /// same action, they will be hashed to the same value. |
| 138 | + /// |
| 139 | + /// However, the subtraction instruction is not the same as an addition, and |
| 140 | + /// will be hashed to a different value. |
| 141 | + /// |
| 142 | + /// Finally, the last add has a different type compared to the first two add |
| 143 | + /// instructions, so it will also be hashed to a different value that any of |
| 144 | + /// the previous instructions. |
| 145 | + /// |
| 146 | + /// \param [in] Value - The IRInstructionData instance to be hashed. |
| 147 | + /// \returns A hash_value of the IRInstructionData. |
| 148 | + friend hash_code hash_value(const IRInstructionData &ID) { |
| 149 | + SmallVector<Type *, 4> OperTypes; |
| 150 | + for (Value *V : ID.OperVals) |
| 151 | + OperTypes.push_back(V->getType()); |
| 152 | + |
| 153 | + return hash_combine( |
| 154 | + hash_value(ID.Inst->getOpcode()), hash_value(ID.Inst->getType()), |
| 155 | + hash_combine_range(OperTypes.begin(), OperTypes.end())); |
| 156 | + } |
| 157 | +}; |
| 158 | + |
/// Compare one IRInstructionData class to another IRInstructionData class for
/// whether they are performing the same operation, and can be mapped to the
/// same value. For regular instructions if the hash value is the same, then
/// they will also be close.
///
/// \param A - The first IRInstructionData class to compare
/// \param B - The second IRInstructionData class to compare
/// \returns true if \p A and \p B are similar enough to be mapped to the same
/// value.
bool isClose(const IRInstructionData &A, const IRInstructionData &B);
| 169 | + |
| 170 | +struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> { |
| 171 | + static inline IRInstructionData *getEmptyKey() { return nullptr; } |
| 172 | + static inline IRInstructionData *getTombstoneKey() { |
| 173 | + return reinterpret_cast<IRInstructionData *>(-1); |
| 174 | + } |
| 175 | + |
| 176 | + static unsigned getHashValue(const IRInstructionData *E) { |
| 177 | + using llvm::hash_value; |
| 178 | + assert(E && "IRInstructionData is a nullptr?"); |
| 179 | + return hash_value(*E); |
| 180 | + } |
| 181 | + |
| 182 | + static bool isEqual(const IRInstructionData *LHS, |
| 183 | + const IRInstructionData *RHS) { |
| 184 | + if (RHS == getEmptyKey() || RHS == getTombstoneKey() || |
| 185 | + LHS == getEmptyKey() || LHS == getTombstoneKey()) |
| 186 | + return LHS == RHS; |
| 187 | + |
| 188 | + assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?"); |
| 189 | + return isClose(*LHS, *RHS); |
| 190 | + } |
| 191 | +}; |
| 192 | + |
| 193 | +/// Helper struct for converting the Instructions in a Module into a vector of |
| 194 | +/// unsigned integers. This vector of unsigned integers can be thought of as a |
| 195 | +/// "numeric string". This numeric string can then be queried by, for example, |
| 196 | +/// data structures that find repeated substrings. |
| 197 | +/// |
| 198 | +/// This hashing is done per BasicBlock in the module. To hash Instructions |
| 199 | +/// based off of their operations, each Instruction is wrapped in an |
| 200 | +/// IRInstructionData struct. The unsigned integer for an IRInstructionData |
| 201 | +/// depends on: |
| 202 | +/// - The hash provided by the IRInstructionData. |
| 203 | +/// - Which member of InstrType the IRInstructionData is classified as. |
| 204 | +// See InstrType for more details on the possible classifications, and how they |
| 205 | +// manifest in the numeric string. |
| 206 | +/// |
| 207 | +/// The numeric string for an individual BasicBlock is terminated by an unique |
| 208 | +/// unsigned integer. This prevents data structures which rely on repetition |
| 209 | +/// from matching across BasicBlocks. (For example, the SuffixTree.) |
| 210 | +/// As a concrete example, if we have the following two BasicBlocks: |
| 211 | +/// \code |
| 212 | +/// bb0: |
| 213 | +/// %add1 = add i32 %a, %b |
| 214 | +/// %add2 = add i32 %c, %d |
| 215 | +/// %add3 = add i64 %e, %f |
| 216 | +/// bb1: |
| 217 | +/// %sub = sub i32 %c, %d |
| 218 | +/// \endcode |
| 219 | +/// We may hash the Instructions like this (via IRInstructionData): |
| 220 | +/// \code |
| 221 | +/// bb0: |
| 222 | +/// %add1 = add i32 %a, %b ; Hash: 1 |
| 223 | +/// %add2 = add i32 %c, %d; Hash: 1 |
| 224 | +/// %add3 = add i64 %e, %f; Hash: 2 |
| 225 | +/// bb1: |
| 226 | +/// %sub = sub i32 %c, %d; Hash: 3 |
| 227 | +/// %add4 = add i32 %c, %d ; Hash: 1 |
| 228 | +/// \endcode |
| 229 | +/// And produce a "numeric string representation" like so: |
| 230 | +/// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2 |
| 231 | +/// |
| 232 | +/// TODO: This is very similar to the MachineOutliner, and should be |
| 233 | +/// consolidated into the same interface. |
| 234 | +struct IRInstructionMapper { |
| 235 | + /// The starting illegal instruction number to map to. |
| 236 | + /// |
| 237 | + /// Set to -3 for compatibility with DenseMapInfo<unsigned>. |
| 238 | + unsigned IllegalInstrNumber = static_cast<unsigned>(-3); |
| 239 | + |
| 240 | + /// The next available integer to assign to a legal Instruction to. |
| 241 | + unsigned LegalInstrNumber = 0; |
| 242 | + |
| 243 | + /// Correspondence from IRInstructionData to unsigned integers. |
| 244 | + DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits> |
| 245 | + InstructionIntegerMap; |
| 246 | + |
| 247 | + /// Set if we added an illegal number in the previous step. |
| 248 | + /// Since each illegal number is unique, we only need one of them between |
| 249 | + /// each range of legal numbers. This lets us make sure we don't add more |
| 250 | + /// than one illegal number per range. |
| 251 | + bool AddedIllegalLastTime = false; |
| 252 | + |
| 253 | + /// Marks whether we found a illegal instruction in the previous step. |
| 254 | + bool CanCombineWithPrevInstr = false; |
| 255 | + |
| 256 | + /// Marks whether we have found a set of instructions that is long enough |
| 257 | + /// to be considered for similarity. |
| 258 | + bool HaveLegalRange = false; |
| 259 | + |
| 260 | + /// This allocator pointer is in charge of holding on to the IRInstructionData |
| 261 | + /// so it is not deallocated until whatever external tool is using it is done |
| 262 | + /// with the information. |
| 263 | + BumpPtrAllocator *InstDataAllocator = nullptr; |
| 264 | + |
| 265 | + /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers |
| 266 | + /// determined by \p InstrType. Two Instructions are mapped to the same value |
| 267 | + /// if they are close as defined by the InstructionData class above. |
| 268 | + /// |
| 269 | + /// \param [in] BB - The BasicBlock to be mapped to integers. |
| 270 | + /// \param [in,out] InstrList - Vector of IRInstructionData to append to. |
| 271 | + /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to. |
| 272 | + void convertToUnsignedVec(BasicBlock &BB, |
| 273 | + std::vector<IRInstructionData *> &InstrList, |
| 274 | + std::vector<unsigned> &IntegerMapping); |
| 275 | + |
| 276 | + /// Maps an Instruction to a legal integer. |
| 277 | + /// |
| 278 | + /// \param [in] It - The Instruction to be mapped to an integer. |
| 279 | + /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to |
| 280 | + /// append to. |
| 281 | + /// \param [in,out] InstrList - Vector of InstructionData to append |
| 282 | + /// to. \returns The integer \p It was mapped to. |
| 283 | + unsigned mapToLegalUnsigned(BasicBlock::iterator &It, |
| 284 | + std::vector<unsigned> &IntegerMappingForBB, |
| 285 | + std::vector<IRInstructionData *> &InstrListForBB); |
| 286 | + |
| 287 | + /// Maps an Instruction to an illegal integer. |
| 288 | + /// |
| 289 | + /// \param [in] It - The \p Instruction to be mapped to an integer. |
| 290 | + /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to |
| 291 | + /// append to. |
| 292 | + /// \param [in,out] InstrList - Vector of IRInstructionData to append to. |
| 293 | + /// \param End - true if creating a dummy IRInstructionData at the end of a |
| 294 | + /// basic block. |
| 295 | + /// \returns The integer \p It was mapped to. |
| 296 | + unsigned mapToIllegalUnsigned( |
| 297 | + BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB, |
| 298 | + std::vector<IRInstructionData *> &InstrListForBB, bool End = false); |
| 299 | + |
| 300 | + IRInstructionMapper(BumpPtrAllocator *IDA) : InstDataAllocator(IDA) { |
| 301 | + // Make sure that the implementation of DenseMapInfo<unsigned> hasn't |
| 302 | + // changed. |
| 303 | + assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) && |
| 304 | + "DenseMapInfo<unsigned>'s empty key isn't -1!"); |
| 305 | + assert(DenseMapInfo<unsigned>::getTombstoneKey() == |
| 306 | + static_cast<unsigned>(-2) && |
| 307 | + "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); |
| 308 | + } |
| 309 | + |
| 310 | + /// Custom InstVisitor to classify different instructions for whether it can |
| 311 | + /// be analyzed for similarity. |
| 312 | + struct InstructionClassification |
| 313 | + : public InstVisitor<InstructionClassification, InstrType> { |
| 314 | + InstructionClassification() {} |
| 315 | + |
| 316 | + // TODO: Determine a scheme to resolve when the label is similar enough. |
| 317 | + InstrType visitBranchInst(BranchInst &BI) { return Illegal; } |
| 318 | + // TODO: Determine a scheme to resolve when the labels are similar enough. |
| 319 | + InstrType visitPHINode(PHINode &PN) { return Illegal; } |
| 320 | + // TODO: Handle allocas. |
| 321 | + InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; } |
| 322 | + // We exclude variable argument instructions since variable arguments |
| 323 | + // requires extra checking of the argument list. |
| 324 | + InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; } |
| 325 | + // We exclude all exception handling cases since they are so context |
| 326 | + // dependent. |
| 327 | + InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; } |
| 328 | + InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; } |
| 329 | + // DebugInfo should be included in the regions, but should not be |
| 330 | + // analyzed for similarity as it has no bearing on the outcome of the |
| 331 | + // program. |
| 332 | + InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; } |
| 333 | + // TODO: Handle GetElementPtrInsts |
| 334 | + InstrType visitGetElementPtrInst(GetElementPtrInst &GEPI) { |
| 335 | + return Illegal; |
| 336 | + } |
| 337 | + // TODO: Handle specific intrinsics. |
| 338 | + InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; } |
| 339 | + // TODO: Handle CallInsts. |
| 340 | + InstrType visitCallInst(CallInst &CI) { return Illegal; } |
| 341 | + // TODO: We do not current handle similarity that changes the control flow. |
| 342 | + InstrType visitInvokeInst(InvokeInst &II) { return Illegal; } |
| 343 | + // TODO: We do not current handle similarity that changes the control flow. |
| 344 | + InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; } |
| 345 | + // TODO: Handle interblock similarity. |
| 346 | + InstrType visitTerminator(Instruction &I) { return Illegal; } |
| 347 | + InstrType visitInstruction(Instruction &I) { return Legal; } |
| 348 | + }; |
| 349 | + |
| 350 | + /// Maps an Instruction to a member of InstrType. |
| 351 | + InstructionClassification InstClassifier; |
| 352 | +}; |
| 353 | + |
| 354 | +} // end namespace IRSimilarity |
| 355 | +} // end namespace llvm |
| 356 | + |
| 357 | +#endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H |
0 commit comments