Skip to content

Commit b04c1a9

Browse files
[IRSim] Adding IR Instruction Mapper
This introduces the IRInstructionMapper, and the associated wrapper for instructions, IRInstructionData, that maps IR level Instructions to unsigned integers. Mapping is done mainly by using the "isSameOperationAs" comparison between two instructions. If it returns true, the opcode, result type, and operand types of the instruction are used to hash the instruction with an unsigned integer. The mapper accepts instruction ranges, and adds each resulting integer to a list, and each wrapped instruction to a separate list. At present, branches and phi nodes are not mapped, and exception handling is illegal. Debug instructions are not considered. The different mapping schemes are tested in unittests/Analysis/IRSimilarityIdentifierTest.cpp. Differential Revision: https://reviews.llvm.org/D86968
1 parent f4ea0f9 commit b04c1a9

File tree

5 files changed

+1689
-0
lines changed

5 files changed

+1689
-0
lines changed
Lines changed: 357 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,357 @@
1+
//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// \file
10+
// Interface file for the IRSimilarityIdentifier for identifying similarities in
11+
// IR including the IRInstructionMapper, which maps an Instruction to unsigned
12+
// integers.
13+
//
14+
// Two sequences of instructions are called "similar" if they perform the same
15+
// series of operations for all inputs.
16+
//
17+
// \code
18+
// %1 = add i32 %a, 10
19+
// %2 = add i32 %a, %1
20+
// %3 = icmp slt i32 %1, %2
21+
// \endcode
22+
//
23+
// and
24+
//
25+
// \code
26+
// %1 = add i32 11, %a
27+
// %2 = sub i32 %a, %1
28+
// %3 = icmp sgt i32 %2, %1
29+
// \endcode
30+
//
31+
// ultimately have the same result, even if the inputs and structure are
32+
// slightly different.
33+
//
34+
// For instructions, we do not worry about operands that do not have fixed
35+
// semantic meaning to the program. We consider the opcode that the instruction
36+
// has, the types, parameters, and extra information such as the function name,
37+
// or comparison predicate. These are used to create a hash to map instructions
38+
// to integers to be used in similarity matching in sequences of instructions.
39+
//
40+
//===----------------------------------------------------------------------===//
41+
42+
#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
43+
#define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
44+
45+
#include "llvm/IR/InstVisitor.h"
46+
#include "llvm/IR/Instructions.h"
47+
#include "llvm/IR/Module.h"
48+
#include "llvm/Support/Allocator.h"
49+
50+
namespace llvm {
51+
namespace IRSimilarity {
52+
53+
/// This represents what is and is not supported when finding similarity in
54+
/// Instructions.
55+
///
56+
/// Legal Instructions are considered when looking at similarity between
57+
/// Instructions.
58+
///
59+
/// Illegal Instructions cannot be considered when looking for similarity
60+
/// between Instructions. They act as boundaries between similarity regions.
61+
///
62+
/// Invisible Instructions are skipped over during analysis.
63+
// TODO: Shared with MachineOutliner
64+
enum InstrType {
  Legal,    // Considered when looking at similarity between Instructions.
  Illegal,  // Never considered; acts as a boundary between similarity regions.
  Invisible // Skipped over during analysis.
};
65+
66+
/// This provides the utilities for hashing an Instruction to an unsigned
67+
/// integer. Two IRInstructionDatas produce the same hash value when their
68+
/// underlying Instructions perform the same operation (even if they don't have
69+
/// the same input operands.)
70+
/// As a more concrete example, consider the following:
71+
///
72+
/// \code
73+
/// %add1 = add i32 %a, %b
74+
/// %add2 = add i32 %c, %d
75+
/// %add3 = add i64 %e, %f
76+
/// \endcode
77+
///
78+
/// Then the IRInstructionData wrappers for these Instructions may be hashed like
79+
/// so:
80+
///
81+
/// \code
82+
/// ; These two adds have the same types and operand types, so they hash to the
83+
/// ; same number.
84+
/// %add1 = add i32 %a, %b ; Hash: 1
85+
/// %add2 = add i32 %c, %d ; Hash: 1
86+
/// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
87+
/// ; it hashes to a different number.
88+
/// %add3 = add i64 %e, %f; Hash: 2
89+
/// \endcode
90+
///
91+
///
92+
/// This hashing scheme will be used to represent the program as a very long
93+
/// string. This string can then be placed in a data structure which can be used
94+
/// for similarity queries.
95+
///
96+
/// TODO: Handle types of Instructions which can be equal even with different
97+
/// operands. (E.g. comparisons with swapped predicates.)
98+
/// TODO: Handle CallInsts, which are only checked for function type
99+
/// by \ref isSameOperationAs.
100+
/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
101+
/// exact same, and some do not.
102+
struct IRInstructionData : ilist_node<IRInstructionData> {

  /// The source Instruction that is being wrapped.
  Instruction *Inst = nullptr;
  /// The values of the operands in the Instruction.
  SmallVector<Value *, 4> OperVals;
  /// The legality of the wrapped instruction. This is informed by InstrType,
  /// and is used when checking when two instructions are considered similar.
  /// If either instruction is not legal, the instructions are automatically not
  /// considered similar.
  ///
  /// Given an in-class initializer, like \ref Inst, so the flag never holds an
  /// indeterminate value.
  bool Legal = false;

  /// Gather the information that is difficult to gather for an Instruction, or
  /// is changed. i.e. the operands of an Instruction and the Types of those
  /// operands. This extra information allows for similarity matching to make
  /// assertions that allow for more flexibility when checking for whether an
  /// Instruction performs the same operation.
  ///
  /// \param [in] I - The Instruction to wrap.
  /// \param [in] Legality - The legality of \p I (see \ref Legal).
  IRInstructionData(Instruction &I, bool Legality);

  /// Hashes \p ID based on its opcode, result type, and operand types.
  /// Two IRInstructionData instances produce the same hash when they perform
  /// the same operation.
  ///
  /// As a simple example, consider the following instructions.
  ///
  /// \code
  /// %add1 = add i32 %x1, %y1
  /// %add2 = add i32 %x2, %y2
  ///
  /// %sub = sub i32 %x1, %y1
  ///
  /// %add_i64 = add i64 %x2, %y2
  /// \endcode
  ///
  /// Because the first two adds operate on the same types, and are performing
  /// the same action, they will be hashed to the same value.
  ///
  /// However, the subtraction instruction is not the same as an addition, and
  /// will be hashed to a different value.
  ///
  /// Finally, the last add has a different type compared to the first two add
  /// instructions, so it will also be hashed to a different value than any of
  /// the previous instructions.
  ///
  /// \param [in] ID - The IRInstructionData instance to be hashed. Its
  /// \ref Inst and every entry of \ref OperVals are dereferenced, so they must
  /// be non-null.
  /// \returns A hash_value of the IRInstructionData.
  friend hash_code hash_value(const IRInstructionData &ID) {
    // Only the operand *types* take part in the hash, never the operand
    // values themselves.
    SmallVector<Type *, 4> OperTypes;
    for (Value *V : ID.OperVals)
      OperTypes.push_back(V->getType());

    return hash_combine(
        hash_value(ID.Inst->getOpcode()), hash_value(ID.Inst->getType()),
        hash_combine_range(OperTypes.begin(), OperTypes.end()));
  }
};
158+
159+
/// Compare one IRInstructionData class to another IRInstructionData class for
160+
/// whether they are performing the same operation, and can be mapped to the
161+
/// same value. For regular instructions if the hash value is the same, then
162+
/// they will also be close.
163+
///
164+
/// \param A - The first IRInstructionData class to compare
165+
/// \param B - The second IRInstructionData class to compare
166+
/// \returns true if \p A and \p B are similar enough to be mapped to the same
167+
/// value.
168+
// Declaration only; the definition is not part of this header. It is used by
// IRInstructionDataTraits::isEqual as the key-equality predicate.
bool isClose(const IRInstructionData &A, const IRInstructionData &B);
169+
170+
/// Key traits that let a DenseMap be keyed on IRInstructionData pointers while
/// hashing and comparing the pointed-to IRInstructionData (via hash_value and
/// isClose) instead of the pointer value itself.
struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {
  /// The empty-bucket sentinel is the null pointer.
  static inline IRInstructionData *getEmptyKey() { return nullptr; }

  /// The tombstone sentinel is the pointer value produced by casting -1.
  static inline IRInstructionData *getTombstoneKey() {
    return reinterpret_cast<IRInstructionData *>(-1);
  }

  /// Hashes the IRInstructionData that \p E points to.
  ///
  /// \param E - The key to hash; asserted to be non-null.
  /// \returns The hash_value of \p *E, converted to unsigned.
  static unsigned getHashValue(const IRInstructionData *E) {
    using llvm::hash_value;
    assert(E && "IRInstructionData is a nullptr?");
    const hash_code Hash = hash_value(*E);
    return Hash;
  }

  /// Compares two keys. If either key is a sentinel, the pointers themselves
  /// are compared, since a sentinel must never be dereferenced. Otherwise the
  /// pointees are compared with isClose.
  ///
  /// \param LHS - The first key.
  /// \param RHS - The second key.
  /// \returns true if the two keys should share a bucket.
  static bool isEqual(const IRInstructionData *LHS,
                      const IRInstructionData *RHS) {
    auto IsSentinel = [](const IRInstructionData *Key) {
      return Key == getEmptyKey() || Key == getTombstoneKey();
    };

    if (!IsSentinel(LHS) && !IsSentinel(RHS)) {
      assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?");
      return isClose(*LHS, *RHS);
    }

    return LHS == RHS;
  }
};
192+
193+
/// Helper struct for converting the Instructions in a Module into a vector of
194+
/// unsigned integers. This vector of unsigned integers can be thought of as a
195+
/// "numeric string". This numeric string can then be queried by, for example,
196+
/// data structures that find repeated substrings.
197+
///
198+
/// This hashing is done per BasicBlock in the module. To hash Instructions
199+
/// based off of their operations, each Instruction is wrapped in an
200+
/// IRInstructionData struct. The unsigned integer for an IRInstructionData
201+
/// depends on:
202+
/// - The hash provided by the IRInstructionData.
203+
/// - Which member of InstrType the IRInstructionData is classified as.
204+
/// See InstrType for more details on the possible classifications, and how they
205+
/// manifest in the numeric string.
206+
///
207+
/// The numeric string for an individual BasicBlock is terminated by a unique
208+
/// unsigned integer. This prevents data structures which rely on repetition
209+
/// from matching across BasicBlocks. (For example, the SuffixTree.)
210+
/// As a concrete example, if we have the following two BasicBlocks:
211+
/// \code
212+
/// bb0:
213+
/// %add1 = add i32 %a, %b
214+
/// %add2 = add i32 %c, %d
215+
/// %add3 = add i64 %e, %f
216+
/// bb1:
217+
/// %sub = sub i32 %c, %d
/// %add4 = add i32 %c, %d
218+
/// \endcode
219+
/// We may hash the Instructions like this (via IRInstructionData):
220+
/// \code
221+
/// bb0:
222+
/// %add1 = add i32 %a, %b ; Hash: 1
223+
/// %add2 = add i32 %c, %d; Hash: 1
224+
/// %add3 = add i64 %e, %f; Hash: 2
225+
/// bb1:
226+
/// %sub = sub i32 %c, %d; Hash: 3
227+
/// %add4 = add i32 %c, %d ; Hash: 1
228+
/// \endcode
229+
/// And produce a "numeric string representation" like so:
230+
/// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
231+
///
232+
/// TODO: This is very similar to the MachineOutliner, and should be
233+
/// consolidated into the same interface.
234+
struct IRInstructionMapper {
235+
/// The starting illegal instruction number to map to.
236+
///
237+
/// Set to -3 for compatibility with DenseMapInfo<unsigned>.
238+
unsigned IllegalInstrNumber = static_cast<unsigned>(-3);
239+
240+
/// The next available integer to assign to a legal Instruction to.
241+
unsigned LegalInstrNumber = 0;
242+
243+
/// Correspondence from IRInstructionData to unsigned integers.
244+
DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
245+
InstructionIntegerMap;
246+
247+
/// Set if we added an illegal number in the previous step.
248+
/// Since each illegal number is unique, we only need one of them between
249+
/// each range of legal numbers. This lets us make sure we don't add more
250+
/// than one illegal number per range.
251+
bool AddedIllegalLastTime = false;
252+
253+
/// Marks whether we found a illegal instruction in the previous step.
254+
bool CanCombineWithPrevInstr = false;
255+
256+
/// Marks whether we have found a set of instructions that is long enough
257+
/// to be considered for similarity.
258+
bool HaveLegalRange = false;
259+
260+
/// This allocator pointer is in charge of holding on to the IRInstructionData
261+
/// so it is not deallocated until whatever external tool is using it is done
262+
/// with the information.
263+
BumpPtrAllocator *InstDataAllocator = nullptr;
264+
265+
/// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
266+
/// determined by \p InstrType. Two Instructions are mapped to the same value
267+
/// if they are close as defined by the InstructionData class above.
268+
///
269+
/// \param [in] BB - The BasicBlock to be mapped to integers.
270+
/// \param [in,out] InstrList - Vector of IRInstructionData to append to.
271+
/// \param [in,out] IntegerMapping - Vector of unsigned integers to append to.
272+
void convertToUnsignedVec(BasicBlock &BB,
273+
std::vector<IRInstructionData *> &InstrList,
274+
std::vector<unsigned> &IntegerMapping);
275+
276+
/// Maps an Instruction to a legal integer.
277+
///
278+
/// \param [in] It - The Instruction to be mapped to an integer.
279+
/// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
280+
/// append to.
281+
/// \param [in,out] InstrList - Vector of InstructionData to append
282+
/// to. \returns The integer \p It was mapped to.
283+
unsigned mapToLegalUnsigned(BasicBlock::iterator &It,
284+
std::vector<unsigned> &IntegerMappingForBB,
285+
std::vector<IRInstructionData *> &InstrListForBB);
286+
287+
/// Maps an Instruction to an illegal integer.
288+
///
289+
/// \param [in] It - The \p Instruction to be mapped to an integer.
290+
/// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
291+
/// append to.
292+
/// \param [in,out] InstrList - Vector of IRInstructionData to append to.
293+
/// \param End - true if creating a dummy IRInstructionData at the end of a
294+
/// basic block.
295+
/// \returns The integer \p It was mapped to.
296+
unsigned mapToIllegalUnsigned(
297+
BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
298+
std::vector<IRInstructionData *> &InstrListForBB, bool End = false);
299+
300+
IRInstructionMapper(BumpPtrAllocator *IDA) : InstDataAllocator(IDA) {
301+
// Make sure that the implementation of DenseMapInfo<unsigned> hasn't
302+
// changed.
303+
assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) &&
304+
"DenseMapInfo<unsigned>'s empty key isn't -1!");
305+
assert(DenseMapInfo<unsigned>::getTombstoneKey() ==
306+
static_cast<unsigned>(-2) &&
307+
"DenseMapInfo<unsigned>'s tombstone key isn't -2!");
308+
}
309+
310+
/// Custom InstVisitor to classify different instructions for whether it can
311+
/// be analyzed for similarity.
312+
struct InstructionClassification
313+
: public InstVisitor<InstructionClassification, InstrType> {
314+
InstructionClassification() {}
315+
316+
// TODO: Determine a scheme to resolve when the label is similar enough.
317+
InstrType visitBranchInst(BranchInst &BI) { return Illegal; }
318+
// TODO: Determine a scheme to resolve when the labels are similar enough.
319+
InstrType visitPHINode(PHINode &PN) { return Illegal; }
320+
// TODO: Handle allocas.
321+
InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; }
322+
// We exclude variable argument instructions since variable arguments
323+
// requires extra checking of the argument list.
324+
InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; }
325+
// We exclude all exception handling cases since they are so context
326+
// dependent.
327+
InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
328+
InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
329+
// DebugInfo should be included in the regions, but should not be
330+
// analyzed for similarity as it has no bearing on the outcome of the
331+
// program.
332+
InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
333+
// TODO: Handle GetElementPtrInsts
334+
InstrType visitGetElementPtrInst(GetElementPtrInst &GEPI) {
335+
return Illegal;
336+
}
337+
// TODO: Handle specific intrinsics.
338+
InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; }
339+
// TODO: Handle CallInsts.
340+
InstrType visitCallInst(CallInst &CI) { return Illegal; }
341+
// TODO: We do not current handle similarity that changes the control flow.
342+
InstrType visitInvokeInst(InvokeInst &II) { return Illegal; }
343+
// TODO: We do not current handle similarity that changes the control flow.
344+
InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; }
345+
// TODO: Handle interblock similarity.
346+
InstrType visitTerminator(Instruction &I) { return Illegal; }
347+
InstrType visitInstruction(Instruction &I) { return Legal; }
348+
};
349+
350+
/// Maps an Instruction to a member of InstrType.
351+
InstructionClassification InstClassifier;
352+
};
353+
354+
} // end namespace IRSimilarity
355+
} // end namespace llvm
356+
357+
#endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H

llvm/lib/Analysis/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ add_llvm_component_library(LLVMAnalysis
5454
GlobalsModRef.cpp
5555
GuardUtils.cpp
5656
HeatUtils.cpp
57+
IRSimilarityIdentifier.cpp
5758
IVDescriptors.cpp
5859
IVUsers.cpp
5960
IndirectCallPromotionAnalysis.cpp

0 commit comments

Comments
 (0)