Skip to content

Commit 998334d

Browse files
author
Kostya Kortchinsky
committed
[scudo][standalone] Change the release loop for efficiency purposes
Summary: On 32-b, the release algo loops multiple times over the freelist for a size class, which lead to a decrease in performance when there were a lot of free blocks. This changes the release functions to loop only once over the freelist, at the cost of using a little bit more memory for the release process: instead of working on one region at a time, we pass the whole memory area covered by all the regions for a given size class, and work on sub-areas of `RegionSize` in this large area. For 64-b, we just have 1 sub-area encompassing the whole region. Of course, not all the sub-areas within that large memory area will belong to the class id we are working on, but those will just be left untouched (which will not add to the RSS during the release process). Reviewers: pcc, cferris, hctim, eugenis Subscribers: llvm-commits, #sanitizers Tags: #sanitizers Differential Revision: https://reviews.llvm.org/D83993
1 parent 3319d05 commit 998334d

File tree

4 files changed

+114
-79
lines changed

4 files changed

+114
-79
lines changed

compiler-rt/lib/scudo/standalone/primary32.h

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -467,28 +467,33 @@ class SizeClassAllocator32 {
467467
}
468468
}
469469

470-
// TODO(kostyak): currently not ideal as we loop over all regions and
471-
// iterate multiple times over the same freelist if a ClassId spans multiple
472-
// regions. But it will have to do for now.
473-
uptr TotalReleasedBytes = 0;
474-
const uptr MaxSize = (RegionSize / BlockSize) * BlockSize;
470+
DCHECK_GT(MinRegionIndex, 0U);
471+
uptr First = 0;
475472
for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) {
476473
if (PossibleRegions[I] - 1U == ClassId) {
477-
const uptr Region = I * RegionSize;
478-
// If the region is the one currently associated to the size-class, we
479-
// only need to release up to CurrentRegionAllocated, MaxSize otherwise.
480-
const uptr Size = (Region == Sci->CurrentRegion)
481-
? Sci->CurrentRegionAllocated
482-
: MaxSize;
483-
ReleaseRecorder Recorder(Region);
484-
releaseFreeMemoryToOS(Sci->FreeList, Region, Size, BlockSize,
485-
&Recorder);
486-
if (Recorder.getReleasedRangesCount() > 0) {
487-
Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
488-
Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
489-
Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
490-
TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes;
491-
}
474+
First = I;
475+
break;
476+
}
477+
}
478+
uptr Last = 0;
479+
for (uptr I = MaxRegionIndex; I >= MinRegionIndex; I--) {
480+
if (PossibleRegions[I] - 1U == ClassId) {
481+
Last = I;
482+
break;
483+
}
484+
}
485+
uptr TotalReleasedBytes = 0;
486+
if (First && Last) {
487+
const uptr Base = First * RegionSize;
488+
const uptr NumberOfRegions = Last - First + 1U;
489+
ReleaseRecorder Recorder(Base);
490+
releaseFreeMemoryToOS(Sci->FreeList, Base, RegionSize, NumberOfRegions,
491+
BlockSize, &Recorder);
492+
if (Recorder.getReleasedRangesCount() > 0) {
493+
Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
494+
Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
495+
Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
496+
TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes;
492497
}
493498
}
494499
Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTime();

compiler-rt/lib/scudo/standalone/primary64.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,7 @@ class SizeClassAllocator64 {
481481

482482
ReleaseRecorder Recorder(Region->RegionBeg, &Region->Data);
483483
releaseFreeMemoryToOS(Region->FreeList, Region->RegionBeg,
484-
Region->AllocatedUser, BlockSize, &Recorder);
484+
Region->AllocatedUser, 1U, BlockSize, &Recorder);
485485

486486
if (Recorder.getReleasedRangesCount() > 0) {
487487
Region->ReleaseInfo.PushedBlocksAtLastRelease =

compiler-rt/lib/scudo/standalone/release.h

Lines changed: 74 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,10 @@ class ReleaseRecorder {
4949
// incremented past MaxValue.
5050
class PackedCounterArray {
5151
public:
52-
PackedCounterArray(uptr NumCounters, uptr MaxValue) : N(NumCounters) {
52+
PackedCounterArray(uptr NumberOfRegions, uptr CountersPerRegion,
53+
uptr MaxValue)
54+
: Regions(NumberOfRegions), NumCounters(CountersPerRegion) {
55+
CHECK_GT(Regions, 0);
5356
CHECK_GT(NumCounters, 0);
5457
CHECK_GT(MaxValue, 0);
5558
constexpr uptr MaxCounterBits = sizeof(*Buffer) * 8UL;
@@ -66,9 +69,10 @@ class PackedCounterArray {
6669
PackingRatioLog = getLog2(PackingRatio);
6770
BitOffsetMask = PackingRatio - 1;
6871

69-
BufferSize = (roundUpTo(N, static_cast<uptr>(1U) << PackingRatioLog) >>
70-
PackingRatioLog) *
71-
sizeof(*Buffer);
72+
SizePerRegion =
73+
roundUpTo(NumCounters, static_cast<uptr>(1U) << PackingRatioLog) >>
74+
PackingRatioLog;
75+
BufferSize = SizePerRegion * sizeof(*Buffer) * Regions;
7276
if (BufferSize <= (StaticBufferCount * sizeof(Buffer[0])) &&
7377
Mutex.tryLock()) {
7478
Buffer = &StaticBuffer[0];
@@ -89,41 +93,45 @@ class PackedCounterArray {
8993

9094
bool isAllocated() const { return !!Buffer; }
9195

92-
uptr getCount() const { return N; }
96+
uptr getCount() const { return NumCounters; }
9397

94-
uptr get(uptr I) const {
95-
DCHECK_LT(I, N);
98+
uptr get(uptr Region, uptr I) const {
99+
DCHECK_LT(Region, Regions);
100+
DCHECK_LT(I, NumCounters);
96101
const uptr Index = I >> PackingRatioLog;
97102
const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
98-
return (Buffer[Index] >> BitOffset) & CounterMask;
103+
return (Buffer[Region * SizePerRegion + Index] >> BitOffset) & CounterMask;
99104
}
100105

101-
void inc(uptr I) const {
102-
DCHECK_LT(get(I), CounterMask);
106+
void inc(uptr Region, uptr I) const {
107+
DCHECK_LT(get(Region, I), CounterMask);
103108
const uptr Index = I >> PackingRatioLog;
104109
const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
105110
DCHECK_LT(BitOffset, SCUDO_WORDSIZE);
106-
Buffer[Index] += static_cast<uptr>(1U) << BitOffset;
111+
Buffer[Region * SizePerRegion + Index] += static_cast<uptr>(1U)
112+
<< BitOffset;
107113
}
108114

109-
void incRange(uptr From, uptr To) const {
115+
void incRange(uptr Region, uptr From, uptr To) const {
110116
DCHECK_LE(From, To);
111-
const uptr Top = Min(To + 1, N);
117+
const uptr Top = Min(To + 1, NumCounters);
112118
for (uptr I = From; I < Top; I++)
113-
inc(I);
119+
inc(Region, I);
114120
}
115121

116122
uptr getBufferSize() const { return BufferSize; }
117123

118-
static const uptr StaticBufferCount = 1024U;
124+
static const uptr StaticBufferCount = 2048U;
119125

120126
private:
121-
const uptr N;
127+
const uptr Regions;
128+
const uptr NumCounters;
122129
uptr CounterSizeBitsLog;
123130
uptr CounterMask;
124131
uptr PackingRatioLog;
125132
uptr BitOffsetMask;
126133

134+
uptr SizePerRegion;
127135
uptr BufferSize;
128136
uptr *Buffer;
129137

@@ -169,7 +177,8 @@ template <class ReleaseRecorderT> class FreePagesRangeTracker {
169177
template <class TransferBatchT, class ReleaseRecorderT>
170178
NOINLINE void
171179
releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
172-
uptr Size, uptr BlockSize, ReleaseRecorderT *Recorder) {
180+
uptr RegionSize, uptr NumberOfRegions, uptr BlockSize,
181+
ReleaseRecorderT *Recorder) {
173182
const uptr PageSize = getPageSizeCached();
174183

175184
// Figure out the number of chunks per page and whether we can take a fast
@@ -206,13 +215,15 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
206215
}
207216
}
208217

209-
const uptr PagesCount = roundUpTo(Size, PageSize) / PageSize;
210-
PackedCounterArray Counters(PagesCount, FullPagesBlockCountMax);
218+
const uptr PagesCount = roundUpTo(RegionSize, PageSize) / PageSize;
219+
PackedCounterArray Counters(NumberOfRegions, PagesCount,
220+
FullPagesBlockCountMax);
211221
if (!Counters.isAllocated())
212222
return;
213223

214224
const uptr PageSizeLog = getLog2(PageSize);
215-
const uptr RoundedSize = PagesCount << PageSizeLog;
225+
const uptr RoundedRegionSize = PagesCount << PageSizeLog;
226+
const uptr RoundedSize = NumberOfRegions * RoundedRegionSize;
216227

217228
// Iterate over free chunks and count how many free chunks affect each
218229
// allocated page.
@@ -228,14 +239,17 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
228239
for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) {
229240
const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base;
230241
// This takes care of P < Base and P >= Base + RoundedSize.
231-
if (P < RoundedSize)
232-
Counters.inc(P >> PageSizeLog);
242+
if (P < RoundedSize) {
243+
const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
244+
const uptr PInRegion = P - RegionIndex * RegionSize;
245+
Counters.inc(RegionIndex, PInRegion >> PageSizeLog);
246+
}
233247
}
234248
}
235-
for (uptr P = Size; P < RoundedSize; P += BlockSize)
236-
Counters.inc(P >> PageSizeLog);
237249
} else {
238250
// In all other cases chunks might affect more than one page.
251+
DCHECK_GE(RegionSize, BlockSize);
252+
const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize;
239253
for (const auto &It : FreeList) {
240254
// See TransferBatch comment above.
241255
const bool IsTransferBatch =
@@ -244,22 +258,35 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
244258
for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) {
245259
const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base;
246260
// This takes care of P < Base and P >= Base + RoundedSize.
247-
if (P < RoundedSize)
248-
Counters.incRange(P >> PageSizeLog,
249-
(P + BlockSize - 1) >> PageSizeLog);
261+
if (P < RoundedSize) {
262+
const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
263+
uptr PInRegion = P - RegionIndex * RegionSize;
264+
Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
265+
(PInRegion + BlockSize - 1) >> PageSizeLog);
266+
// The last block in a region might straddle a page, so if it's
267+
// free, we mark the following "pretend" memory block(s) as free.
268+
if (PInRegion == LastBlockInRegion) {
269+
PInRegion += BlockSize;
270+
while (PInRegion < RoundedRegionSize) {
271+
Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
272+
(PInRegion + BlockSize - 1) >> PageSizeLog);
273+
PInRegion += BlockSize;
274+
}
275+
}
276+
}
250277
}
251278
}
252-
for (uptr P = Size; P < RoundedSize; P += BlockSize)
253-
Counters.incRange(P >> PageSizeLog, (P + BlockSize - 1) >> PageSizeLog);
254279
}
255280

256281
// Iterate over pages detecting ranges of pages with chunk Counters equal
257282
// to the expected number of chunks for the particular page.
258283
FreePagesRangeTracker<ReleaseRecorderT> RangeTracker(Recorder);
259284
if (SameBlockCountPerPage) {
260285
// Fast path, every page has the same number of chunks affecting it.
261-
for (uptr I = 0; I < Counters.getCount(); I++)
262-
RangeTracker.processNextPage(Counters.get(I) == FullPagesBlockCountMax);
286+
for (uptr I = 0; I < NumberOfRegions; I++)
287+
for (uptr J = 0; J < PagesCount; J++)
288+
RangeTracker.processNextPage(Counters.get(I, J) ==
289+
FullPagesBlockCountMax);
263290
} else {
264291
// Slow path, go through the pages keeping count how many chunks affect
265292
// each page.
@@ -270,23 +297,25 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
270297
// except the first and the last one) and then the last chunk size, adding
271298
// up the number of chunks on the current page and checking on every step
272299
// whether the page boundary was crossed.
273-
uptr PrevPageBoundary = 0;
274-
uptr CurrentBoundary = 0;
275-
for (uptr I = 0; I < Counters.getCount(); I++) {
276-
const uptr PageBoundary = PrevPageBoundary + PageSize;
277-
uptr BlocksPerPage = Pn;
278-
if (CurrentBoundary < PageBoundary) {
279-
if (CurrentBoundary > PrevPageBoundary)
280-
BlocksPerPage++;
281-
CurrentBoundary += Pnc;
300+
for (uptr I = 0; I < NumberOfRegions; I++) {
301+
uptr PrevPageBoundary = 0;
302+
uptr CurrentBoundary = 0;
303+
for (uptr J = 0; J < PagesCount; J++) {
304+
const uptr PageBoundary = PrevPageBoundary + PageSize;
305+
uptr BlocksPerPage = Pn;
282306
if (CurrentBoundary < PageBoundary) {
283-
BlocksPerPage++;
284-
CurrentBoundary += BlockSize;
307+
if (CurrentBoundary > PrevPageBoundary)
308+
BlocksPerPage++;
309+
CurrentBoundary += Pnc;
310+
if (CurrentBoundary < PageBoundary) {
311+
BlocksPerPage++;
312+
CurrentBoundary += BlockSize;
313+
}
285314
}
286-
}
287-
PrevPageBoundary = PageBoundary;
315+
PrevPageBoundary = PageBoundary;
288316

289-
RangeTracker.processNextPage(Counters.get(I) == BlocksPerPage);
317+
RangeTracker.processNextPage(Counters.get(I, J) == BlocksPerPage);
318+
}
290319
}
291320
}
292321
RangeTracker.finish();

compiler-rt/lib/scudo/standalone/tests/release_test.cpp

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,14 @@
2121
TEST(ScudoReleaseTest, PackedCounterArray) {
2222
for (scudo::uptr I = 0; I < SCUDO_WORDSIZE; I++) {
2323
// Various valid counter's max values packed into one word.
24-
scudo::PackedCounterArray Counters2N(1, 1UL << I);
24+
scudo::PackedCounterArray Counters2N(1U, 1U, 1UL << I);
2525
EXPECT_EQ(sizeof(scudo::uptr), Counters2N.getBufferSize());
2626
// Check the "all bit set" values too.
27-
scudo::PackedCounterArray Counters2N1_1(1, ~0UL >> I);
27+
scudo::PackedCounterArray Counters2N1_1(1U, 1U, ~0UL >> I);
2828
EXPECT_EQ(sizeof(scudo::uptr), Counters2N1_1.getBufferSize());
2929
// Verify the packing ratio, the counter is Expected to be packed into the
3030
// closest power of 2 bits.
31-
scudo::PackedCounterArray Counters(SCUDO_WORDSIZE, 1UL << I);
31+
scudo::PackedCounterArray Counters(1U, SCUDO_WORDSIZE, 1UL << I);
3232
EXPECT_EQ(sizeof(scudo::uptr) * scudo::roundUpToPowerOfTwo(I + 1),
3333
Counters.getBufferSize());
3434
}
@@ -38,19 +38,20 @@ TEST(ScudoReleaseTest, PackedCounterArray) {
3838
// Make sure counters request one memory page for the buffer.
3939
const scudo::uptr NumCounters =
4040
(scudo::getPageSizeCached() / 8) * (SCUDO_WORDSIZE >> I);
41-
scudo::PackedCounterArray Counters(NumCounters, 1UL << ((1UL << I) - 1));
42-
Counters.inc(0);
41+
scudo::PackedCounterArray Counters(1U, NumCounters,
42+
1UL << ((1UL << I) - 1));
43+
Counters.inc(0U, 0U);
4344
for (scudo::uptr C = 1; C < NumCounters - 1; C++) {
44-
EXPECT_EQ(0UL, Counters.get(C));
45-
Counters.inc(C);
46-
EXPECT_EQ(1UL, Counters.get(C - 1));
45+
EXPECT_EQ(0UL, Counters.get(0U, C));
46+
Counters.inc(0U, C);
47+
EXPECT_EQ(1UL, Counters.get(0U, C - 1));
4748
}
48-
EXPECT_EQ(0UL, Counters.get(NumCounters - 1));
49-
Counters.inc(NumCounters - 1);
49+
EXPECT_EQ(0UL, Counters.get(0U, NumCounters - 1));
50+
Counters.inc(0U, NumCounters - 1);
5051
if (I > 0) {
51-
Counters.incRange(0, NumCounters - 1);
52+
Counters.incRange(0u, 0U, NumCounters - 1);
5253
for (scudo::uptr C = 0; C < NumCounters; C++)
53-
EXPECT_EQ(2UL, Counters.get(C));
54+
EXPECT_EQ(2UL, Counters.get(0U, C));
5455
}
5556
}
5657
}
@@ -190,7 +191,7 @@ template <class SizeClassMap> void testReleaseFreeMemoryToOS() {
190191

191192
// Release the memory.
192193
ReleasedPagesRecorder Recorder;
193-
releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, BlockSize,
194+
releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, 1U, BlockSize,
194195
&Recorder);
195196

196197
// Verify that there are no released pages touched by used chunks and all

0 commit comments

Comments
 (0)