Commit 1a4d760

percpu: implement asynchronous chunk population

The percpu allocator now supports atomic allocations by only allocating from
already populated areas, but the mechanism to ensure that there is an adequate
amount of populated areas was missing.

This patch expands pcpu_balance_work so that, in addition to freeing excess
free chunks, it also populates chunks to maintain an adequate level of
populated areas.  pcpu_alloc() schedules pcpu_balance_work if the amount of
free populated areas is too low or after an atomic allocation failure.

* PERCPU_DYNAMIC_RESERVE is increased by two pages to account for
  PCPU_EMPTY_POP_PAGES_LOW.

* pcpu_async_enabled is added to gate both async jobs -
  chunk->map_extend_work and pcpu_balance_work - so that we don't end up
  scheduling them while the needed subsystems aren't up yet.

Signed-off-by: Tejun Heo <[email protected]>

1 parent fe6bd8c commit 1a4d760
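As a rough illustration of the low-watermark gating described above, here is a
minimal userspace C model (not kernel code): the constant and helper names
mirror the patch, while printf() and a plain bool stand in for schedule_work()
and the subsys_initcall that flips the gate.

/*
 * Userspace model of the scheduling gate and LOW watermark.
 * printf() is a stand-in for schedule_work(&pcpu_balance_work).
 */
#include <stdio.h>
#include <stdbool.h>

#define PCPU_EMPTY_POP_PAGES_LOW  2
#define PCPU_EMPTY_POP_PAGES_HIGH 4

static int pcpu_nr_empty_pop_pages = PCPU_EMPTY_POP_PAGES_HIGH;
static bool pcpu_async_enabled;         /* flipped once workqueues are up */

static void pcpu_schedule_balance_work(void)
{
        if (pcpu_async_enabled)
                printf("balance work scheduled\n");     /* schedule_work() stand-in */
}

/* bookkeeping after an allocation consumed @occ_pages populated pages */
static void after_alloc(int occ_pages)
{
        pcpu_nr_empty_pop_pages -= occ_pages;
        if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
                pcpu_schedule_balance_work();
}

int main(void)
{
        after_alloc(1);                 /* 4 -> 3: still above LOW, nothing queued */
        pcpu_async_enabled = true;      /* percpu_enable_async() stand-in */
        after_alloc(2);                 /* 3 -> 1: below LOW, balance work queued */
        return 0;
}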

File tree: 2 files changed, +115 -6 lines changed

include/linux/percpu.h

Lines changed: 2 additions & 2 deletions
@@ -48,9 +48,9 @@
  * intelligent way to determine this would be nice.
  */
 #if BITS_PER_LONG > 32
-#define PERCPU_DYNAMIC_RESERVE          (20 << 10)
+#define PERCPU_DYNAMIC_RESERVE          (28 << 10)
 #else
-#define PERCPU_DYNAMIC_RESERVE          (12 << 10)
+#define PERCPU_DYNAMIC_RESERVE          (20 << 10)
 #endif
 
 extern void *pcpu_base_addr;
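On 64-bit the reserve grows from 20 KiB to 28 KiB, which matches the two extra
pages called out in the commit message if PAGE_SIZE is the common 4 KiB (an
assumption; some architectures use larger pages). A trivial sanity check of
that arithmetic:

/* The reserve bump equals PCPU_EMPTY_POP_PAGES_LOW pages (assuming 4 KiB pages). */
#include <assert.h>

#define ASSUMED_PAGE_SIZE               (4 << 10)
#define PCPU_EMPTY_POP_PAGES_LOW        2

int main(void)
{
        /* 28 KiB - 20 KiB == 8 KiB == 2 pages */
        assert((28 << 10) - (20 << 10) ==
               PCPU_EMPTY_POP_PAGES_LOW * ASSUMED_PAGE_SIZE);
        return 0;
}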

mm/percpu.c

Lines changed: 113 additions & 4 deletions
@@ -78,6 +78,8 @@
 #define PCPU_DFL_MAP_ALLOC      16      /* start a map with 16 ents */
 #define PCPU_ATOMIC_MAP_MARGIN_LOW      32
 #define PCPU_ATOMIC_MAP_MARGIN_HIGH     64
+#define PCPU_EMPTY_POP_PAGES_LOW        2
+#define PCPU_EMPTY_POP_PAGES_HIGH       4
 
 #ifdef CONFIG_SMP
 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
@@ -168,9 +170,22 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
  */
 static int pcpu_nr_empty_pop_pages;
 
-/* balance work is used to populate or destroy chunks asynchronously */
+/*
+ * Balance work is used to populate or destroy chunks asynchronously.  We
+ * try to keep the number of populated free pages between
+ * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
+ * empty chunk.
+ */
 static void pcpu_balance_workfn(struct work_struct *work);
 static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
+static bool pcpu_async_enabled __read_mostly;
+static bool pcpu_atomic_alloc_failed;
+
+static void pcpu_schedule_balance_work(void)
+{
+        if (pcpu_async_enabled)
+                schedule_work(&pcpu_balance_work);
+}
 
 static bool pcpu_addr_in_first_chunk(void *addr)
 {
@@ -386,7 +401,8 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
                 margin = 3;
 
                 if (chunk->map_alloc <
-                    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW)
+                    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
+                    pcpu_async_enabled)
                         schedule_work(&chunk->map_extend_work);
         } else {
                 margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
@@ -1005,6 +1021,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
         if (chunk != pcpu_reserved_chunk)
                 pcpu_nr_empty_pop_pages -= occ_pages;
 
+        if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
+                pcpu_schedule_balance_work();
+
         /* clear the areas and return address relative to base address */
         for_each_possible_cpu(cpu)
                 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
@@ -1023,6 +1042,11 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
                 if (!--warn_limit)
                         pr_info("PERCPU: limit reached, disable warning\n");
         }
+        if (is_atomic) {
+                /* see the flag handling in pcpu_blance_workfn() */
+                pcpu_atomic_alloc_failed = true;
+                pcpu_schedule_balance_work();
+        }
         return NULL;
 }
 
@@ -1080,7 +1104,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 }
 
 /**
- * pcpu_balance_workfn - reclaim fully free chunks, workqueue function
+ * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
@@ -1090,7 +1114,12 @@ static void pcpu_balance_workfn(struct work_struct *work)
         LIST_HEAD(to_free);
         struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
         struct pcpu_chunk *chunk, *next;
+        int slot, nr_to_pop, ret;
 
+        /*
+         * There's no reason to keep around multiple unused chunks and VM
+         * areas can be scarce.  Destroy all free chunks except for one.
+         */
         mutex_lock(&pcpu_alloc_mutex);
         spin_lock_irq(&pcpu_lock);
 
@@ -1118,6 +1147,74 @@ static void pcpu_balance_workfn(struct work_struct *work)
                 pcpu_destroy_chunk(chunk);
         }
 
+        /*
+         * Ensure there are certain number of free populated pages for
+         * atomic allocs.  Fill up from the most packed so that atomic
+         * allocs don't increase fragmentation.  If atomic allocation
+         * failed previously, always populate the maximum amount.  This
+         * should prevent atomic allocs larger than PAGE_SIZE from keeping
+         * failing indefinitely; however, large atomic allocs are not
+         * something we support properly and can be highly unreliable and
+         * inefficient.
+         */
+retry_pop:
+        if (pcpu_atomic_alloc_failed) {
+                nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+                /* best effort anyway, don't worry about synchronization */
+                pcpu_atomic_alloc_failed = false;
+        } else {
+                nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
+                                  pcpu_nr_empty_pop_pages,
+                                  0, PCPU_EMPTY_POP_PAGES_HIGH);
+        }
+
+        for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+                int nr_unpop = 0, rs, re;
+
+                if (!nr_to_pop)
+                        break;
+
+                spin_lock_irq(&pcpu_lock);
+                list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+                        nr_unpop = pcpu_unit_pages - chunk->nr_populated;
+                        if (nr_unpop)
+                                break;
+                }
+                spin_unlock_irq(&pcpu_lock);
+
+                if (!nr_unpop)
+                        continue;
+
+                /* @chunk can't go away while pcpu_alloc_mutex is held */
+                pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+                        int nr = min(re - rs, nr_to_pop);
+
+                        ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+                        if (!ret) {
+                                nr_to_pop -= nr;
+                                spin_lock_irq(&pcpu_lock);
+                                pcpu_chunk_populated(chunk, rs, rs + nr);
+                                spin_unlock_irq(&pcpu_lock);
+                        } else {
+                                nr_to_pop = 0;
+                        }
+
+                        if (!nr_to_pop)
+                                break;
+                }
+        }
+
+        if (nr_to_pop) {
+                /* ran out of chunks to populate, create a new one and retry */
+                chunk = pcpu_create_chunk();
+                if (chunk) {
+                        spin_lock_irq(&pcpu_lock);
+                        pcpu_chunk_relocate(chunk, -1);
+                        spin_unlock_irq(&pcpu_lock);
+                        goto retry_pop;
+                }
+        }
+
         mutex_unlock(&pcpu_alloc_mutex);
 }
 
@@ -1160,7 +1257,7 @@ void free_percpu(void __percpu *ptr)
 
                 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
                         if (pos != chunk) {
-                                schedule_work(&pcpu_balance_work);
+                                pcpu_schedule_balance_work();
                                 break;
                         }
         }
@@ -2187,3 +2284,15 @@ void __init percpu_init_late(void)
                 spin_unlock_irqrestore(&pcpu_lock, flags);
         }
 }
+
+/*
+ * Percpu allocator is initialized early during boot when neither slab or
+ * workqueue is available.  Plug async management until everything is up
+ * and running.
+ */
+static int __init percpu_enable_async(void)
+{
+        pcpu_async_enabled = true;
+        return 0;
+}
+subsys_initcall(percpu_enable_async);
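For the refill target chosen at retry_pop above, a small userspace model (not
kernel code; clamp() is reimplemented locally, and compute_nr_to_pop() is a
made-up name for the sketch) shows how a prior atomic failure forces a refill
all the way to PCPU_EMPTY_POP_PAGES_HIGH, while the normal path merely tops up
to that watermark.

/*
 * Userspace model of the nr_to_pop computation in pcpu_balance_workfn().
 * The kernel's clamp() is a macro from <linux/kernel.h>; a plain int
 * version is enough here.
 */
#include <stdio.h>
#include <stdbool.h>

#define PCPU_EMPTY_POP_PAGES_HIGH 4

static int clamp_int(int val, int lo, int hi)
{
        return val < lo ? lo : (val > hi ? hi : val);
}

static int compute_nr_to_pop(int empty_pop_pages, bool atomic_alloc_failed)
{
        /* after an atomic failure, always populate the maximum amount */
        if (atomic_alloc_failed)
                return PCPU_EMPTY_POP_PAGES_HIGH;
        /* otherwise just top up to the HIGH watermark */
        return clamp_int(PCPU_EMPTY_POP_PAGES_HIGH - empty_pop_pages,
                         0, PCPU_EMPTY_POP_PAGES_HIGH);
}

int main(void)
{
        printf("%d\n", compute_nr_to_pop(1, false));    /* 3: top up from 1 to 4 */
        printf("%d\n", compute_nr_to_pop(6, false));    /* 0: already above HIGH */
        printf("%d\n", compute_nr_to_pop(3, true));     /* 4: failure forces full refill */
        return 0;
}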
