#define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
#define PCPU_ATOMIC_MAP_MARGIN_LOW	32
#define PCPU_ATOMIC_MAP_MARGIN_HIGH	64
+#define PCPU_EMPTY_POP_PAGES_LOW	2
+#define PCPU_EMPTY_POP_PAGES_HIGH	4

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
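The two new constants act as watermarks on the pool of empty populated pages kept in reserve for atomic allocations: once the pool drops below PCPU_EMPTY_POP_PAGES_LOW, the balance work is scheduled and refills it toward PCPU_EMPTY_POP_PAGES_HIGH. The standalone sketch below is not kernel code; the pages_to_populate() helper and the clamp() macro are illustrative stand-ins that only reproduce the arithmetic used later in pcpu_balance_workfn().

    #include <assert.h>
    #include <stdio.h>

    #define PCPU_EMPTY_POP_PAGES_LOW	2
    #define PCPU_EMPTY_POP_PAGES_HIGH	4

    /* userspace stand-in for the kernel's clamp() */
    #define clamp(v, lo, hi)	((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

    /* how many pages the balance work would try to populate */
    static int pages_to_populate(int empty_pop_pages, int atomic_alloc_failed)
    {
            if (atomic_alloc_failed)	/* refill the full reserve */
                    return PCPU_EMPTY_POP_PAGES_HIGH;
            return clamp(PCPU_EMPTY_POP_PAGES_HIGH - empty_pop_pages,
                         0, PCPU_EMPTY_POP_PAGES_HIGH);
    }

    int main(void)
    {
            assert(pages_to_populate(1, 0) == 3);	/* below LOW: top up to HIGH */
            assert(pages_to_populate(5, 0) == 0);	/* above HIGH: nothing to do */
            assert(pages_to_populate(3, 1) == 4);	/* after a failure: populate max */
            printf("watermark arithmetic OK\n");
            return 0;
    }
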
@@ -168,9 +170,22 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 */
static int pcpu_nr_empty_pop_pages;

-/* balance work is used to populate or destroy chunks asynchronously */
+/*
+ * Balance work is used to populate or destroy chunks asynchronously.  We
+ * try to keep the number of populated free pages between
+ * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
+ * empty chunk.
+ */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
+static bool pcpu_async_enabled __read_mostly;
+static bool pcpu_atomic_alloc_failed;
+
+static void pcpu_schedule_balance_work(void)
+{
+	if (pcpu_async_enabled)
+		schedule_work(&pcpu_balance_work);
+}

static bool pcpu_addr_in_first_chunk(void *addr)
{
@@ -386,7 +401,8 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
		margin = 3;

		if (chunk->map_alloc <
-		    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW)
+		    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
+		    pcpu_async_enabled)
			schedule_work(&chunk->map_extend_work);
	} else {
		margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
@@ -1005,6 +1021,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
	if (chunk != pcpu_reserved_chunk)
		pcpu_nr_empty_pop_pages -= occ_pages;

+	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
+		pcpu_schedule_balance_work();
+
	/* clear the areas and return address relative to base address */
	for_each_possible_cpu(cpu)
		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
@@ -1023,6 +1042,11 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
		if (!--warn_limit)
			pr_info("PERCPU: limit reached, disable warning\n");
	}
+	if (is_atomic) {
+		/* see the flag handling in pcpu_balance_workfn() */
+		pcpu_atomic_alloc_failed = true;
+		pcpu_schedule_balance_work();
+	}
	return NULL;
}

@@ -1080,7 +1104,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
}

/**
- * pcpu_balance_workfn - reclaim fully free chunks, workqueue function
+ * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Reclaim all fully free chunks except for the first one.
@@ -1090,7 +1114,12 @@ static void pcpu_balance_workfn(struct work_struct *work)
	LIST_HEAD(to_free);
	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
	struct pcpu_chunk *chunk, *next;
+	int slot, nr_to_pop, ret;

+	/*
+	 * There's no reason to keep around multiple unused chunks and VM
+	 * areas can be scarce.  Destroy all free chunks except for one.
+	 */
	mutex_lock(&pcpu_alloc_mutex);
	spin_lock_irq(&pcpu_lock);

@@ -1118,6 +1147,74 @@ static void pcpu_balance_workfn(struct work_struct *work)
		pcpu_destroy_chunk(chunk);
	}

+	/*
+	 * Ensure there is a certain number of free populated pages for
+	 * atomic allocs.  Fill up from the most packed so that atomic
+	 * allocs don't increase fragmentation.  If atomic allocation
+	 * failed previously, always populate the maximum amount.  This
+	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
+	 * failing indefinitely; however, large atomic allocs are not
+	 * something we support properly and can be highly unreliable and
+	 * inefficient.
+	 */
+retry_pop:
+	if (pcpu_atomic_alloc_failed) {
+		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+		/* best effort anyway, don't worry about synchronization */
+		pcpu_atomic_alloc_failed = false;
+	} else {
+		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
+				  pcpu_nr_empty_pop_pages,
+				  0, PCPU_EMPTY_POP_PAGES_HIGH);
+	}
+
+	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+		int nr_unpop = 0, rs, re;
+
+		if (!nr_to_pop)
+			break;
+
+		spin_lock_irq(&pcpu_lock);
+		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			nr_unpop = pcpu_unit_pages - chunk->nr_populated;
+			if (nr_unpop)
+				break;
+		}
+		spin_unlock_irq(&pcpu_lock);
+
+		if (!nr_unpop)
+			continue;
+
+		/* @chunk can't go away while pcpu_alloc_mutex is held */
+		pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+			int nr = min(re - rs, nr_to_pop);
+
+			ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+			if (!ret) {
+				nr_to_pop -= nr;
+				spin_lock_irq(&pcpu_lock);
+				pcpu_chunk_populated(chunk, rs, rs + nr);
+				spin_unlock_irq(&pcpu_lock);
+			} else {
+				nr_to_pop = 0;
+			}
+
+			if (!nr_to_pop)
+				break;
+		}
+	}
+
+	if (nr_to_pop) {
+		/* ran out of chunks to populate, create a new one and retry */
+		chunk = pcpu_create_chunk();
+		if (chunk) {
+			spin_lock_irq(&pcpu_lock);
+			pcpu_chunk_relocate(chunk, -1);
+			spin_unlock_irq(&pcpu_lock);
+			goto retry_pop;
+		}
+	}
+
	mutex_unlock(&pcpu_alloc_mutex);
}

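The replenish loop above walks the size slots starting from the one holding chunks whose largest free area is only a page, i.e. the most packed chunks, so the reserve of populated pages is built where it causes the least fragmentation; only when every existing chunk is exhausted does it create a new chunk and retry. The toy model below is a rough sketch of that control flow under heavy simplification: it ignores locking, slots and the per-region bitmaps, assumes the chunk array is already ordered from most to least packed, and every name in it (toy_chunk, toy_balance, UNIT_PAGES) is made up for illustration.

    #include <stdio.h>

    #define UNIT_PAGES	8	/* pages per chunk in this toy model */
    #define MAX_CHUNKS	16

    /* toy chunk: only tracks how many of its pages are populated */
    struct toy_chunk {
            int nr_populated;
    };

    /* pre-sorted from most packed (8/8) to least packed (2/8) */
    static struct toy_chunk chunks[MAX_CHUNKS] = { { 8 }, { 6 }, { 2 } };
    static int nr_chunks = 3;

    /*
     * Populate @nr_to_pop pages, preferring chunks that are already mostly
     * populated, and add a fresh chunk only if the existing ones run out --
     * loosely mirroring the retry_pop loop in pcpu_balance_workfn().
     */
    static void toy_balance(int nr_to_pop)
    {
    retry_pop:
            for (int i = 0; i < nr_chunks && nr_to_pop; i++) {
                    int unpop = UNIT_PAGES - chunks[i].nr_populated;
                    int nr = unpop < nr_to_pop ? unpop : nr_to_pop;

                    chunks[i].nr_populated += nr;
                    nr_to_pop -= nr;
            }
            if (nr_to_pop && nr_chunks < MAX_CHUNKS) {
                    chunks[nr_chunks++].nr_populated = 0;	/* "create" a chunk */
                    goto retry_pop;
            }
    }

    int main(void)
    {
            toy_balance(4);
            for (int i = 0; i < nr_chunks; i++)
                    printf("chunk %d: %d/%d populated\n",
                           i, chunks[i].nr_populated, UNIT_PAGES);
            return 0;
    }
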
@@ -1160,7 +1257,7 @@ void free_percpu(void __percpu *ptr)

		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
			if (pos != chunk) {
-				schedule_work(&pcpu_balance_work);
+				pcpu_schedule_balance_work();
				break;
			}
	}
@@ -2187,3 +2284,15 @@ void __init percpu_init_late(void)
		spin_unlock_irqrestore(&pcpu_lock, flags);
	}
}
+
+/*
+ * Percpu allocator is initialized early during boot when neither slab nor
+ * workqueue is available.  Plug async management until everything is up
+ * and running.
+ */
+static int __init percpu_enable_async(void)
+{
+	pcpu_async_enabled = true;
+	return 0;
+}
+subsys_initcall(percpu_enable_async);
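The allocator is up long before workqueues are, so every asynchronous path in this patch funnels through pcpu_schedule_balance_work(), which silently does nothing until percpu_enable_async() flips pcpu_async_enabled at subsys_initcall time. The userspace sketch below mimics that gating; fake_schedule_work(), schedule_balance_work() and enable_async() are stand-ins for illustration, not kernel API.

    #include <stdbool.h>
    #include <stdio.h>

    /* stand-ins for the kernel state and workqueue API */
    static bool async_enabled;	/* plays the role of pcpu_async_enabled */
    static int  scheduled;		/* counts "work items" queued */

    static void fake_schedule_work(void)
    {
            scheduled++;
    }

    /* like pcpu_schedule_balance_work(): drop requests until we're ready */
    static void schedule_balance_work(void)
    {
            if (async_enabled)
                    fake_schedule_work();
    }

    /* like percpu_enable_async(), run once workqueue machinery is up */
    static void enable_async(void)
    {
            async_enabled = true;
    }

    int main(void)
    {
            schedule_balance_work();	/* early boot: ignored */
            printf("before initcall: %d work item(s) queued\n", scheduled);

            enable_async();			/* subsys_initcall time */
            schedule_balance_work();
            printf("after initcall:  %d work item(s) queued\n", scheduled);
            return 0;
    }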