
Commit b3c03db

Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton: "10 fixes"

* emailed patches from Andrew Morton <[email protected]>:
  x86/mm: split vmalloc_sync_all()
  mm, slub: prevent kmalloc_node crashes and memory leaks
  mm/mmu_notifier: silence PROVE_RCU_LIST warnings
  epoll: fix possible lost wakeup on epoll_ctl() path
  mm: do not allow MADV_PAGEOUT for CoW pages
  mm, memcg: throttle allocators based on ancestral memory.high
  mm, memcg: fix corruption on 64-bit divisor in memory.high throttling
  page-flags: fix a crash at SetPageError(THP_SWAP)
  mm/hotplug: fix hot remove failure in SPARSEMEM|!VMEMMAP case
  memcg: fix NULL pointer dereference in __mem_cgroup_usage_unregister_event
2 parents b74b991 + 763802b commit b3c03db

13 files changed, +164 -78 lines changed

13 files changed

+164
-78
lines changed

arch/x86/mm/fault.c

Lines changed: 24 additions & 2 deletions
@@ -190,7 +190,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
         return pmd_k;
 }
 
-void vmalloc_sync_all(void)
+static void vmalloc_sync(void)
 {
         unsigned long address;
 
@@ -217,6 +217,16 @@ void vmalloc_sync_all(void)
         }
 }
 
+void vmalloc_sync_mappings(void)
+{
+        vmalloc_sync();
+}
+
+void vmalloc_sync_unmappings(void)
+{
+        vmalloc_sync();
+}
+
 /*
  * 32-bit:
  *
@@ -319,11 +329,23 @@ static void dump_pagetable(unsigned long address)
 
 #else /* CONFIG_X86_64: */
 
-void vmalloc_sync_all(void)
+void vmalloc_sync_mappings(void)
 {
+        /*
+         * 64-bit mappings might allocate new p4d/pud pages
+         * that need to be propagated to all tasks' PGDs.
+         */
         sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
 }
 
+void vmalloc_sync_unmappings(void)
+{
+        /*
+         * Unmappings never allocate or free p4d/pud pages.
+         * No work is required here.
+         */
+}
+
 /*
  * 64-bit:
  *

drivers/acpi/apei/ghes.c

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ int ghes_estatus_pool_init(int num_ghes)
          * New allocation must be visible in all pgd before it can be found by
          * an NMI allocating from the pool.
          */
-        vmalloc_sync_all();
+        vmalloc_sync_mappings();
 
         rc = gen_pool_add(ghes_estatus_pool, addr, PAGE_ALIGN(len), -1);
         if (rc)

fs/eventpoll.c

Lines changed: 4 additions & 4 deletions
@@ -1854,9 +1854,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                 waiter = true;
                 init_waitqueue_entry(&wait, current);
 
-                spin_lock_irq(&ep->wq.lock);
+                write_lock_irq(&ep->lock);
                 __add_wait_queue_exclusive(&ep->wq, &wait);
-                spin_unlock_irq(&ep->wq.lock);
+                write_unlock_irq(&ep->lock);
         }
 
         for (;;) {
@@ -1904,9 +1904,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                 goto fetch_events;
 
         if (waiter) {
-                spin_lock_irq(&ep->wq.lock);
+                write_lock_irq(&ep->lock);
                 __remove_wait_queue(&ep->wq, &wait);
-                spin_unlock_irq(&ep->wq.lock);
+                write_unlock_irq(&ep->lock);
         }
 
         return res;

include/linux/page-flags.h

Lines changed: 1 addition & 1 deletion
@@ -311,7 +311,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
 
 __PAGEFLAG(Locked, locked, PF_NO_TAIL)
 PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
-PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
 PAGEFLAG(Referenced, referenced, PF_HEAD)
         TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
         __SETPAGEFLAG(Referenced, referenced, PF_HEAD)

include/linux/vmalloc.h

Lines changed: 3 additions & 2 deletions
@@ -141,8 +141,9 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
 
 extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
                                                         unsigned long pgoff);
-void vmalloc_sync_all(void);
-
+void vmalloc_sync_mappings(void);
+void vmalloc_sync_unmappings(void);
+
 /*
  *      Lowlevel-APIs (not for driver use!)
  */

kernel/notifier.c

Lines changed: 1 addition & 1 deletion
@@ -519,7 +519,7 @@ NOKPROBE_SYMBOL(notify_die);
 
 int register_die_notifier(struct notifier_block *nb)
 {
-        vmalloc_sync_all();
+        vmalloc_sync_mappings();
         return atomic_notifier_chain_register(&die_chain, nb);
 }
 EXPORT_SYMBOL_GPL(register_die_notifier);
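
Both converted callers above only need newly created vmalloc mappings to be propagated into every page table before NMI/trap context can touch them, which is exactly what the new name states. A minimal sketch of that pattern in a hypothetical module (only vmalloc(), PAGE_SIZE and vmalloc_sync_mappings() are taken from the tree; the buffer and function names are invented for illustration):

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/errno.h>

static void *example_nmi_buffer;        /* hypothetical */

static int example_init(void)
{
        example_nmi_buffer = vmalloc(PAGE_SIZE);
        if (!example_nmi_buffer)
                return -ENOMEM;

        /*
         * Propagate the fresh vmalloc mapping into all PGDs before any
         * code that cannot take a vmalloc fault (e.g. an NMI handler)
         * may dereference the pointer.
         */
        vmalloc_sync_mappings();
        return 0;
}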

mm/madvise.c

Lines changed: 9 additions & 3 deletions
@@ -335,12 +335,14 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
         }
 
         page = pmd_page(orig_pmd);
+
+        /* Do not interfere with other mappings of this page */
+        if (page_mapcount(page) != 1)
+                goto huge_unlock;
+
         if (next - addr != HPAGE_PMD_SIZE) {
                 int err;
 
-                if (page_mapcount(page) != 1)
-                        goto huge_unlock;
-
                 get_page(page);
                 spin_unlock(ptl);
                 lock_page(page);
@@ -426,6 +428,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                         continue;
                 }
 
+                /* Do not interfere with other mappings of this page */
+                if (page_mapcount(page) != 1)
+                        continue;
+
                 VM_BUG_ON_PAGE(PageTransCompound(page), page);
 
                 if (pte_young(ptent)) {
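
With the checks above, MADV_PAGEOUT (and MADV_COLD) skip any page whose mapcount is not 1, so one process can no longer page out memory it still shares copy-on-write with a parent or child. A user-space sketch of the call itself; the fallback MADV_PAGEOUT value mirrors the uapi header and is only needed with older libc headers, and the buffer size and names are arbitrary:

#include <sys/mman.h>
#include <stddef.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21         /* from the uapi <linux/mman.h> */
#endif

int main(void)
{
        size_t len = 64 * 4096;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return 1;

        for (size_t i = 0; i < len; i += 4096)
                buf[i] = 1;             /* make the pages resident */

        /*
         * Hint that these pages can be reclaimed; pages still mapped by
         * another process (e.g. CoW-shared after fork()) are left alone
         * by the kernel after this fix.
         */
        if (madvise(buf, len, MADV_PAGEOUT) != 0)
                return 1;
        return 0;
}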

mm/memcontrol.c

Lines changed: 66 additions & 37 deletions
@@ -2297,53 +2297,51 @@ static void high_work_func(struct work_struct *work)
 #define MEMCG_DELAY_SCALING_SHIFT 14
 
 /*
- * Scheduled by try_charge() to be executed from the userland return path
- * and reclaims memory over the high limit.
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
  */
-void mem_cgroup_handle_over_high(void)
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+                                          unsigned int nr_pages)
 {
-        unsigned long usage, high, clamped_high;
-        unsigned long pflags;
-        unsigned long penalty_jiffies, overage;
-        unsigned int nr_pages = current->memcg_nr_pages_over_high;
-        struct mem_cgroup *memcg;
+        unsigned long penalty_jiffies;
+        u64 max_overage = 0;
 
-        if (likely(!nr_pages))
-                return;
+        do {
+                unsigned long usage, high;
+                u64 overage;
 
-        memcg = get_mem_cgroup_from_mm(current->mm);
-        reclaim_high(memcg, nr_pages, GFP_KERNEL);
-        current->memcg_nr_pages_over_high = 0;
+                usage = page_counter_read(&memcg->memory);
+                high = READ_ONCE(memcg->high);
+
+                /*
+                 * Prevent division by 0 in overage calculation by acting as if
+                 * it was a threshold of 1 page
+                 */
+                high = max(high, 1UL);
+
+                overage = usage - high;
+                overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+                overage = div64_u64(overage, high);
+
+                if (overage > max_overage)
+                        max_overage = overage;
+        } while ((memcg = parent_mem_cgroup(memcg)) &&
+                 !mem_cgroup_is_root(memcg));
+
+        if (!max_overage)
+                return 0;
 
         /*
-         * memory.high is breached and reclaim is unable to keep up. Throttle
-         * allocators proactively to slow down excessive growth.
-         *
          * We use overage compared to memory.high to calculate the number of
          * jiffies to sleep (penalty_jiffies). Ideally this value should be
          * fairly lenient on small overages, and increasingly harsh when the
          * memcg in question makes it clear that it has no intention of stopping
          * its crazy behaviour, so we exponentially increase the delay based on
          * overage amount.
          */
-
-        usage = page_counter_read(&memcg->memory);
-        high = READ_ONCE(memcg->high);
-
-        if (usage <= high)
-                goto out;
-
-        /*
-         * Prevent division by 0 in overage calculation by acting as if it was a
-         * threshold of 1 page
-         */
-        clamped_high = max(high, 1UL);
-
-        overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
-                          clamped_high);
-
-        penalty_jiffies = ((u64)overage * overage * HZ)
-                >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+        penalty_jiffies = max_overage * max_overage * HZ;
+        penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
+        penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
 
         /*
          * Factor in the task's own contribution to the overage, such that four
@@ -2360,7 +2358,32 @@ void mem_cgroup_handle_over_high(void)
          * application moving forwards and also permit diagnostics, albeit
          * extremely slowly.
          */
-        penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+        return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+}
+
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+        unsigned long penalty_jiffies;
+        unsigned long pflags;
+        unsigned int nr_pages = current->memcg_nr_pages_over_high;
+        struct mem_cgroup *memcg;
+
+        if (likely(!nr_pages))
+                return;
+
+        memcg = get_mem_cgroup_from_mm(current->mm);
+        reclaim_high(memcg, nr_pages, GFP_KERNEL);
+        current->memcg_nr_pages_over_high = 0;
+
+        /*
+         * memory.high is breached and reclaim is unable to keep up. Throttle
+         * allocators proactively to slow down excessive growth.
+         */
+        penalty_jiffies = calculate_high_delay(memcg, nr_pages);
 
         /*
          * Don't sleep if the amount of jiffies this memcg owes us is so low
@@ -4027,7 +4050,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
         struct mem_cgroup_thresholds *thresholds;
         struct mem_cgroup_threshold_ary *new;
         unsigned long usage;
-        int i, j, size;
+        int i, j, size, entries;
 
         mutex_lock(&memcg->thresholds_lock);
 
@@ -4047,14 +4070,20 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
 
         /* Calculate new number of threshold */
-        size = 0;
+        size = entries = 0;
         for (i = 0; i < thresholds->primary->size; i++) {
                 if (thresholds->primary->entries[i].eventfd != eventfd)
                         size++;
+                else
+                        entries++;
         }
 
         new = thresholds->spare;
 
+        /* If no items related to eventfd have been cleared, nothing to do */
+        if (!entries)
+                goto unlock;
+
         /* Set thresholds array to NULL if we don't have thresholds */
         if (!size) {
                 kfree(new);
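
The new calculate_high_delay() walks from the charging memcg up towards the root, computes each level's overage as a fixed-point fraction of its memory.high, keeps the largest value, and squares it to grow the delay exponentially; div64_u64() keeps the division correct even when memory.high does not fit in 32 bits, which is the corruption the second patch fixes. A stand-alone user-space sketch of the same arithmetic; MEMCG_DELAY_SCALING_SHIFT (14) appears in the hunk, while the precision shift and HZ values below are assumptions for illustration, and the final clamp to MEMCG_MAX_HIGH_DELAY_JIFFIES is omitted:

#include <stdio.h>
#include <stdint.h>

#define DELAY_PRECISION_SHIFT 20        /* assumed fixed-point precision */
#define DELAY_SCALING_SHIFT   14        /* from the diff */
#define HZ                    250       /* assumed kernel config */

static uint64_t overage_of(uint64_t usage_pages, uint64_t high_pages)
{
        if (usage_pages <= high_pages)
                return 0;
        /* fixed-point fraction of memory.high we are over, as in the loop */
        return ((usage_pages - high_pages) << DELAY_PRECISION_SHIFT) / high_pages;
}

int main(void)
{
        /* e.g. the charging cgroup is 10% over, but an ancestor is 50% over */
        uint64_t max_overage = overage_of(110, 100);
        uint64_t ancestor = overage_of(150, 100);

        if (ancestor > max_overage)
                max_overage = ancestor;         /* ancestral overage now counts */

        uint64_t penalty_jiffies = max_overage * max_overage * HZ;
        penalty_jiffies >>= DELAY_PRECISION_SHIFT;
        penalty_jiffies >>= DELAY_SCALING_SHIFT;

        /* prints 4000 with these numbers, before the kernel's final clamp */
        printf("unclamped penalty: %llu jiffies\n",
               (unsigned long long)penalty_jiffies);
        return 0;
}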

mm/mmu_notifier.c

Lines changed: 18 additions & 9 deletions
@@ -307,7 +307,8 @@ static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions,
          * ->release returns.
          */
         id = srcu_read_lock(&srcu);
-        hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist)
+        hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu))
                 /*
                  * If ->release runs before mmu_notifier_unregister it must be
                  * handled, as it's the only way for the driver to flush all
@@ -370,7 +371,8 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 
         id = srcu_read_lock(&srcu);
         hlist_for_each_entry_rcu(subscription,
-                                 &mm->notifier_subscriptions->list, hlist) {
+                                 &mm->notifier_subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
                 if (subscription->ops->clear_flush_young)
                         young |= subscription->ops->clear_flush_young(
                                 subscription, mm, start, end);
@@ -389,7 +391,8 @@ int __mmu_notifier_clear_young(struct mm_struct *mm,
 
         id = srcu_read_lock(&srcu);
         hlist_for_each_entry_rcu(subscription,
-                                 &mm->notifier_subscriptions->list, hlist) {
+                                 &mm->notifier_subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
                 if (subscription->ops->clear_young)
                         young |= subscription->ops->clear_young(subscription,
                                                                 mm, start, end);
@@ -407,7 +410,8 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
 
         id = srcu_read_lock(&srcu);
         hlist_for_each_entry_rcu(subscription,
-                                 &mm->notifier_subscriptions->list, hlist) {
+                                 &mm->notifier_subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
                 if (subscription->ops->test_young) {
                         young = subscription->ops->test_young(subscription, mm,
                                                               address);
@@ -428,7 +432,8 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 
         id = srcu_read_lock(&srcu);
         hlist_for_each_entry_rcu(subscription,
-                                 &mm->notifier_subscriptions->list, hlist) {
+                                 &mm->notifier_subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
                 if (subscription->ops->change_pte)
                         subscription->ops->change_pte(subscription, mm, address,
                                                       pte);
@@ -476,7 +481,8 @@ static int mn_hlist_invalidate_range_start(
         int id;
 
         id = srcu_read_lock(&srcu);
-        hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
+        hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
                 const struct mmu_notifier_ops *ops = subscription->ops;
 
                 if (ops->invalidate_range_start) {
@@ -528,7 +534,8 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
         int id;
 
         id = srcu_read_lock(&srcu);
-        hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
+        hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
                 /*
                  * Call invalidate_range here too to avoid the need for the
                  * subsystem of having to register an invalidate_range_end
@@ -582,7 +589,8 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 
         id = srcu_read_lock(&srcu);
         hlist_for_each_entry_rcu(subscription,
-                                 &mm->notifier_subscriptions->list, hlist) {
+                                 &mm->notifier_subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
                 if (subscription->ops->invalidate_range)
                         subscription->ops->invalidate_range(subscription, mm,
                                                             start, end);
@@ -714,7 +722,8 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
 
         spin_lock(&mm->notifier_subscriptions->lock);
         hlist_for_each_entry_rcu(subscription,
-                                 &mm->notifier_subscriptions->list, hlist) {
+                                 &mm->notifier_subscriptions->list, hlist,
+                                 lockdep_is_held(&mm->notifier_subscriptions->lock)) {
                 if (subscription->ops != ops)
                         continue;
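
CONFIG_PROVE_RCU_LIST warns whenever hlist_for_each_entry_rcu() runs without rcu_read_lock() unless the caller passes the optional lockdep condition that explains what actually protects the walk; here that is the mmu_notifier SRCU domain, or the subscriptions spinlock in find_get_mmu_notifier(). A minimal kernel-side sketch of the same pattern with hypothetical names ("example_srcu", "example_list", "struct example_item"); only the hlist_for_each_entry_rcu()/srcu_read_lock_held() usage mirrors the patch:

#include <linux/rculist.h>
#include <linux/srcu.h>

struct example_item {
        struct hlist_node node;
};

DEFINE_STATIC_SRCU(example_srcu);
static HLIST_HEAD(example_list);

static void example_walk(void)
{
        struct example_item *item;
        int idx = srcu_read_lock(&example_srcu);

        /*
         * The extra argument tells lockdep why the traversal is safe even
         * though rcu_read_lock() is not held, so PROVE_RCU_LIST stays quiet.
         */
        hlist_for_each_entry_rcu(item, &example_list, node,
                                 srcu_read_lock_held(&example_srcu)) {
                /* ... use item ... */
        }

        srcu_read_unlock(&example_srcu, idx);
}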
