
Commit 5df397d

torvalds authored and akpm00 committed
mm: delay page_remove_rmap() until after the TLB has been flushed
When we remove a page table entry, we are very careful to only free the page after we have flushed the TLB, because other CPUs could still be using the page through stale TLB entries until after the flush.

However, we have removed the rmap entry for that page early, which means that functions like folio_mkclean() would end up not serializing with the page table lock because the page had already been made invisible to rmap.

And that is a problem, because while the TLB entry exists, we could end up with the following situation:

 (a) one CPU could come in and clean it, never seeing our mapping of the page

 (b) another CPU could continue to use the stale and dirty TLB entry and continue to write to said page

resulting in a page that has been dirtied, but then marked clean again, all while another CPU might have dirtied it some more.

End result: possibly lost dirty data.

This extends our current TLB gather infrastructure to optionally track a "should I do a delayed page_remove_rmap() for this page after flushing the TLB". It uses the newly introduced 'encoded page pointer' to do that without having to keep separate data around.

Note, this is complicated by a couple of issues:

 - we want to delay the rmap removal, but not past the page table lock, because that simplifies the memcg accounting

 - only SMP configurations want to delay TLB flushing, since on UP there are obviously no remote TLBs to worry about, and the page table lock means there are no preemption issues either

 - s390 has its own mmu_gather model that doesn't delay TLB flushing, and as a result also does not want the delayed rmap. As such, we can treat S390 like the UP case and use a common fallback for the "no delays" case.

 - we can track an enormous number of pages in our mmu_gather structure, with MAX_GATHER_BATCH_COUNT batches of MAX_TABLE_BATCH pages each, all set up to be approximately 10k pending pages. We do not want to have a huge number of batched pages that we then need to check for delayed rmap handling inside the page table lock.

Particularly that last point results in a noteworthy detail, where the normal page batch gathering is limited once we have delayed rmaps pending, in such a way that only the last batch (the so-called "active batch") in the mmu_gather structure can have any delayed entries.

NOTE! While the "possibly lost dirty data" sounds catastrophic, for this all to happen you need to have a user thread doing either madvise() with MADV_DONTNEED or a full re-mmap() of the area concurrently with another thread continuing to use said mapping. So arguably this is about user space doing crazy things, but from a VM consistency standpoint it's better if we track the dirty bit properly even when user space goes off the rails.

[[email protected]: fix UP build, per Linus]
Link: https://lore.kernel.org/all/[email protected]/
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Linus Torvalds <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Acked-by: Hugh Dickins <[email protected]>
Reported-by: Nadav Amit <[email protected]>
Tested-by: Nadav Amit <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
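A note on the 'encoded page pointer' mentioned above: because struct page pointers are aligned, their low bits are always zero and can carry a small flag word, which is how the delayed-rmap marker rides along in the slot the gather batch already stores. Below is a minimal, self-contained sketch of that packing idea only; the names are hypothetical and this is not the kernel's encode_page()/encoded_page_ptr()/encoded_page_flags() implementation.

/*
 * Illustrative only: pack a small flag word into the low bits of an
 * aligned pointer.  Hypothetical names; not the kernel's encoded_page
 * helpers, just the underlying idea.
 */
#include <stdint.h>

#define EXAMPLE_FLAG_MASK ((uintptr_t)3)   /* alignment bits available for flags */

static inline void *example_encode(void *ptr, unsigned int flags)
{
        /* assumes ptr is at least 4-byte aligned and flags fit in 2 bits */
        return (void *)((uintptr_t)ptr | ((uintptr_t)flags & EXAMPLE_FLAG_MASK));
}

static inline void *example_ptr(void *enc)
{
        return (void *)((uintptr_t)enc & ~EXAMPLE_FLAG_MASK);
}

static inline unsigned int example_flags(void *enc)
{
        return (unsigned int)((uintptr_t)enc & EXAMPLE_FLAG_MASK);
}

Round-tripping works because the flag bits and the address bits never overlap: example_ptr(example_encode(p, 1)) returns p, and example_flags() returns 1.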
1 parent 7cc8f9c · commit 5df397d

4 files changed, +82 −8 lines

arch/s390/include/asm/tlb.h

Lines changed: 3 additions & 0 deletions

@@ -41,6 +41,9 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
  * Release the page cache reference for a pte removed by
  * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
  * has already been freed, so just do free_page_and_swap_cache.
+ *
+ * s390 doesn't delay rmap removal, so there is nothing encoded in
+ * the page pointer.
  */
 static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
                 struct encoded_page *page,

include/asm-generic/tlb.h

Lines changed: 29 additions & 2 deletions

@@ -263,6 +263,28 @@ struct mmu_gather_batch {
 extern bool __tlb_remove_page_size(struct mmu_gather *tlb,
                 struct encoded_page *page,
                 int page_size);
+
+#ifdef CONFIG_SMP
+/*
+ * This both sets 'delayed_rmap', and returns true. It would be an inline
+ * function, except we define it before the 'struct mmu_gather'.
+ */
+#define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true)
+extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma);
+#endif
+
+#endif
+
+/*
+ * We have a no-op version of the rmap removal that doesn't
+ * delay anything. That is used on S390, which flushes remote
+ * TLBs synchronously, and on UP, which doesn't have any
+ * remote TLBs to flush and is not preemptible due to this
+ * all happening under the page table lock.
+ */
+#ifndef tlb_delay_rmap
+#define tlb_delay_rmap(tlb) (false)
+static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
 #endif

 /*
@@ -295,6 +317,11 @@ struct mmu_gather {
         */
        unsigned int            freed_tables : 1;

+       /*
+        * Do we have pending delayed rmap removals?
+        */
+       unsigned int            delayed_rmap : 1;
+
        /*
         * at which levels have we cleared entries?
         */
@@ -440,9 +467,9 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
                tlb_flush_mmu(tlb);
 }

-static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page, unsigned int flags)
 {
-       return __tlb_remove_page_size(tlb, encode_page(page, 0), PAGE_SIZE);
+       return __tlb_remove_page_size(tlb, encode_page(page, flags), PAGE_SIZE);
 }

 /* tlb_remove_page

mm/memory.c

Lines changed: 17 additions & 6 deletions

@@ -1374,6 +1374,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                        break;

                if (pte_present(ptent)) {
+                       unsigned int delay_rmap;
+
                        page = vm_normal_page(vma, addr, ptent);
                        if (unlikely(!should_zap_page(details, page)))
                                continue;
@@ -1385,20 +1387,26 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                        if (unlikely(!page))
                                continue;

+                       delay_rmap = 0;
                        if (!PageAnon(page)) {
                                if (pte_dirty(ptent)) {
-                                       force_flush = 1;
                                        set_page_dirty(page);
+                                       if (tlb_delay_rmap(tlb)) {
+                                               delay_rmap = 1;
+                                               force_flush = 1;
+                                       }
                                }
                                if (pte_young(ptent) &&
                                    likely(!(vma->vm_flags & VM_SEQ_READ)))
                                        mark_page_accessed(page);
                        }
                        rss[mm_counter(page)]--;
-                       page_remove_rmap(page, vma, false);
-                       if (unlikely(page_mapcount(page) < 0))
-                               print_bad_pte(vma, addr, ptent, page);
-                       if (unlikely(__tlb_remove_page(tlb, page))) {
+                       if (!delay_rmap) {
+                               page_remove_rmap(page, vma, false);
+                               if (unlikely(page_mapcount(page) < 0))
+                                       print_bad_pte(vma, addr, ptent, page);
+                       }
+                       if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
                                force_flush = 1;
                                addr += PAGE_SIZE;
                                break;
@@ -1455,8 +1463,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
        arch_leave_lazy_mmu_mode();

        /* Do the actual TLB flush before dropping ptl */
-       if (force_flush)
+       if (force_flush) {
                tlb_flush_mmu_tlbonly(tlb);
+               if (tlb->delayed_rmap)
+                       tlb_flush_rmaps(tlb, vma);
+       }
        pte_unmap_unlock(start_pte, ptl);

        /*

mm/mmu_gather.c

Lines changed: 33 additions & 0 deletions

@@ -9,6 +9,7 @@
 #include <linux/rcupdate.h>
 #include <linux/smp.h>
 #include <linux/swap.h>
+#include <linux/rmap.h>

 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
@@ -19,6 +20,10 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
 {
        struct mmu_gather_batch *batch;

+       /* No more batching if we have delayed rmaps pending */
+       if (tlb->delayed_rmap)
+               return false;
+
        batch = tlb->active;
        if (batch->next) {
                tlb->active = batch->next;
@@ -43,6 +48,33 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
        return true;
 }

+#ifdef CONFIG_SMP
+/**
+ * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
+ * @tlb: the current mmu_gather
+ *
+ * Note that because of how tlb_next_batch() above works, we will
+ * never start new batches with pending delayed rmaps, so we only
+ * need to walk through the current active batch.
+ */
+void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+       struct mmu_gather_batch *batch;
+
+       batch = tlb->active;
+       for (int i = 0; i < batch->nr; i++) {
+               struct encoded_page *enc = batch->encoded_pages[i];
+
+               if (encoded_page_flags(enc)) {
+                       struct page *page = encoded_page_ptr(enc);
+                       page_remove_rmap(page, vma, false);
+               }
+       }
+
+       tlb->delayed_rmap = 0;
+}
+#endif
+
 static void tlb_batch_pages_flush(struct mmu_gather *tlb)
 {
        struct mmu_gather_batch *batch;
@@ -284,6 +316,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
        tlb->active = &tlb->local;
        tlb->batch_count = 0;
 #endif
+       tlb->delayed_rmap = 0;

        tlb_table_init(tlb);
 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
