Commit 2e7ce7d

howlett authored and akpm00 committed
mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()
Avoid allocating a new VMA when a VMA modification can occur instead. When brk() can expand or contract an existing VMA, the single store operation will only modify one index of the maple tree instead of causing a node to split or coalesce. This avoids unnecessary allocations/frees of maple tree nodes and VMAs.

Move some limit & flag verifications out of do_brk_flags() so that only the relevant checks run in the brk() and vm_brk_flags() code paths. Set up the VMA to check for expansion in vm_brk_flags() when the extra criteria are met. Drop userfaultfd from the do_brk_flags() path and use it only in the vm_brk_flags() path, since that is the only place a munmap will happen. Add a wrapper for munmap for the brk case, called do_brk_munmap().

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Liam R. Howlett <[email protected]>
Tested-by: Yu Zhao <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: David Howells <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: "Matthew Wilcox (Oracle)" <[email protected]>
Cc: SeongJae Park <[email protected]>
Cc: Sven Schnelle <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Will Deacon <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent 94d815b commit 2e7ce7d
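
Both modified paths are driven by the ordinary brk()/sbrk() interface: growing the break takes the expand path in do_brk_flags(), while shrinking goes through the new do_brk_munmap() wrapper. A minimal userspace sketch of the two motions (an illustration only, not part of the commit):

#define _DEFAULT_SOURCE		/* expose brk()/sbrk() in glibc */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	void *start = sbrk(0);			/* current program break */

	/* Grow by 16 pages: with this patch the brk VMA expands in place. */
	if (sbrk(16 * 4096) == (void *) -1) {
		perror("sbrk");
		return 1;
	}
	printf("break: %p -> %p\n", start, sbrk(0));

	/* Shrink back: handled by do_brk_munmap(), which may downgrade
	 * mmap_lock to read. */
	if (brk(start) != 0) {
		perror("brk");
		return 1;
	}
	printf("break restored: %p\n", sbrk(0));
	return 0;
}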

File tree

1 file changed (+177, -60 lines changed)

mm/mmap.c

Lines changed: 177 additions & 60 deletions
@@ -147,17 +147,40 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	return next;
 }
 
-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
-			struct list_head *uf);
+/*
+ * check_brk_limits() - Use platform specific check of range & verify mlock
+ * limits.
+ * @addr: The address to check
+ * @len: The size of increase.
+ *
+ * Return: 0 on success.
+ */
+static int check_brk_limits(unsigned long addr, unsigned long len)
+{
+	unsigned long mapped_addr;
+
+	mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+	if (IS_ERR_VALUE(mapped_addr))
+		return mapped_addr;
+
+	return mlock_future_check(current->mm, current->mm->def_flags, len);
+}
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+			 unsigned long newbrk, unsigned long oldbrk,
+			 struct list_head *uf);
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma,
+			unsigned long addr, unsigned long request,
+			unsigned long flags);
 SYSCALL_DEFINE1(brk, unsigned long, brk)
 {
 	unsigned long newbrk, oldbrk, origbrk;
 	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *next;
+	struct vm_area_struct *brkvma, *next = NULL;
 	unsigned long min_brk;
 	bool populate;
 	bool downgraded = false;
 	LIST_HEAD(uf);
+	MA_STATE(mas, &mm->mm_mt, 0, 0);
 
 	if (mmap_write_lock_killable(mm))
 		return -EINTR;
@@ -199,35 +222,52 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 
 	/*
 	 * Always allow shrinking brk.
-	 * __do_munmap() may downgrade mmap_lock to read.
+	 * do_brk_munmap() may downgrade mmap_lock to read.
 	 */
 	if (brk <= mm->brk) {
 		int ret;
 
+		/* Search one past newbrk */
+		mas_set(&mas, newbrk);
+		brkvma = mas_find(&mas, oldbrk);
+		BUG_ON(brkvma == NULL);
+		if (brkvma->vm_start >= oldbrk)
+			goto out; /* mapping intersects with an existing non-brk vma. */
 		/*
-		 * mm->brk must to be protected by write mmap_lock so update it
-		 * before downgrading mmap_lock. When __do_munmap() fails,
-		 * mm->brk will be restored from origbrk.
+		 * mm->brk must be protected by write mmap_lock.
+		 * do_brk_munmap() may downgrade the lock, so update it
+		 * before calling do_brk_munmap().
 		 */
 		mm->brk = brk;
-		ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
-		if (ret < 0) {
-			mm->brk = origbrk;
-			goto out;
-		} else if (ret == 1) {
+		mas.last = oldbrk - 1;
+		ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
+		if (ret == 1) {
 			downgraded = true;
-		}
-		goto success;
+			goto success;
+		} else if (!ret)
+			goto success;
+
+		mm->brk = origbrk;
+		goto out;
 	}
 
-	/* Check against existing mmap mappings. */
-	next = find_vma(mm, oldbrk);
+	if (check_brk_limits(oldbrk, newbrk - oldbrk))
+		goto out;
+
+	/*
+	 * Only check if the next VMA is within the stack_guard_gap of the
+	 * expansion area
+	 */
+	mas_set(&mas, oldbrk);
+	next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
 	if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
 		goto out;
 
+	brkvma = mas_prev(&mas, mm->start_brk);
 	/* Ok, looks good - let it rip. */
-	if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
+	if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
 		goto out;
+
 	mm->brk = brk;
 
 success:
@@ -2762,38 +2802,55 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 }
 
 /*
- * this is really a simplified "do_mmap". it only handles
- * anonymous maps. eventually we may be able to do some
- * brk-specific accounting here.
+ * brk_munmap() - Unmap a parital vma.
+ * @mas: The maple tree state.
+ * @vma: The vma to be modified
+ * @newbrk: the start of the address to unmap
+ * @oldbrk: The end of the address to unmap
+ * @uf: The userfaultfd list_head
+ *
+ * Returns: 1 on success.
+ * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if
+ * possible.
  */
-static int do_brk_flags(unsigned long addr, unsigned long len,
-		unsigned long flags, struct list_head *uf)
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+			 unsigned long newbrk, unsigned long oldbrk,
+			 struct list_head *uf)
 {
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma, *prev;
-	pgoff_t pgoff = addr >> PAGE_SHIFT;
-	int error;
-	unsigned long mapped_addr;
-	validate_mm_mt(mm);
-
-	/* Until we need other flags, refuse anything except VM_EXEC. */
-	if ((flags & (~VM_EXEC)) != 0)
-		return -EINVAL;
-	flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
-
-	mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
-	if (IS_ERR_VALUE(mapped_addr))
-		return mapped_addr;
+	struct mm_struct *mm = vma->vm_mm;
+	int ret;
 
-	error = mlock_future_check(mm, mm->def_flags, len);
-	if (error)
-		return error;
+	arch_unmap(mm, newbrk, oldbrk);
+	ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true);
+	validate_mm_mt(mm);
+	return ret;
+}
 
-	/* Clear old maps, set up prev and uf */
-	if (munmap_vma_range(mm, addr, len, &prev, uf))
-		return -ENOMEM;
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @mas: The maple tree state.
+ * @addr: The start address
+ * @len: The length of the increase
+ * @vma: The vma,
+ * @flags: The VMA Flags
+ *
+ * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA. Eventually we may be able to
+ * do some brk-specific accounting here.
+ */
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
+			unsigned long addr, unsigned long len,
+			unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *prev = NULL;
 
-	/* Check against address space limits *after* clearing old maps... */
+	validate_mm_mt(mm);
+	/*
+	 * Check against address space limits by the changed size
+	 * Note: This happens *after* clearing old mappings in some code paths.
+	 */
+	flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 	if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
@@ -2803,30 +2860,54 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
 	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
-	/* Can we just expand an old private anonymous mapping? */
-	vma = vma_merge(mm, prev, addr, addr + len, flags,
-			NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
-	if (vma)
-		goto out;
-
 	/*
-	 * create a vma struct for an anonymous mapping
+	 * Expand the existing vma if possible; Note that singular lists do not
+	 * occur after forking, so the expand will only happen on new VMAs.
 	 */
-	vma = vm_area_alloc(mm);
-	if (!vma) {
-		vm_unacct_memory(len >> PAGE_SHIFT);
-		return -ENOMEM;
+	if (vma &&
+	    (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
+	    ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
+		mas->index = vma->vm_start;
+		mas->last = addr + len - 1;
+		vma_adjust_trans_huge(vma, addr, addr + len, 0);
+		if (vma->anon_vma) {
+			anon_vma_lock_write(vma->anon_vma);
+			anon_vma_interval_tree_pre_update_vma(vma);
+		}
+		vma->vm_end = addr + len;
+		vma->vm_flags |= VM_SOFTDIRTY;
+		if (mas_store_gfp(mas, vma, GFP_KERNEL))
+			goto mas_expand_failed;
+
+		if (vma->anon_vma) {
+			anon_vma_interval_tree_post_update_vma(vma);
+			anon_vma_unlock_write(vma->anon_vma);
+		}
+		khugepaged_enter_vma(vma, flags);
+		goto out;
 	}
+	prev = vma;
+
+	/* create a vma struct for an anonymous mapping */
+	vma = vm_area_alloc(mm);
+	if (!vma)
+		goto vma_alloc_fail;
 
 	vma_set_anonymous(vma);
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
-	vma->vm_pgoff = pgoff;
+	vma->vm_pgoff = addr >> PAGE_SHIFT;
 	vma->vm_flags = flags;
 	vma->vm_page_prot = vm_get_page_prot(flags);
-	if (vma_link(mm, vma, prev))
-		goto no_vma_link;
+	mas_set_range(mas, vma->vm_start, addr + len - 1);
+	if (mas_store_gfp(mas, vma, GFP_KERNEL))
+		goto mas_store_fail;
+
+	if (!prev)
+		prev = mas_prev(mas, 0);
 
+	__vma_link_list(mm, vma, prev);
+	mm->map_count++;
 out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
@@ -2837,18 +2918,29 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
 	validate_mm_mt(mm);
 	return 0;
 
-no_vma_link:
+mas_store_fail:
 	vm_area_free(vma);
+vma_alloc_fail:
+	vm_unacct_memory(len >> PAGE_SHIFT);
+	return -ENOMEM;
+
+mas_expand_failed:
+	if (vma->anon_vma) {
+		anon_vma_interval_tree_post_update_vma(vma);
+		anon_vma_unlock_write(vma->anon_vma);
+	}
 	return -ENOMEM;
 }
 
 int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma = NULL;
 	unsigned long len;
 	int ret;
 	bool populate;
 	LIST_HEAD(uf);
+	MA_STATE(mas, &mm->mm_mt, addr, addr);
 
 	len = PAGE_ALIGN(request);
 	if (len < request)
@@ -2859,13 +2951,38 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
 	if (mmap_write_lock_killable(mm))
 		return -EINTR;
 
-	ret = do_brk_flags(addr, len, flags, &uf);
+	/* Until we need other flags, refuse anything except VM_EXEC. */
+	if ((flags & (~VM_EXEC)) != 0)
+		return -EINVAL;
+
+	ret = check_brk_limits(addr, len);
+	if (ret)
+		goto limits_failed;
+
+	if (find_vma_intersection(mm, addr, addr + len))
+		ret = do_munmap(mm, addr, len, &uf);
+
+	if (ret)
+		goto munmap_failed;
+
+	vma = mas_prev(&mas, 0);
+	if (!vma || vma->vm_end != addr || vma_policy(vma) ||
+	    !can_vma_merge_after(vma, flags, NULL, NULL,
+				 addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL))
+		vma = NULL;
+
+	ret = do_brk_flags(&mas, vma, addr, len, flags);
 	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	mmap_write_unlock(mm);
 	userfaultfd_unmap_complete(mm, &uf);
 	if (populate && !ret)
 		mm_populate(addr, len);
 	return ret;
+
+munmap_failed:
+limits_failed:
+	mmap_write_unlock(mm);
+	return ret;
 }
 EXPORT_SYMBOL(vm_brk_flags);
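
As an illustrative check of the intended effect from userspace (a sketch, not part of the commit): grow the program break one page at a time and print the [heap] line of /proc/self/maps. With in-place expansion the heap remains a single VMA whose end address advances, rather than accumulating additional anonymous mappings.

#define _DEFAULT_SOURCE		/* expose sbrk() in glibc */
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Print every line of /proc/self/maps that describes the heap. */
static void print_heap_lines(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/maps", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "[heap]"))
			fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		if (sbrk(4096) == (void *) -1)	/* grow by one page */
			break;
		print_heap_lines();	/* expect one line, end address moving up */
	}
	return 0;
}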
