Commit dc9aa5b

Christoph Lameter authored and Linus Torvalds committed
[PATCH] Swap Migration V5: MPOL_MF_MOVE interface
Add page migration support via swap to the NUMA policy layer. An additional flag, MPOL_MF_MOVE, is introduced for mbind(). If MPOL_MF_MOVE is specified, pages that do not conform to the memory policy are evicted from memory; when they are paged back in, new pages are allocated following the NUMA policy.

Signed-off-by: Christoph Lameter <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
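To illustrate the new interface, here is a minimal userspace sketch (not part of the patch) that binds an anonymous mapping to node 0 and asks the kernel to migrate any nonconforming pages. mbind() and MPOL_BIND come from libnuma's <numaif.h> (link with -lnuma); the mapping size and node mask are illustrative, and MPOL_MF_MOVE is defined locally with the value this patch assigns, in case the installed headers predate it.

#include <numaif.h>		/* mbind(), MPOL_BIND */
#include <sys/mman.h>
#include <stdio.h>

#ifndef MPOL_MF_MOVE
#define MPOL_MF_MOVE	(1<<1)	/* value assigned by this patch */
#endif

int main(void)
{
	size_t len = 8 * 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	unsigned long nodemask = 1UL << 0;	/* node 0 only */

	/*
	 * With MPOL_MF_MOVE, pages already faulted in on other nodes are
	 * evicted; they are reallocated on node 0 when paged back in.
	 */
	if (mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_MOVE) != 0)
		perror("mbind");
	return 0;
}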
Parent: 7cbe34c

Showing 2 changed files with 138 additions and 20 deletions.

include/linux/mempolicy.h (3 additions, 0 deletions)

@@ -22,6 +22,9 @@
 
 /* Flags for mbind */
 #define MPOL_MF_STRICT	(1<<0)	/* Verify existing pages in the mapping */
+#define MPOL_MF_MOVE	(1<<1)	/* Move pages owned by this process to conform to mapping */
+#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to mapping */
+#define MPOL_MF_INTERNAL (1<<3)	/* Internal flags start here */
 
 #ifdef __KERNEL__
 
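The split between MPOL_MF_MOVE and MPOL_MF_MOVE_ALL matters to callers: as the do_mbind() hunk further below shows, MPOL_MF_MOVE_ALL is rejected with EPERM unless the caller has CAP_SYS_RESOURCE, while MPOL_MF_MOVE migrates only pages not shared with other processes. A hedged userspace sketch (not from the patch) of falling back from one to the other; mbind() is libnuma's wrapper, and the flag values are defined locally per this patch:

#include <numaif.h>
#include <errno.h>

#ifndef MPOL_MF_MOVE
#define MPOL_MF_MOVE	 (1<<1)
#define MPOL_MF_MOVE_ALL (1<<2)
#endif

/* Try to move shared pages too; fall back if we lack CAP_SYS_RESOURCE. */
static long bind_and_move(void *addr, unsigned long len,
			  const unsigned long *mask, unsigned long maxnode)
{
	long rc = mbind(addr, len, MPOL_BIND, mask, maxnode,
			MPOL_MF_MOVE_ALL);
	if (rc != 0 && errno == EPERM)
		rc = mbind(addr, len, MPOL_BIND, mask, maxnode,
			   MPOL_MF_MOVE);
	return rc;
}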

mm/mempolicy.c (135 additions, 20 deletions)

@@ -83,9 +83,14 @@
 #include <linux/init.h>
 #include <linux/compat.h>
 #include <linux/mempolicy.h>
+#include <linux/swap.h>
+
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
+/* Internal MPOL_MF_xxx flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 

@@ -174,9 +179,59 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
+/* Check if we are the only process mapping the page in question */
+static inline int single_mm_mapping(struct mm_struct *mm,
+		struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int rc = 1;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+out:
+	spin_unlock(&mapping->i_mmap_lock);
+	return rc;
+}
+
+/*
+ * Add a page to be migrated to the pagelist
+ */
+static void migrate_page_add(struct vm_area_struct *vma,
+	struct page *page, struct list_head *pagelist, unsigned long flags)
+{
+	/*
+	 * Avoid migrating a page that is shared by others and not writable.
+	 */
+	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
+	    mapping_writably_mapped(page->mapping) ||
+	    single_mm_mapping(vma->vm_mm, page->mapping)) {
+		int rc = isolate_lru_page(page);
+
+		if (rc == 1)
+			list_add(&page->lru, pagelist);
+		/*
+		 * If the isolate attempt was not successful then we just
+		 * encountered an unswappable page. Something must be wrong.
+		 */
+		WARN_ON(rc == 0);
+	}
+}
+
 /* Ensure all existing pages follow the policy. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
@@ -193,15 +248,21 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!page)
 			continue;
 		nid = page_to_nid(page);
-		if (!node_isset(nid, *nodes))
-			break;
+		if (!node_isset(nid, *nodes)) {
+			if (pagelist)
+				migrate_page_add(vma, page, pagelist, flags);
+			else
+				break;
+		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
 	return addr != end;
 }
 
 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -211,14 +272,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -228,14 +292,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -245,16 +312,31 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
-/* Step 1: check the range */
+/* Check if a vma is migratable */
+static inline int vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (
+		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
+		return 0;
+	return 1;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
+ */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-		nodemask_t *nodes, unsigned long flags)
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -264,17 +346,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return ERR_PTR(-EFAULT);
-		if (prev && prev->vm_end < vma->vm_start)
-			return ERR_PTR(-EFAULT);
-		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+			if (!vma->vm_next && vma->vm_end < end)
+				return ERR_PTR(-EFAULT);
+			if (prev && prev->vm_end < vma->vm_start)
+				return ERR_PTR(-EFAULT);
+		}
+		if (!is_vm_hugetlb_page(vma) &&
+		    ((flags & MPOL_MF_STRICT) ||
+		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+				vma_migratable(vma)))) {
 			unsigned long endvma = vma->vm_end;
+
 			if (endvma > end)
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma, start, endvma, nodes);
+			err = check_pgd_range(vma, start, endvma, nodes,
+						flags, pagelist);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
@@ -348,33 +437,59 @@ long do_mbind(unsigned long start, unsigned long len,
 	struct mempolicy *new;
 	unsigned long end;
 	int err;
+	LIST_HEAD(pagelist);
 
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+	    || mode > MPOL_MAX)
 		return -EINVAL;
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
+
 	if (mode == MPOL_DEFAULT)
 		flags &= ~MPOL_MF_STRICT;
+
 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 	end = start + len;
+
 	if (end < start)
 		return -EINVAL;
 	if (end == start)
 		return 0;
+
 	if (mpol_check_policy(mode, nmask))
 		return -EINVAL;
+
 	new = mpol_new(mode, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	/*
+	 * If we are using the default policy then operation
+	 * on discontinuous address spaces is okay after all
+	 */
+	if (!new)
+		flags |= MPOL_MF_DISCONTIG_OK;
+
 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 			mode,nodes_addr(nodes)[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nmask, flags);
+	vma = check_range(mm, start, end, nmask, flags,
+	      (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
 	err = PTR_ERR(vma);
-	if (!IS_ERR(vma))
+	if (!IS_ERR(vma)) {
 		err = mbind_range(vma, start, end, new);
+		if (!list_empty(&pagelist))
+			migrate_pages(&pagelist, NULL);
+		if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT))
+			err = -EIO;
+	}
+	if (!list_empty(&pagelist))
+		putback_lru_pages(&pagelist);
+
 	up_write(&mm->mmap_sem);
 	mpol_free(new);
 	return err;
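One consequence of the final hunk worth noting: when MPOL_MF_STRICT is combined with a move flag, do_mbind() no longer fails merely because nonconforming pages exist; it first tries to migrate them and returns -EIO only if some pages are left on the pagelist afterwards. A small sketch (illustrative, not from the patch; mbind() is libnuma's wrapper, MPOL_MF_* defined locally per this patch if absent) of how userspace might observe this:

#include <numaif.h>
#include <errno.h>
#include <stdio.h>

#ifndef MPOL_MF_MOVE
#define MPOL_MF_MOVE (1<<1)
#endif

/* Returns 0 if every page in the range now conforms to the policy. */
static int bind_strictly(void *addr, unsigned long len,
			 const unsigned long *mask, unsigned long maxnode)
{
	if (mbind(addr, len, MPOL_BIND, mask, maxnode,
		  MPOL_MF_STRICT | MPOL_MF_MOVE) == 0)
		return 0;
	if (errno == EIO)
		fprintf(stderr, "some pages could not be migrated\n");
	return -1;
}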
