@@ -83,9 +83,14 @@
 #include <linux/init.h>
 #include <linux/compat.h>
 #include <linux/mempolicy.h>
+#include <linux/swap.h>
+
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
+/* Internal MPOL_MF_xxx flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -174,9 +179,59 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
+/* Check if we are the only process mapping the page in question */
+static inline int single_mm_mapping(struct mm_struct *mm,
+		struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int rc = 1;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+out:
+	spin_unlock(&mapping->i_mmap_lock);
+	return rc;
+}
+
+/*
+ * Add a page to be migrated to the pagelist
+ */
+static void migrate_page_add(struct vm_area_struct *vma,
+	struct page *page, struct list_head *pagelist, unsigned long flags)
+{
+	/*
+	 * Avoid migrating a page that is shared by others and not writable.
+	 */
+	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
+			mapping_writably_mapped(page->mapping) ||
+			single_mm_mapping(vma->vm_mm, page->mapping)) {
+		int rc = isolate_lru_page(page);
+
+		if (rc == 1)
+			list_add(&page->lru, pagelist);
+		/*
+		 * If the isolate attempt was not successful then we just
+		 * encountered an unswappable page. Something must be wrong.
+		 */
+		WARN_ON(rc == 0);
+	}
+}
+
 /* Ensure all existing pages follow the policy. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
@@ -193,15 +248,21 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!page)
 			continue;
 		nid = page_to_nid(page);
-		if (!node_isset(nid, *nodes))
-			break;
+		if (!node_isset(nid, *nodes)) {
+			if (pagelist)
+				migrate_page_add(vma, page, pagelist, flags);
+			else
+				break;
+		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
 	return addr != end;
 }
 
 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -211,14 +272,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -228,14 +292,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -245,16 +312,31 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
-/* Step 1: check the range */
+/* Check if a vma is migratable */
+static inline int vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (
+		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
+		return 0;
+	return 1;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
+ */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-		nodemask_t *nodes, unsigned long flags)
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -264,17 +346,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return ERR_PTR(-EFAULT);
-		if (prev && prev->vm_end < vma->vm_start)
-			return ERR_PTR(-EFAULT);
-		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+			if (!vma->vm_next && vma->vm_end < end)
+				return ERR_PTR(-EFAULT);
+			if (prev && prev->vm_end < vma->vm_start)
+				return ERR_PTR(-EFAULT);
+		}
+		if (!is_vm_hugetlb_page(vma) &&
+		    ((flags & MPOL_MF_STRICT) ||
+		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+				vma_migratable(vma)))) {
 			unsigned long endvma = vma->vm_end;
+
 			if (endvma > end)
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma, start, endvma, nodes);
+			err = check_pgd_range(vma, start, endvma, nodes,
+						flags, pagelist);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
@@ -348,33 +437,59 @@ long do_mbind(unsigned long start, unsigned long len,
 	struct mempolicy *new;
 	unsigned long end;
 	int err;
+	LIST_HEAD(pagelist);
 
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+	    || mode > MPOL_MAX)
 		return -EINVAL;
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
+
 	if (mode == MPOL_DEFAULT)
 		flags &= ~MPOL_MF_STRICT;
+
 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 	end = start + len;
+
 	if (end < start)
 		return -EINVAL;
 	if (end == start)
 		return 0;
+
 	if (mpol_check_policy(mode, nmask))
 		return -EINVAL;
+
 	new = mpol_new(mode, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	/*
+	 * If we are using the default policy then operation
+	 * on discontinuous address spaces is okay after all
+	 */
+	if (!new)
+		flags |= MPOL_MF_DISCONTIG_OK;
+
 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 			mode,nodes_addr(nodes)[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nmask, flags);
+	vma = check_range(mm, start, end, nmask, flags,
+	      (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
 	err = PTR_ERR(vma);
-	if (!IS_ERR(vma))
+	if (!IS_ERR(vma)) {
 		err = mbind_range(vma, start, end, new);
+		if (!list_empty(&pagelist))
+			migrate_pages(&pagelist, NULL);
+		if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT))
+			err = -EIO;
+	}
+	if (!list_empty(&pagelist))
+		putback_lru_pages(&pagelist);
+
 	up_write(&mm->mmap_sem);
 	mpol_free(new);
 	return err;
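
For context, here is a minimal userspace sketch (not part of this patch) of how the MPOL_MF_MOVE flag that do_mbind() now accepts can be exercised through mbind(2). It assumes libnuma's <numaif.h> (link with -lnuma), a NUMA-enabled kernel carrying this change, and uses node 0 purely for illustration.

/*
 * Hypothetical example, not from the commit: bind an anonymous mapping
 * to node 0 and ask the kernel to migrate pages already faulted in
 * elsewhere (MPOL_MF_MOVE). MPOL_MF_MOVE_ALL, which also moves pages
 * shared with other processes, would require CAP_SYS_RESOURCE per the
 * check added in do_mbind().
 */
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL * 1024 * 1024;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0, len);	/* fault the pages in, possibly off node 0 */

	unsigned long nodemask = 1UL << 0;	/* allow node 0 only */
	if (mbind(buf, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
		perror("mbind");	/* with MPOL_MF_STRICT, EIO can indicate pages that could not be moved */

	munmap(buf, len);
	return 0;
}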