
Commit 4759d38

Merge branch 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull DAX updates from Dan Williams:
 "The completion of Jan's DAX work for 4.10.

  As I mentioned in the libnvdimm-for-4.10 pull request, these are some
  final fixes for the DAX dirty-cacheline-tracking invalidation work
  that was merged through the -mm, ext4, and xfs trees in -rc1. These
  patches were prepared prior to the merge window, but we waited for
  4.10-rc1 to have a stable merge base after all the prerequisites were
  merged.

  Quoting Jan on the overall changes in these patches:

    "So I'd like all these 6 patches to go for rc2. The first three
     patches fix invalidation of exceptional DAX entries (a bug which
     is there for a long time) - without these patches data loss can
     occur on power failure even though user called fsync(2). The
     other three patches change locking of DAX faults so that
     ->iomap_begin() is called in a more relaxed locking context and
     we are safe to start a transaction there for ext4"

  These have received a build success notification from the kbuild
  robot, and pass the latest libnvdimm unit tests. There have not been
  any -next releases since -rc1, so they have not appeared there"

* 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  ext4: Simplify DAX fault path
  dax: Call ->iomap_begin without entry lock during dax fault
  dax: Finish fault completely when loading holes
  dax: Avoid page invalidation races and unnecessary radix tree traversals
  mm: Invalidate DAX radix tree entries only if appropriate
  ext2: Return BH_New buffers for zeroed blocks
2 parents 238d1d0 + 1db1754 commit 4759d38
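For readers skimming the diff below: the rule introduced by the first three patches is that an exceptional DAX radix tree entry may be dropped by ordinary invalidation only when it carries no dirty or towrite tag, while truncation may always drop it. The following standalone C sketch models that decision in userspace; struct dax_entry_model and model_invalidate are illustrative stand-ins, not kernel APIs, and the real logic is __dax_invalidate_mapping_entry() in the fs/dax.c hunks below.

/*
 * Simplified userspace model of the invalidation rule from this series.
 * Illustrative names only; not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

struct dax_entry_model {
	bool exceptional;	/* models radix_tree_exceptional_entry() */
	bool dirty;		/* models PAGECACHE_TAG_DIRTY */
	bool towrite;		/* models PAGECACHE_TAG_TOWRITE */
};

/* Returns 1 if the entry may be dropped, 0 if it must be kept. */
static int model_invalidate(const struct dax_entry_model *e, bool trunc)
{
	if (!e->exceptional)
		return 0;		/* not a DAX entry at all */
	if (!trunc && (e->dirty || e->towrite))
		return 0;		/* keep it: writeback/fsync still needs it */
	return 1;			/* clean (or truncating): safe to delete */
}

int main(void)
{
	struct dax_entry_model dirty = { true, true, false };
	struct dax_entry_model clean = { true, false, false };

	printf("invalidate dirty entry: %d\n", model_invalidate(&dirty, false));	/* 0 */
	printf("truncate dirty entry:   %d\n", model_invalidate(&dirty, true));	/* 1 */
	printf("invalidate clean entry: %d\n", model_invalidate(&clean, false));	/* 1 */
	return 0;
}

Keeping dirty entries around is what preserves the writeback state that fsync(2) relies on; silently dropping them is the data-loss-on-power-failure bug the pull request describes.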

File tree: 5 files changed, +229 −143 lines


fs/dax.c

Lines changed: 154 additions & 89 deletions
@@ -451,33 +451,84 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 	__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
+static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+					  pgoff_t index, bool trunc)
+{
+	int ret = 0;
+	void *entry;
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	if (!entry || !radix_tree_exceptional_entry(entry))
+		goto out;
+	if (!trunc &&
+	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
+		goto out;
+	radix_tree_delete(page_tree, index);
+	mapping->nrexceptional--;
+	ret = 1;
+out:
+	put_unlocked_mapping_entry(mapping, index, entry);
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
 /*
  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  * entry to get unlocked before deleting it.
  */
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
-	void *entry;
+	int ret = __dax_invalidate_mapping_entry(mapping, index, true);
 
-	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
 	/*
 	 * This gets called from truncate / punch_hole path. As such, the caller
 	 * must hold locks protecting against concurrent modifications of the
 	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
 	 * caller has seen exceptional entry for this index, we better find it
 	 * at that index as well...
 	 */
-	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
-		spin_unlock_irq(&mapping->tree_lock);
-		return 0;
-	}
-	radix_tree_delete(&mapping->page_tree, index);
+	WARN_ON_ONCE(!ret);
+	return ret;
+}
+
+/*
+ * Invalidate exceptional DAX entry if easily possible. This handles DAX
+ * entries for invalidate_inode_pages() so we evict the entry only if we can
+ * do so without blocking.
+ */
+int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	int ret = 0;
+	void *entry, **slot;
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
+	if (!entry || !radix_tree_exceptional_entry(entry) ||
+	    slot_locked(mapping, slot))
+		goto out;
+	if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+	    radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+		goto out;
+	radix_tree_delete(page_tree, index);
 	mapping->nrexceptional--;
+	ret = 1;
+out:
 	spin_unlock_irq(&mapping->tree_lock);
-	dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+	if (ret)
+		dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+	return ret;
+}
 
-	return 1;
+/*
+ * Invalidate exceptional DAX entry if it is clean.
+ */
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+				      pgoff_t index)
+{
+	return __dax_invalidate_mapping_entry(mapping, index, false);
 }
 
 /*
@@ -488,24 +539,34 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  * otherwise it will simply fall out of the page cache under memory
  * pressure without ever having been dirtied.
  */
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static int dax_load_hole(struct address_space *mapping, void **entry,
 			 struct vm_fault *vmf)
 {
 	struct page *page;
+	int ret;
 
 	/* Hole page already exists? Return it... */
-	if (!radix_tree_exceptional_entry(entry)) {
-		vmf->page = entry;
-		return VM_FAULT_LOCKED;
+	if (!radix_tree_exceptional_entry(*entry)) {
+		page = *entry;
+		goto out;
 	}
 
 	/* This will replace locked radix tree entry with a hole page */
 	page = find_or_create_page(mapping, vmf->pgoff,
 				   vmf->gfp_mask | __GFP_ZERO);
 	if (!page)
 		return VM_FAULT_OOM;
+out:
 	vmf->page = page;
-	return VM_FAULT_LOCKED;
+	ret = finish_fault(vmf);
+	vmf->page = NULL;
+	*entry = page;
+	if (!ret) {
+		/* Grab reference for PTE that is now referencing the page */
+		get_page(page);
+		return VM_FAULT_NOPAGE;
+	}
+	return ret;
 }
 
 static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
@@ -934,6 +995,17 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
 		return -EIO;
 
+	/*
+	 * Write can allocate block for an area which has a hole page mapped
+	 * into page tables. We have to tear down these mappings so that data
+	 * written by write(2) is visible in mmap.
+	 */
+	if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+		invalidate_inode_pages2_range(inode->i_mapping,
+					      pos >> PAGE_SHIFT,
+					      (end - 1) >> PAGE_SHIFT);
+	}
+
 	while (pos < end) {
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		struct blk_dax_ctl dax = { 0 };
@@ -992,23 +1064,6 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iov_iter_rw(iter) == WRITE)
 		flags |= IOMAP_WRITE;
 
-	/*
-	 * Yes, even DAX files can have page cache attached to them: A zeroed
-	 * page is inserted into the pagecache when we have to serve a write
-	 * fault on a hole. It should never be dirtied and can simply be
-	 * dropped from the pagecache once we get real data for the page.
-	 *
-	 * XXX: This is racy against mmap, and there's nothing we can do about
-	 * it. We'll eventually need to shift this down even further so that
-	 * we can check if we allocated blocks over a hole first.
-	 */
-	if (mapping->nrpages) {
-		ret = invalidate_inode_pages2_range(mapping,
-				pos >> PAGE_SHIFT,
-				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-	}
-
 	while (iov_iter_count(iter)) {
 		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
 				iter, dax_iomap_actor);
@@ -1023,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+	if (error == 0)
+		return VM_FAULT_NOPAGE;
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1055,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
@@ -1071,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 */
 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		return dax_fault_return(error);
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-		error = -EIO;	/* fs corruption? */
+		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
+		goto finish_iomap;
+	}
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+	if (IS_ERR(entry)) {
+		vmf_ret = dax_fault_return(PTR_ERR(entry));
 		goto finish_iomap;
 	}
 
@@ -1096,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto finish_iomap;
+			goto error_unlock_entry;
 
 		__SetPageUptodate(vmf->cow_page);
 		vmf_ret = finish_fault(vmf);
 		if (!vmf_ret)
 			vmf_ret = VM_FAULT_DONE_COW;
-		goto finish_iomap;
+		goto unlock_entry;
 	}
 
 	switch (iomap.type) {
@@ -1114,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
 				PAGE_SIZE, &entry, vma, vmf);
+		/* -EBUSY is fine, somebody else faulted on the same PTE */
+		if (error == -EBUSY)
+			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-			vmf_ret = dax_load_hole(mapping, entry, vmf);
-			break;
+			vmf_ret = dax_load_hole(mapping, &entry, vmf);
+			goto unlock_entry;
 		}
 		/*FALLTHRU*/
 	default:
@@ -1128,31 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	}
 
+error_unlock_entry:
+	vmf_ret = dax_fault_return(error) | major;
+unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 finish_iomap:
 	if (ops->iomap_end) {
-		if (error || (vmf_ret & VM_FAULT_ERROR)) {
-			/* keep previous error */
-			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PAGE_SIZE,
-					PAGE_SIZE, flags, &iomap);
-		}
-	}
-unlock_entry:
-	if (vmf_ret != VM_FAULT_LOCKED || error)
-		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
-out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if (error < 0 && error != -EBUSY)
-		return VM_FAULT_SIGBUS | major;
-	if (vmf_ret) {
-		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-		return vmf_ret;
+		int copied = PAGE_SIZE;
+
+		if (vmf_ret & VM_FAULT_ERROR)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PTE we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
-	return VM_FAULT_NOPAGE | major;
+	return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
@@ -1276,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
 		goto fallback;
 
-	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry. If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto fallback;
-
 	/*
 	 * Note that we don't use iomap_apply here. We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
@@ -1294,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		goto fallback;
+
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry. If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto finish_iomap;
+
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1310,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto finish_iomap;
+			goto unlock_entry;
 		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
 				&entry);
 		break;
@@ -1319,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		break;
 	}
 
+unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
 finish_iomap:
 	if (ops->iomap_end) {
-		if (result == VM_FAULT_FALLBACK) {
-			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-					iomap_flags, &iomap);
-			if (error)
-				result = VM_FAULT_FALLBACK;
-		}
+		int copied = PMD_SIZE;
+
+		if (result == VM_FAULT_FALLBACK)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PMD we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+				&iomap);
 	}
-unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
 fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, pmd, address);

0 commit comments
