Commit 9f141d6

jankara authored and djbw committed
dax: Call ->iomap_begin without entry lock during dax fault
Currently the ->iomap_begin() handler is called with the entry lock held. If the filesystem takes any locks between ->iomap_begin() and ->iomap_end() (such as ext4, which will want to keep a transaction open), this causes a lock inversion against iomap_apply() in the standard IO path, which first calls ->iomap_begin() and only then calls the ->actor() callback that grabs entry locks for DAX (if it faults while copying from/to user-provided buffers).

Fix the problem by nesting the grabbing of the entry lock inside the ->iomap_begin() - ->iomap_end() pair.

Reviewed-by: Ross Zwisler <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
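For readers unfamiliar with the lock inversion the message describes, here is a minimal userspace sketch, an illustration only and not kernel code: two pthread mutexes stand in for the filesystem lock taken inside ->iomap_begin() and the DAX mapping entry lock. The old fault path took the entry lock first; the read/write path takes the filesystem lock first, so the two could deadlock (ABBA). After this commit both paths take the filesystem lock first.

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for the real locks; names are illustrative only. */
static pthread_mutex_t fs_lock = PTHREAD_MUTEX_INITIALIZER;    /* held between ->iomap_begin()/->iomap_end() */
static pthread_mutex_t entry_lock = PTHREAD_MUTEX_INITIALIZER; /* DAX mapping entry lock */

/* Old fault path: entry lock first, then the fs lock inside ->iomap_begin(). */
static void old_fault_path(void)
{
	pthread_mutex_lock(&entry_lock);
	pthread_mutex_lock(&fs_lock);       /* ->iomap_begin() */
	/* ... install the PTE ... */
	pthread_mutex_unlock(&fs_lock);     /* ->iomap_end() */
	pthread_mutex_unlock(&entry_lock);
}

/*
 * Read/write path via iomap_apply(): fs lock first, entry lock only if the
 * ->actor() faults on a user buffer -- the opposite order, hence the
 * potential ABBA deadlock with old_fault_path().
 */
static void io_path(void)
{
	pthread_mutex_lock(&fs_lock);       /* ->iomap_begin() */
	pthread_mutex_lock(&entry_lock);    /* fault while copying to/from user memory */
	pthread_mutex_unlock(&entry_lock);
	pthread_mutex_unlock(&fs_lock);     /* ->iomap_end() */
}

/*
 * New fault path after this commit: the entry lock is nested inside the
 * ->iomap_begin()/->iomap_end() pair, matching io_path()'s order.
 */
static void new_fault_path(void)
{
	pthread_mutex_lock(&fs_lock);       /* ->iomap_begin() */
	pthread_mutex_lock(&entry_lock);    /* grab_mapping_entry() */
	/* ... install the PTE ... */
	pthread_mutex_unlock(&entry_lock);
	pthread_mutex_unlock(&fs_lock);     /* ->iomap_end() */
}

int main(void)
{
	io_path();
	new_fault_path();
	printf("both paths now take fs_lock before entry_lock\n");
	return 0;
}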
1 parent f449b93 commit 9f141d6

File tree

1 file changed: +66 −55 lines

fs/dax.c

Lines changed: 66 additions & 55 deletions
@@ -1078,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+	if (error == 0)
+		return VM_FAULT_NOPAGE;
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1110,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
@@ -1126,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 */
 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		return dax_fault_return(error);
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-		error = -EIO;	/* fs corruption? */
+		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
+		goto finish_iomap;
+	}
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+	if (IS_ERR(entry)) {
+		vmf_ret = dax_fault_return(PTR_ERR(entry));
 		goto finish_iomap;
 	}
 
@@ -1151,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto finish_iomap;
+			goto error_unlock_entry;
 
 		__SetPageUptodate(vmf->cow_page);
 		vmf_ret = finish_fault(vmf);
 		if (!vmf_ret)
 			vmf_ret = VM_FAULT_DONE_COW;
-		goto finish_iomap;
+		goto unlock_entry;
 	}
 
 	switch (iomap.type) {
@@ -1169,12 +1178,15 @@
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
 				PAGE_SIZE, &entry, vma, vmf);
+		/* -EBUSY is fine, somebody else faulted on the same PTE */
+		if (error == -EBUSY)
+			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
 			vmf_ret = dax_load_hole(mapping, &entry, vmf);
-			goto finish_iomap;
+			goto unlock_entry;
 		}
 		/*FALLTHRU*/
 	default:
@@ -1183,30 +1195,25 @@
 		break;
 	}
 
- finish_iomap:
-	if (ops->iomap_end) {
-		if (error || (vmf_ret & VM_FAULT_ERROR)) {
-			/* keep previous error */
-			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PAGE_SIZE,
-					PAGE_SIZE, flags, &iomap);
-		}
-	}
+ error_unlock_entry:
+	vmf_ret = dax_fault_return(error) | major;
  unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if (error < 0 && error != -EBUSY)
-		return VM_FAULT_SIGBUS | major;
-	if (vmf_ret) {
-		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-		return vmf_ret;
+ finish_iomap:
+	if (ops->iomap_end) {
+		int copied = PAGE_SIZE;
+
+		if (vmf_ret & VM_FAULT_ERROR)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PTE we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
-	return VM_FAULT_NOPAGE | major;
+	return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
@@ -1330,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
 		goto fallback;
 
-	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry. If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto fallback;
-
 	/*
 	 * Note that we don't use iomap_apply here. We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
@@ -1348,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		goto fallback;
+
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry. If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto finish_iomap;
+
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1364,7 +1372,7 @@
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto finish_iomap;
+			goto unlock_entry;
 		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
 				&entry);
 		break;
@@ -1373,20 +1381,23 @@
 		break;
 	}
 
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (result == VM_FAULT_FALLBACK) {
-			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-					iomap_flags, &iomap);
-			if (error)
-				result = VM_FAULT_FALLBACK;
-		}
+		int copied = PMD_SIZE;
+
+		if (result == VM_FAULT_FALLBACK)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PMD we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+				&iomap);
 	}
- unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, pmd, address);
