@@ -451,33 +451,84 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
         __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
+static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+                                          pgoff_t index, bool trunc)
+{
+        int ret = 0;
+        void *entry;
+        struct radix_tree_root *page_tree = &mapping->page_tree;
+
+        spin_lock_irq(&mapping->tree_lock);
+        entry = get_unlocked_mapping_entry(mapping, index, NULL);
+        if (!entry || !radix_tree_exceptional_entry(entry))
+                goto out;
+        if (!trunc &&
+            (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+             radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
+                goto out;
+        radix_tree_delete(page_tree, index);
+        mapping->nrexceptional--;
+        ret = 1;
+ out:
+        put_unlocked_mapping_entry(mapping, index, entry);
+        spin_unlock_irq(&mapping->tree_lock);
+        return ret;
+}
 /*
  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  * entry to get unlocked before deleting it.
  */
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
-        void *entry;
+        int ret = __dax_invalidate_mapping_entry(mapping, index, true);
 
-        spin_lock_irq(&mapping->tree_lock);
-        entry = get_unlocked_mapping_entry(mapping, index, NULL);
         /*
          * This gets called from truncate / punch_hole path. As such, the caller
          * must hold locks protecting against concurrent modifications of the
          * radix tree (usually fs-private i_mmap_sem for writing). Since the
          * caller has seen exceptional entry for this index, we better find it
          * at that index as well...
          */
-        if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
-                spin_unlock_irq(&mapping->tree_lock);
-                return 0;
-        }
-        radix_tree_delete(&mapping->page_tree, index);
+        WARN_ON_ONCE(!ret);
+        return ret;
+}
+
+/*
+ * Invalidate exceptional DAX entry if easily possible. This handles DAX
+ * entries for invalidate_inode_pages() so we evict the entry only if we can
+ * do so without blocking.
+ */
+int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+        int ret = 0;
+        void *entry, **slot;
+        struct radix_tree_root *page_tree = &mapping->page_tree;
+
+        spin_lock_irq(&mapping->tree_lock);
+        entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
+        if (!entry || !radix_tree_exceptional_entry(entry) ||
+            slot_locked(mapping, slot))
+                goto out;
+        if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+            radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+                goto out;
+        radix_tree_delete(page_tree, index);
         mapping->nrexceptional--;
+        ret = 1;
+ out:
         spin_unlock_irq(&mapping->tree_lock);
-        dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+        if (ret)
+                dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+        return ret;
+}
 
-        return 1;
+/*
+ * Invalidate exceptional DAX entry if it is clean.
+ */
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+                                      pgoff_t index)
+{
+        return __dax_invalidate_mapping_entry(mapping, index, false);
 }
 
 /*
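For orientation, the sketch below is not part of the patch: it shows how a caller might choose between the three helpers introduced above. The wrapper name example_evict_dax_entry() and the <linux/dax.h> header location are assumptions; only the dax_*_mapping_entry() calls and their documented semantics come from the hunk itself.

/* Illustrative sketch only -- helper name and call site are hypothetical. */
#include <linux/dax.h>          /* assumed location of the declarations */
#include <linux/fs.h>

static void example_evict_dax_entry(struct address_space *mapping,
                                    pgoff_t index, bool truncating)
{
        if (truncating) {
                /* Truncate/punch-hole: the entry must go; may wait for the
                 * entry lock, and the caller holds the fs-private locks. */
                WARN_ON_ONCE(!dax_delete_mapping_entry(mapping, index));
        } else {
                /*
                 * invalidate_inode_pages2()-style eviction: returns 1 only
                 * when the entry is clean; dirty or towrite entries are
                 * left in place and the caller must cope with that.
                 */
                if (!dax_invalidate_mapping_entry_sync(mapping, index))
                        return;         /* entry kept */
        }
}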
@@ -488,24 +539,34 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  * otherwise it will simply fall out of the page cache under memory
  * pressure without ever having been dirtied.
  */
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static int dax_load_hole(struct address_space *mapping, void **entry,
                          struct vm_fault *vmf)
 {
         struct page *page;
+        int ret;
 
         /* Hole page already exists? Return it... */
-        if (!radix_tree_exceptional_entry(entry)) {
-                vmf->page = entry;
-                return VM_FAULT_LOCKED;
+        if (!radix_tree_exceptional_entry(*entry)) {
+                page = *entry;
+                goto out;
         }
 
         /* This will replace locked radix tree entry with a hole page */
         page = find_or_create_page(mapping, vmf->pgoff,
                                    vmf->gfp_mask | __GFP_ZERO);
         if (!page)
                 return VM_FAULT_OOM;
+ out:
         vmf->page = page;
-        return VM_FAULT_LOCKED;
+        ret = finish_fault(vmf);
+        vmf->page = NULL;
+        *entry = page;
+        if (!ret) {
+                /* Grab reference for PTE that is now referencing the page */
+                get_page(page);
+                return VM_FAULT_NOPAGE;
+        }
+        return ret;
 }
 
 static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
@@ -934,6 +995,17 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
         if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
                 return -EIO;
 
+        /*
+         * Write can allocate block for an area which has a hole page mapped
+         * into page tables. We have to tear down these mappings so that data
+         * written by write(2) is visible in mmap.
+         */
+        if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+                invalidate_inode_pages2_range(inode->i_mapping,
+                                              pos >> PAGE_SHIFT,
+                                              (end - 1) >> PAGE_SHIFT);
+        }
+
         while (pos < end) {
                 unsigned offset = pos & (PAGE_SIZE - 1);
                 struct blk_dax_ctl dax = { 0 };
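The invalidation range in the hunk above is computed from byte offsets. A tiny stand-alone sketch (hypothetical offsets, assuming 4K pages) shows which page indices the call ends up covering, including the partial pages at both ends:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assuming 4K pages for the example */

int main(void)
{
        /* hypothetical write: 10000 bytes starting at file offset 5000 */
        long long pos = 5000, end = pos + 10000;

        /* same index math as the invalidate_inode_pages2_range() call above */
        printf("invalidate page indices %lld..%lld\n",
               pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT);
        return 0;       /* prints: invalidate page indices 1..3 */
}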
@@ -992,23 +1064,6 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
         if (iov_iter_rw(iter) == WRITE)
                 flags |= IOMAP_WRITE;
 
-        /*
-         * Yes, even DAX files can have page cache attached to them: A zeroed
-         * page is inserted into the pagecache when we have to serve a write
-         * fault on a hole. It should never be dirtied and can simply be
-         * dropped from the pagecache once we get real data for the page.
-         *
-         * XXX: This is racy against mmap, and there's nothing we can do about
-         * it. We'll eventually need to shift this down even further so that
-         * we can check if we allocated blocks over a hole first.
-         */
-        if (mapping->nrpages) {
-                ret = invalidate_inode_pages2_range(mapping,
-                                pos >> PAGE_SHIFT,
-                                (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
-                WARN_ON_ONCE(ret);
-        }
-
         while (iov_iter_count(iter)) {
                 ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
                                 iter, dax_iomap_actor);
@@ -1023,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+        if (error == 0)
+                return VM_FAULT_NOPAGE;
+        if (error == -ENOMEM)
+                return VM_FAULT_OOM;
+        return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
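dax_fault_return() simply folds an errno into a fault status so the fault paths below can bail out with a single call. A stand-alone sketch of the same mapping, with the VM_FAULT_* values stubbed for the demo (the real definitions live in the kernel's <linux/mm.h>):

#include <stdio.h>
#include <errno.h>

/* Stub values just for this demo; only the mapping logic mirrors the patch. */
#define VM_FAULT_NOPAGE 0x0100
#define VM_FAULT_OOM    0x0001
#define VM_FAULT_SIGBUS 0x0002

static int demo_fault_return(int error)
{
        if (error == 0)
                return VM_FAULT_NOPAGE;
        if (error == -ENOMEM)
                return VM_FAULT_OOM;
        return VM_FAULT_SIGBUS;         /* any other errno, e.g. -EIO */
}

int main(void)
{
        printf("%#x %#x %#x\n", demo_fault_return(0),
               demo_fault_return(-ENOMEM), demo_fault_return(-EIO));
        return 0;
}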
@@ -1055,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
         if (pos >= i_size_read(inode))
                 return VM_FAULT_SIGBUS;
 
-        entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-        if (IS_ERR(entry)) {
-                error = PTR_ERR(entry);
-                goto out;
-        }
-
         if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
                 flags |= IOMAP_WRITE;
 
@@ -1071,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
          */
         error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
         if (error)
-                goto unlock_entry;
+                return dax_fault_return(error);
         if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-                error = -EIO;           /* fs corruption? */
+                vmf_ret = dax_fault_return(-EIO);       /* fs corruption? */
+                goto finish_iomap;
+        }
+
+        entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+        if (IS_ERR(entry)) {
+                vmf_ret = dax_fault_return(PTR_ERR(entry));
                 goto finish_iomap;
         }
 
@@ -1096,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                 }
 
                 if (error)
-                        goto finish_iomap;
+                        goto error_unlock_entry;
 
                 __SetPageUptodate(vmf->cow_page);
                 vmf_ret = finish_fault(vmf);
                 if (!vmf_ret)
                         vmf_ret = VM_FAULT_DONE_COW;
-                goto finish_iomap;
+                goto unlock_entry;
         }
 
         switch (iomap.type) {
@@ -1114,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                 }
                 error = dax_insert_mapping(mapping, iomap.bdev, sector,
                                 PAGE_SIZE, &entry, vma, vmf);
+                /* -EBUSY is fine, somebody else faulted on the same PTE */
+                if (error == -EBUSY)
+                        error = 0;
                 break;
         case IOMAP_UNWRITTEN:
         case IOMAP_HOLE:
                 if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-                        vmf_ret = dax_load_hole(mapping, entry, vmf);
-                        break;
+                        vmf_ret = dax_load_hole(mapping, &entry, vmf);
+                        goto unlock_entry;
                 }
                 /*FALLTHRU*/
         default:
@@ -1128,31 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                 break;
         }
 
+ error_unlock_entry:
+        vmf_ret = dax_fault_return(error) | major;
+ unlock_entry:
+        put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  finish_iomap:
         if (ops->iomap_end) {
-                if (error || (vmf_ret & VM_FAULT_ERROR)) {
-                        /* keep previous error */
-                        ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-                                        &iomap);
-                } else {
-                        error = ops->iomap_end(inode, pos, PAGE_SIZE,
-                                        PAGE_SIZE, flags, &iomap);
-                }
-        }
- unlock_entry:
-        if (vmf_ret != VM_FAULT_LOCKED || error)
-                put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-        if (error == -ENOMEM)
-                return VM_FAULT_OOM | major;
-        /* -EBUSY is fine, somebody else faulted on the same PTE */
-        if (error < 0 && error != -EBUSY)
-                return VM_FAULT_SIGBUS | major;
-        if (vmf_ret) {
-                WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-                return vmf_ret;
+                int copied = PAGE_SIZE;
+
+                if (vmf_ret & VM_FAULT_ERROR)
+                        copied = 0;
+                /*
+                 * The fault is done by now and there's no way back (other
+                 * thread may be already happily using PTE we have installed).
+                 * Just ignore error from ->iomap_end since we cannot do much
+                 * with it.
+                 */
+                ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
         }
-        return VM_FAULT_NOPAGE | major;
+        return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
@@ -1276,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
         if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
                 goto fallback;
 
-        /*
-         * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-         * PMD or a HZP entry. If it can't (because a 4k page is already in
-         * the tree, for instance), it will return -EEXIST and we just fall
-         * back to 4k entries.
-         */
-        entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-        if (IS_ERR(entry))
-                goto fallback;
-
         /*
          * Note that we don't use iomap_apply here. We aren't doing I/O, only
          * setting up a mapping, so really we're using iomap_begin() as a way
@@ -1294,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
         pos = (loff_t)pgoff << PAGE_SHIFT;
         error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
         if (error)
-                goto unlock_entry;
+                goto fallback;
+
         if (iomap.offset + iomap.length < pos + PMD_SIZE)
                 goto finish_iomap;
 
+        /*
+         * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+         * PMD or a HZP entry. If it can't (because a 4k page is already in
+         * the tree, for instance), it will return -EEXIST and we just fall
+         * back to 4k entries.
+         */
+        entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+        if (IS_ERR(entry))
+                goto finish_iomap;
+
         vmf.pgoff = pgoff;
         vmf.flags = flags;
         vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1310,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
         case IOMAP_UNWRITTEN:
         case IOMAP_HOLE:
                 if (WARN_ON_ONCE(write))
-                        goto finish_iomap;
+                        goto unlock_entry;
                 result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
                                            &entry);
                 break;
@@ -1319,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                 break;
         }
 
+ unlock_entry:
+        put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
         if (ops->iomap_end) {
-                if (result == VM_FAULT_FALLBACK) {
-                        ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-                                        &iomap);
-                } else {
-                        error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-                                        iomap_flags, &iomap);
-                        if (error)
-                                result = VM_FAULT_FALLBACK;
-                }
+                int copied = PMD_SIZE;
+
+                if (result == VM_FAULT_FALLBACK)
+                        copied = 0;
+                /*
+                 * The fault is done by now and there's no way back (other
+                 * thread may be already happily using PMD we have installed).
+                 * Just ignore error from ->iomap_end since we cannot do much
+                 * with it.
+                 */
+                ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+                                &iomap);
         }
- unlock_entry:
-        put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
         if (result == VM_FAULT_FALLBACK) {
                 split_huge_pmd(vma, pmd, address);