Skip to content

Commit 8b62b72

Browse files
committed
Btrfs: Use PagePrivate2 to track pages in the data=ordered code.
Btrfs writes go through delalloc to the data=ordered code. This makes sure that all of the data is on disk before the metadata that references it. The tracking means that we have to make sure each page in an extent is fully written before we add that extent into the on-disk btree. This was done in the past by setting the EXTENT_ORDERED bit for the range of an extent when it was added to the data=ordered code, and then clearing the EXTENT_ORDERED bit in the extent state tree as each page finished IO. One of the reasons we had to do this was because sometimes pages are magically dirtied without page_mkwrite being called. The EXTENT_ORDERED bit is checked at writepage time, and if it isn't there, our page became dirty without going through the proper path. These bit operations make for a number of rbtree searches for each page, and can cause considerable lock contention. This commit switches from the EXTENT_ORDERED bit to use PagePrivate2. As pages go into the ordered code, PagePrivate2 is set on each one. This is a cheap operation because we already have all the pages locked and ready to go. As IO finishes, the PagePrivate2 bit is cleared and the ordered accounting is updated for each page. At writepage time, if the PagePrivate2 bit is missing, we go into the writepage fixup code to handle improperly dirtied pages. Signed-off-by: Chris Mason <[email protected]>
1 parent 9655d29 commit 8b62b72

File tree

5 files changed

+62
-55
lines changed

5 files changed

+62
-55
lines changed

fs/btrfs/extent_io.c

Lines changed: 10 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -885,13 +885,6 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
885885
NULL, mask);
886886
}
887887

888-
int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
889-
gfp_t mask)
890-
{
891-
return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, NULL,
892-
mask);
893-
}
894-
895888
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
896889
int bits, gfp_t mask)
897890
{
@@ -921,13 +914,6 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
921914
NULL, mask);
922915
}
923916

924-
int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
925-
gfp_t mask)
926-
{
927-
return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0,
928-
NULL, mask);
929-
}
930-
931917
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
932918
gfp_t mask)
933919
{
@@ -1373,7 +1359,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
13731359
int clear_unlock,
13741360
int clear_delalloc, int clear_dirty,
13751361
int set_writeback,
1376-
int end_writeback)
1362+
int end_writeback,
1363+
int set_private2)
13771364
{
13781365
int ret;
13791366
struct page *pages[16];
@@ -1392,14 +1379,19 @@ int extent_clear_unlock_delalloc(struct inode *inode,
13921379
clear_bits |= EXTENT_DELALLOC;
13931380

13941381
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1395-
if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1382+
if (!(unlock_pages || clear_dirty || set_writeback || end_writeback ||
1383+
set_private2))
13961384
return 0;
13971385

13981386
while (nr_pages > 0) {
13991387
ret = find_get_pages_contig(inode->i_mapping, index,
14001388
min_t(unsigned long,
14011389
nr_pages, ARRAY_SIZE(pages)), pages);
14021390
for (i = 0; i < ret; i++) {
1391+
1392+
if (set_private2)
1393+
SetPagePrivate2(pages[i]);
1394+
14031395
if (pages[i] == locked_page) {
14041396
page_cache_release(pages[i]);
14051397
continue;
@@ -2792,7 +2784,7 @@ int try_release_extent_state(struct extent_map_tree *map,
27922784
int ret = 1;
27932785

27942786
if (test_range_bit(tree, start, end,
2795-
EXTENT_IOBITS | EXTENT_ORDERED, 0, NULL))
2787+
EXTENT_IOBITS, 0, NULL))
27962788
ret = 0;
27972789
else {
27982790
if ((mask & GFP_NOFS) == GFP_NOFS)
@@ -2835,8 +2827,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
28352827
}
28362828
if (!test_range_bit(tree, em->start,
28372829
extent_map_end(em) - 1,
2838-
EXTENT_LOCKED | EXTENT_WRITEBACK |
2839-
EXTENT_ORDERED,
2830+
EXTENT_LOCKED | EXTENT_WRITEBACK,
28402831
0, NULL)) {
28412832
remove_extent_mapping(map, em);
28422833
/* once for the rb tree */

fs/btrfs/extent_io.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,8 @@
1313
#define EXTENT_DEFRAG (1 << 6)
1414
#define EXTENT_DEFRAG_DONE (1 << 7)
1515
#define EXTENT_BUFFER_FILLED (1 << 8)
16-
#define EXTENT_ORDERED (1 << 9)
17-
#define EXTENT_ORDERED_METADATA (1 << 10)
18-
#define EXTENT_BOUNDARY (1 << 11)
19-
#define EXTENT_NODATASUM (1 << 12)
16+
#define EXTENT_BOUNDARY (1 << 9)
17+
#define EXTENT_NODATASUM (1 << 10)
2018
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
2119

2220
/* flags for bio submission */
@@ -285,5 +283,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
285283
int clear_unlock,
286284
int clear_delalloc, int clear_dirty,
287285
int set_writeback,
288-
int end_writeback);
286+
int end_writeback,
287+
int set_private2);
289288
#endif

fs/btrfs/inode.c

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@ static noinline int compress_file_range(struct inode *inode,
426426
extent_clear_unlock_delalloc(inode,
427427
&BTRFS_I(inode)->io_tree,
428428
start, end, NULL, 1, 0,
429-
0, 1, 1, 1);
429+
0, 1, 1, 1, 0);
430430
ret = 0;
431431
goto free_pages_out;
432432
}
@@ -641,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
641641
async_extent->start,
642642
async_extent->start +
643643
async_extent->ram_size - 1,
644-
NULL, 1, 1, 0, 1, 1, 0);
644+
NULL, 1, 1, 0, 1, 1, 0, 0);
645645

646646
ret = btrfs_submit_compressed_write(inode,
647647
async_extent->start,
@@ -714,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode,
714714
extent_clear_unlock_delalloc(inode,
715715
&BTRFS_I(inode)->io_tree,
716716
start, end, NULL, 1, 1,
717-
1, 1, 1, 1);
717+
1, 1, 1, 1, 0);
718718
*nr_written = *nr_written +
719719
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
720720
*page_started = 1;
@@ -777,11 +777,14 @@ static noinline int cow_file_range(struct inode *inode,
777777
/* we're not doing compressed IO, don't unlock the first
778778
* page (which the caller expects to stay locked), don't
779779
* clear any dirty bits and don't set any writeback bits
780+
*
781+
* Do set the Private2 bit so we know this page was properly
782+
* setup for writepage
780783
*/
781784
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
782785
start, start + ram_size - 1,
783786
locked_page, unlock, 1,
784-
1, 0, 0, 0);
787+
1, 0, 0, 0, 1);
785788
disk_num_bytes -= cur_alloc_size;
786789
num_bytes -= cur_alloc_size;
787790
alloc_hint = ins.objectid + ins.offset;
@@ -1102,7 +1105,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
11021105

11031106
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
11041107
cur_offset, cur_offset + num_bytes - 1,
1105-
locked_page, 1, 1, 1, 0, 0, 0);
1108+
locked_page, 1, 1, 1, 0, 0, 0, 1);
11061109
cur_offset = extent_end;
11071110
if (cur_offset > end)
11081111
break;
@@ -1375,10 +1378,8 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
13751378
lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
13761379

13771380
/* already ordered? We're done */
1378-
if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
1379-
EXTENT_ORDERED, 0, NULL)) {
1381+
if (PagePrivate2(page))
13801382
goto out;
1381-
}
13821383

13831384
ordered = btrfs_lookup_ordered_extent(inode, page_start);
13841385
if (ordered) {
@@ -1414,11 +1415,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
14141415
struct inode *inode = page->mapping->host;
14151416
struct btrfs_writepage_fixup *fixup;
14161417
struct btrfs_root *root = BTRFS_I(inode)->root;
1417-
int ret;
14181418

1419-
ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1420-
EXTENT_ORDERED, 0, NULL);
1421-
if (ret)
1419+
/* this page is properly in the ordered list */
1420+
if (TestClearPagePrivate2(page))
14221421
return 0;
14231422

14241423
if (PageChecked(page))
@@ -1624,6 +1623,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
16241623
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
16251624
struct extent_state *state, int uptodate)
16261625
{
1626+
ClearPagePrivate2(page);
16271627
return btrfs_finish_ordered_io(page->mapping->host, start, end);
16281628
}
16291629

@@ -4403,13 +4403,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
44034403
u64 page_start = page_offset(page);
44044404
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
44054405

4406+
4407+
/*
4408+
* we have the page locked, so new writeback can't start,
4409+
* and the dirty bit won't be cleared while we are here.
4410+
*
4411+
* Wait for IO on this page so that we can safely clear
4412+
* the PagePrivate2 bit and do ordered accounting
4413+
*/
44064414
wait_on_page_writeback(page);
4415+
44074416
tree = &BTRFS_I(page->mapping->host)->io_tree;
44084417
if (offset) {
44094418
btrfs_releasepage(page, GFP_NOFS);
44104419
return;
44114420
}
4412-
44134421
lock_extent(tree, page_start, page_end, GFP_NOFS);
44144422
ordered = btrfs_lookup_ordered_extent(page->mapping->host,
44154423
page_offset(page));
@@ -4421,14 +4429,19 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
44214429
clear_extent_bit(tree, page_start, page_end,
44224430
EXTENT_DIRTY | EXTENT_DELALLOC |
44234431
EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
4424-
btrfs_finish_ordered_io(page->mapping->host,
4425-
page_start, page_end);
4432+
/*
4433+
* whoever cleared the private bit is responsible
4434+
* for the finish_ordered_io
4435+
*/
4436+
if (TestClearPagePrivate2(page)) {
4437+
btrfs_finish_ordered_io(page->mapping->host,
4438+
page_start, page_end);
4439+
}
44264440
btrfs_put_ordered_extent(ordered);
44274441
lock_extent(tree, page_start, page_end, GFP_NOFS);
44284442
}
44294443
clear_extent_bit(tree, page_start, page_end,
4430-
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4431-
EXTENT_ORDERED,
4444+
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
44324445
1, 1, NULL, GFP_NOFS);
44334446
__btrfs_releasepage(page, GFP_NOFS);
44344447

fs/btrfs/ordered-data.c

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
159159
*
160160
* len is the length of the extent
161161
*
162-
* This also sets the EXTENT_ORDERED bit on the range in the inode.
163-
*
164162
* The tree is given a single reference on the ordered extent that was
165163
* inserted.
166164
*/
@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
181179
entry->start = start;
182180
entry->len = len;
183181
entry->disk_len = disk_len;
182+
entry->bytes_left = len;
184183
entry->inode = inode;
185184
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
186185
set_bit(type, &entry->flags);
@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
195194
&entry->rb_node);
196195
BUG_ON(node);
197196

198-
set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
199-
entry_end(entry) - 1, GFP_NOFS);
200-
201197
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
202198
list_add_tail(&entry->root_extent_list,
203199
&BTRFS_I(inode)->root->fs_info->ordered_extents);
@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
241237
struct btrfs_ordered_inode_tree *tree;
242238
struct rb_node *node;
243239
struct btrfs_ordered_extent *entry;
244-
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
245240
int ret;
246241

247242
tree = &BTRFS_I(inode)->ordered_tree;
248243
mutex_lock(&tree->mutex);
249-
clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
250-
GFP_NOFS);
251244
node = tree_search(tree, file_offset);
252245
if (!node) {
253246
ret = 1;
@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
260253
goto out;
261254
}
262255

263-
ret = test_range_bit(io_tree, entry->file_offset,
264-
entry->file_offset + entry->len - 1,
265-
EXTENT_ORDERED, 0, NULL);
266-
if (ret == 0)
256+
if (io_size > entry->bytes_left) {
257+
printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
258+
(unsigned long long)entry->bytes_left,
259+
(unsigned long long)io_size);
260+
}
261+
entry->bytes_left -= io_size;
262+
if (entry->bytes_left == 0)
267263
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
264+
else
265+
ret = 1;
268266
out:
269267
mutex_unlock(&tree->mutex);
270268
return ret == 0;
@@ -476,6 +474,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
476474
u64 orig_end;
477475
u64 wait_end;
478476
struct btrfs_ordered_extent *ordered;
477+
int found;
479478

480479
if (start + len < start) {
481480
orig_end = INT_LIMIT(loff_t);
@@ -502,6 +501,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
502501
orig_end >> PAGE_CACHE_SHIFT);
503502

504503
end = orig_end;
504+
found = 0;
505505
while (1) {
506506
ordered = btrfs_lookup_first_ordered_extent(inode, end);
507507
if (!ordered)
@@ -514,15 +514,16 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
514514
btrfs_put_ordered_extent(ordered);
515515
break;
516516
}
517+
found++;
517518
btrfs_start_ordered_extent(inode, ordered, 1);
518519
end = ordered->file_offset;
519520
btrfs_put_ordered_extent(ordered);
520521
if (end == 0 || end == start)
521522
break;
522523
end--;
523524
}
524-
if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
525-
EXTENT_ORDERED | EXTENT_DELALLOC, 0, NULL)) {
525+
if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
526+
EXTENT_DELALLOC, 0, NULL)) {
526527
schedule_timeout(1);
527528
goto again;
528529
}

fs/btrfs/ordered-data.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
8585
/* extent length on disk */
8686
u64 disk_len;
8787

88+
/* number of bytes that still need writing */
89+
u64 bytes_left;
90+
8891
/* flags (described above) */
8992
unsigned long flags;
9093

0 commit comments

Comments
 (0)