Skip to content
This repository was archived by the owner on Nov 8, 2023. It is now read-only.

Commit 786bb02

Browse files
Panky-code authored and axboe committed
brd: use XArray instead of radix-tree to index backing pages
XArray was introduced to hold large arrays of pointers with a simple API. The XArray API also provides array semantics, which simplifies the way we store and access the backing pages, and the code becomes significantly easier to understand.

No performance difference was noticed between the two implementations using fio with direct=1 [1].

[1] Performance in KIOPS:

|           | radix-tree | XArray | Diff  |
|-----------|------------|--------|-------|
| write     | 315        | 313    | -0.6% |
| randwrite | 286        | 290    | +1.3% |
| read      | 330        | 335    | +1.5% |
| randread  | 309        | 312    | +0.9% |

Signed-off-by: Pankaj Raghav <[email protected]>
Reviewed-by: Hannes Reinecke <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jens Axboe <[email protected]>
1 parent f1fcbaa commit 786bb02

File tree

1 file changed

+24
-69
lines changed

1 file changed

+24
-69
lines changed

drivers/block/brd.c

Lines changed: 24 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
#include <linux/highmem.h>
2020
#include <linux/mutex.h>
2121
#include <linux/pagemap.h>
22-
#include <linux/radix-tree.h>
22+
#include <linux/xarray.h>
2323
#include <linux/fs.h>
2424
#include <linux/slab.h>
2525
#include <linux/backing-dev.h>
@@ -28,7 +28,7 @@
2828
#include <linux/uaccess.h>
2929

3030
/*
31-
* Each block ramdisk device has a radix_tree brd_pages of pages that stores
31+
* Each block ramdisk device has a xarray brd_pages of pages that stores
3232
* the pages containing the block device's contents. A brd page's ->index is
3333
* its offset in PAGE_SIZE units. This is similar to, but in no way connected
3434
* with, the kernel's pagecache or buffer cache (which sit above our block
@@ -40,11 +40,9 @@ struct brd_device {
4040
struct list_head brd_list;
4141

4242
/*
43-
* Backing store of pages and lock to protect it. This is the contents
44-
* of the block device.
43+
* Backing store of pages. This is the contents of the block device.
4544
*/
46-
spinlock_t brd_lock;
47-
struct radix_tree_root brd_pages;
45+
struct xarray brd_pages;
4846
u64 brd_nr_pages;
4947
};
5048

@@ -56,21 +54,8 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
5654
pgoff_t idx;
5755
struct page *page;
5856

59-
/*
60-
* The page lifetime is protected by the fact that we have opened the
61-
* device node -- brd pages will never be deleted under us, so we
62-
* don't need any further locking or refcounting.
63-
*
64-
* This is strictly true for the radix-tree nodes as well (ie. we
65-
* don't actually need the rcu_read_lock()), however that is not a
66-
* documented feature of the radix-tree API so it is better to be
67-
* safe here (we don't have total exclusion from radix tree updates
68-
* here, only deletes).
69-
*/
70-
rcu_read_lock();
7157
idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
72-
page = radix_tree_lookup(&brd->brd_pages, idx);
73-
rcu_read_unlock();
58+
page = xa_load(&brd->brd_pages, idx);
7459

7560
BUG_ON(page && page->index != idx);
7661

@@ -83,7 +68,7 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
8368
static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
8469
{
8570
pgoff_t idx;
86-
struct page *page;
71+
struct page *page, *cur;
8772
int ret = 0;
8873

8974
page = brd_lookup_page(brd, sector);
@@ -94,71 +79,42 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
9479
if (!page)
9580
return -ENOMEM;
9681

97-
if (radix_tree_maybe_preload(gfp)) {
98-
__free_page(page);
99-
return -ENOMEM;
100-
}
82+
xa_lock(&brd->brd_pages);
10183

102-
spin_lock(&brd->brd_lock);
10384
idx = sector >> PAGE_SECTORS_SHIFT;
10485
page->index = idx;
105-
if (radix_tree_insert(&brd->brd_pages, idx, page)) {
86+
87+
cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp);
88+
89+
if (unlikely(cur)) {
10690
__free_page(page);
107-
page = radix_tree_lookup(&brd->brd_pages, idx);
108-
if (!page)
109-
ret = -ENOMEM;
110-
else if (page->index != idx)
91+
ret = xa_err(cur);
92+
if (!ret && (cur->index != idx))
11193
ret = -EIO;
11294
} else {
11395
brd->brd_nr_pages++;
11496
}
115-
spin_unlock(&brd->brd_lock);
11697

117-
radix_tree_preload_end();
98+
xa_unlock(&brd->brd_pages);
99+
118100
return ret;
119101
}
120102

121103
/*
122-
* Free all backing store pages and radix tree. This must only be called when
104+
* Free all backing store pages and xarray. This must only be called when
123105
* there are no other users of the device.
124106
*/
125-
#define FREE_BATCH 16
126107
static void brd_free_pages(struct brd_device *brd)
127108
{
128-
unsigned long pos = 0;
129-
struct page *pages[FREE_BATCH];
130-
int nr_pages;
131-
132-
do {
133-
int i;
134-
135-
nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
136-
(void **)pages, pos, FREE_BATCH);
137-
138-
for (i = 0; i < nr_pages; i++) {
139-
void *ret;
140-
141-
BUG_ON(pages[i]->index < pos);
142-
pos = pages[i]->index;
143-
ret = radix_tree_delete(&brd->brd_pages, pos);
144-
BUG_ON(!ret || ret != pages[i]);
145-
__free_page(pages[i]);
146-
}
147-
148-
pos++;
109+
struct page *page;
110+
pgoff_t idx;
149111

150-
/*
151-
* It takes 3.4 seconds to remove 80GiB ramdisk.
152-
* So, we need cond_resched to avoid stalling the CPU.
153-
*/
154-
cond_resched();
112+
xa_for_each(&brd->brd_pages, idx, page) {
113+
__free_page(page);
114+
cond_resched_rcu();
115+
}
155116

156-
/*
157-
* This assumes radix_tree_gang_lookup always returns as
158-
* many pages as possible. If the radix-tree code changes,
159-
* so will this have to.
160-
*/
161-
} while (nr_pages == FREE_BATCH);
117+
xa_destroy(&brd->brd_pages);
162118
}
163119

164120
/*
@@ -372,8 +328,7 @@ static int brd_alloc(int i)
372328
brd->brd_number = i;
373329
list_add_tail(&brd->brd_list, &brd_devices);
374330

375-
spin_lock_init(&brd->brd_lock);
376-
INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
331+
xa_init(&brd->brd_pages);
377332

378333
snprintf(buf, DISK_NAME_LEN, "ram%d", i);
379334
if (!IS_ERR_OR_NULL(brd_debugfs_dir))

0 commit comments

Comments
 (0)