Skip to content

Commit d475c63

Browse files
Matthew Wilcox authored and torvalds committed
dax,ext2: replace XIP read and write with DAX I/O
Use the generic AIO infrastructure instead of custom read and write methods. In addition to giving us support for AIO, this adds the missing locking between read() and truncate(). Signed-off-by: Matthew Wilcox <[email protected]> Reviewed-by: Ross Zwisler <[email protected]> Reviewed-by: Jan Kara <[email protected]> Cc: Andreas Dilger <[email protected]> Cc: Boaz Harrosh <[email protected]> Cc: Christoph Hellwig <[email protected]> Cc: Dave Chinner <[email protected]> Cc: Jens Axboe <[email protected]> Cc: Kirill A. Shutemov <[email protected]> Cc: Mathieu Desnoyers <[email protected]> Cc: Randy Dunlap <[email protected]> Cc: Theodore Ts'o <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent fbbbad4 commit d475c63

File tree

8 files changed

+214
-245
lines changed

8 files changed

+214
-245
lines changed

MAINTAINERS

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3151,6 +3151,12 @@ L: [email protected]
31513151
S: Maintained
31523152
F: drivers/i2c/busses/i2c-diolan-u2c.c
31533153

3154+
DIRECT ACCESS (DAX)
3155+
M: Matthew Wilcox <[email protected]>
3156+
3157+
S: Supported
3158+
F: fs/dax.c
3159+
31543160
DIRECTORY NOTIFICATION (DNOTIFY)
31553161
M: Eric Paris <[email protected]>
31563162
S: Maintained

fs/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
2828
obj-$(CONFIG_TIMERFD) += timerfd.o
2929
obj-$(CONFIG_EVENTFD) += eventfd.o
3030
obj-$(CONFIG_AIO) += aio.o
31+
obj-$(CONFIG_FS_XIP) += dax.o
3132
obj-$(CONFIG_FILE_LOCKING) += locks.o
3233
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
3334
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o

fs/dax.c

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
/*
2+
* fs/dax.c - Direct Access filesystem code
3+
* Copyright (c) 2013-2014 Intel Corporation
4+
* Author: Matthew Wilcox <[email protected]>
5+
* Author: Ross Zwisler <[email protected]>
6+
*
7+
* This program is free software; you can redistribute it and/or modify it
8+
* under the terms and conditions of the GNU General Public License,
9+
* version 2, as published by the Free Software Foundation.
10+
*
11+
* This program is distributed in the hope it will be useful, but WITHOUT
12+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14+
* more details.
15+
*/
16+
17+
#include <linux/atomic.h>
18+
#include <linux/blkdev.h>
19+
#include <linux/buffer_head.h>
20+
#include <linux/fs.h>
21+
#include <linux/genhd.h>
22+
#include <linux/mutex.h>
23+
#include <linux/uio.h>
24+
25+
static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
26+
{
27+
unsigned long pfn;
28+
sector_t sector = bh->b_blocknr << (blkbits - 9);
29+
return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
30+
}
31+
32+
/*
 * Zero the parts of a buffer that the pending I/O will not cover.
 *
 * @addr:  start of the buffer
 * @size:  length of the buffer in bytes
 * @first: offset within the buffer at which the I/O begins
 * @pos:   file position at which the I/O begins
 * @end:   file position at which the I/O ends
 *
 * Bytes [0, @first) ahead of the I/O and bytes [@first + (@end - @pos), @size)
 * behind it are cleared; the bytes in between are left untouched.
 */
static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
			loff_t end)
{
	char *buf = addr;
	/* Offset within the buffer of the first byte past the I/O */
	loff_t tail = end - pos + first;

	if (first != 0)
		memset(buf, 0, first);

	if (tail < size)
		memset(buf + tail, 0, size - tail);
}
42+
43+
static bool buffer_written(struct buffer_head *bh)
44+
{
45+
return buffer_mapped(bh) && !buffer_unwritten(bh);
46+
}
47+
48+
/*
49+
* When ext4 encounters a hole, it returns without modifying the buffer_head
50+
* which means that we can't trust b_size. To cope with this, we set b_state
51+
* to 0 before calling get_block and, if any bit is set, we know we can trust
52+
* b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
53+
* and would save us time calling get_block repeatedly.
54+
*/
55+
static bool buffer_size_valid(struct buffer_head *bh)
56+
{
57+
return bh->b_state != 0;
58+
}
59+
60+
/*
 * dax_io - copy data between @iter and the storage backing [@start, @end)
 * @rw: WRITE to copy from @iter to storage; any other value reads
 * @inode: the file being accessed
 * @iter: source (write) or destination (read) of the data
 * @start: file offset of the first byte
 * @end: file offset one past the last byte; for reads it is clamped to i_size
 * @get_block: filesystem callback mapping a file block to a buffer_head
 * @bh: caller-provided scratch buffer_head, reused for every get_block call
 *
 * Returns the number of bytes transferred if any progress was made.
 * Otherwise returns 0 when there was nothing to do, or a negative errno:
 * whatever get_block() or dax_get_addr() reported, or -EFAULT when the
 * first copy to/from @iter moved no bytes.
 */
static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	ssize_t retval = 0;
	loff_t pos = start;
	loff_t max = start;	/* end of the currently addressable range */
	loff_t bh_max = start;	/* end of the range described by @bh */
	void *addr;
	bool hole = false;

	if (rw != WRITE)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		/* size_t matches what the iov_iter copy helpers return */
		size_t len;

		if (pos == max) {
			/* The addressable range is used up; set up the next. */
			unsigned blkbits = inode->i_blkbits;
			sector_t block = pos >> blkbits;
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				/* @bh is exhausted too: ask for a new mapping */
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				retval = get_block(inode, block, bh,
						   rw == WRITE);
				if (retval)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
			} else {
				/*
				 * @bh still covers pos; skip the part of it
				 * that has already been consumed.
				 */
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			/* Reads of unmapped/unwritten blocks yield zeroes */
			hole = (rw != WRITE) && !buffer_written(bh);
			if (hole) {
				addr = NULL;
				size = bh->b_size - first;
			} else {
				retval = dax_get_addr(bh, &addr, blkbits);
				if (retval < 0)
					break;
				if (buffer_unwritten(bh) || buffer_new(bh))
					dax_new_buf(addr, retval, first, pos,
						    end);
				addr += first;
				size = retval - first;
			}
			max = min(pos + size, end);
		}

		if (rw == WRITE)
			len = copy_from_iter(addr, max - pos, iter);
		else if (!hole)
			len = copy_to_iter(addr, max - pos, iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			/*
			 * retval may still hold the positive byte count from
			 * dax_get_addr(); without this, a copy that fails on
			 * the first pass would return that count as if it
			 * were bytes transferred.
			 */
			retval = -EFAULT;
			break;
		}

		pos += len;
		addr += len;
	}

	return (pos == start) ? retval : pos - start;
}
132+
133+
/**
134+
* dax_do_io - Perform I/O to a DAX file
135+
* @rw: READ to read or WRITE to write
136+
* @iocb: The control block for this I/O
137+
* @inode: The file which the I/O is directed at
138+
* @iter: The addresses to do I/O from or to
139+
* @pos: The file offset where the I/O starts
140+
* @get_block: The filesystem method used to translate file offsets to blocks
141+
* @end_io: A filesystem callback for I/O completion
142+
* @flags: See below
143+
*
144+
* This function uses the same locking scheme as do_blockdev_direct_IO:
145+
* If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
146+
* caller for writes. For reads, we take and release the i_mutex ourselves.
147+
* If DIO_LOCKING is not set, the filesystem takes care of its own locking.
148+
* As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
149+
* is in progress.
150+
*/
151+
ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
152+
struct iov_iter *iter, loff_t pos,
153+
get_block_t get_block, dio_iodone_t end_io, int flags)
154+
{
155+
struct buffer_head bh;
156+
ssize_t retval = -EINVAL;
157+
loff_t end = pos + iov_iter_count(iter);
158+
159+
memset(&bh, 0, sizeof(bh));
160+
161+
if ((flags & DIO_LOCKING) && (rw == READ)) {
162+
struct address_space *mapping = inode->i_mapping;
163+
mutex_lock(&inode->i_mutex);
164+
retval = filemap_write_and_wait_range(mapping, pos, end - 1);
165+
if (retval) {
166+
mutex_unlock(&inode->i_mutex);
167+
goto out;
168+
}
169+
}
170+
171+
/* Protects against truncate */
172+
atomic_inc(&inode->i_dio_count);
173+
174+
retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
175+
176+
if ((flags & DIO_LOCKING) && (rw == READ))
177+
mutex_unlock(&inode->i_mutex);
178+
179+
if ((retval > 0) && end_io)
180+
end_io(iocb, pos, retval, bh.b_private);
181+
182+
inode_dio_done(inode);
183+
out:
184+
return retval;
185+
}
186+
EXPORT_SYMBOL_GPL(dax_do_io);

fs/ext2/file.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,10 @@ const struct file_operations ext2_file_operations = {
8181
#ifdef CONFIG_EXT2_FS_XIP
8282
const struct file_operations ext2_xip_file_operations = {
8383
.llseek = generic_file_llseek,
84-
.read = xip_file_read,
85-
.write = xip_file_write,
84+
.read = new_sync_read,
85+
.write = new_sync_write,
86+
.read_iter = generic_file_read_iter,
87+
.write_iter = generic_file_write_iter,
8688
.unlocked_ioctl = ext2_ioctl,
8789
#ifdef CONFIG_COMPAT
8890
.compat_ioctl = ext2_compat_ioctl,

fs/ext2/inode.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -859,7 +859,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
859859
size_t count = iov_iter_count(iter);
860860
ssize_t ret;
861861

862-
ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block);
862+
if (IS_DAX(inode))
863+
ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
864+
NULL, DIO_LOCKING);
865+
else
866+
ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
867+
ext2_get_block);
863868
if (ret < 0 && (rw & WRITE))
864869
ext2_write_failed(mapping, offset + count);
865870
return ret;
@@ -888,6 +893,7 @@ const struct address_space_operations ext2_aops = {
888893
const struct address_space_operations ext2_aops_xip = {
889894
.bmap = ext2_bmap,
890895
.get_xip_mem = ext2_get_xip_mem,
896+
.direct_IO = ext2_direct_IO,
891897
};
892898

893899
const struct address_space_operations ext2_nobh_aops = {

include/linux/fs.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2587,12 +2587,11 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
25872587
extern int generic_file_open(struct inode * inode, struct file * filp);
25882588
extern int nonseekable_open(struct inode * inode, struct file * filp);
25892589

2590+
ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
2591+
loff_t, get_block_t, dio_iodone_t, int flags);
2592+
25902593
#ifdef CONFIG_FS_XIP
2591-
extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
2592-
loff_t *ppos);
25932594
extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
2594-
extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
2595-
size_t len, loff_t *ppos);
25962595
extern int xip_truncate_page(struct address_space *mapping, loff_t from);
25972596
#else
25982597
static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
@@ -2756,6 +2755,11 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root);
27562755
extern void save_mount_options(struct super_block *sb, char *options);
27572756
extern void replace_mount_options(struct super_block *sb, char *options);
27582757

2758+
static inline bool io_is_direct(struct file *filp)
2759+
{
2760+
return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp));
2761+
}
2762+
27592763
static inline ino_t parent_ino(struct dentry *dentry)
27602764
{
27612765
ino_t res;

mm/filemap.c

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
16951695
loff_t *ppos = &iocb->ki_pos;
16961696
loff_t pos = *ppos;
16971697

1698-
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1699-
if (file->f_flags & O_DIRECT) {
1698+
if (io_is_direct(file)) {
17001699
struct address_space *mapping = file->f_mapping;
17011700
struct inode *inode = mapping->host;
17021701
size_t count = iov_iter_count(iter);
@@ -2584,8 +2583,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
25842583
if (err)
25852584
goto out;
25862585

2587-
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2588-
if (unlikely(file->f_flags & O_DIRECT)) {
2586+
if (io_is_direct(file)) {
25892587
loff_t endbyte;
25902588

25912589
written = generic_file_direct_write(iocb, from, pos);

0 commit comments

Comments
 (0)